From b0aa9812e6b25f86127d552aa0bcd283d4f02f01 Mon Sep 17 00:00:00 2001 From: romnnn Date: Thu, 16 Nov 2023 14:57:51 +0100 Subject: [PATCH] scale up benchmarks --- Cargo.lock | 351 +- WIP.md | 16 +- gpucachesim/benchmarks.py | 10 +- gpucachesim/stats/__init__.py | 1784 ++- .../nvprof.simple_matrixmul.cycles.pdf | Bin 18678 -> 18114 bytes .../nvprof.simple_matrixmul.exec_time_sec.pdf | Bin 18700 -> 17935 bytes .../nvprof.simple_matrixmul.input_id.pdf | Bin 18224 -> 17863 bytes .../nvprof.simple_matrixmul.instructions.pdf | Bin 17515 -> 17041 bytes .../nvprof.simple_matrixmul.l1_accesses.pdf | Bin 17968 -> 17527 bytes ...of.simple_matrixmul.l1_global_hit_rate.pdf | Bin 18234 -> 17757 bytes .../nvprof.simple_matrixmul.l1_hit_rate.pdf | Bin 19210 -> 18702 bytes ...rof.simple_matrixmul.l1_local_hit_rate.pdf | Bin 18340 -> 17577 bytes .../nvprof.simple_matrixmul.l2_accesses.pdf | Bin 18193 -> 17626 bytes ...prof.simple_matrixmul.l2_read_hit_rate.pdf | Bin 18623 -> 18164 bytes .../nvprof.simple_matrixmul.l2_reads.pdf | Bin 18456 -> 17906 bytes ...rof.simple_matrixmul.l2_write_hit_rate.pdf | Bin 18174 -> 17917 bytes .../nvprof.simple_matrixmul.l2_writes.pdf | Bin 17856 -> 17565 bytes ...of.simple_matrixmul.mean_blocks_per_sm.pdf | Bin 0 -> 16804 bytes .../nvprof.simple_matrixmul.num_blocks.pdf | Bin 0 -> 17060 bytes test-apps/test-apps-materialized.yml | 11520 ++++++++++++---- test-apps/test-apps.yml | 9 +- 21 files changed, 10181 insertions(+), 3509 deletions(-) create mode 100644 plot/validation/nvprof.simple_matrixmul.mean_blocks_per_sm.pdf create mode 100644 plot/validation/nvprof.simple_matrixmul.num_blocks.pdf diff --git a/Cargo.lock b/Cargo.lock index 880ba684..8d30f054 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -198,35 +198,6 @@ dependencies = [ "futures-core", ] -[[package]] -name = "async-executor" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1da3ae8dabd9c00f453a329dfe1fb28da3c0a72e2478cdcd93171740c20499" -dependencies = [ - "async-lock", - "async-task", - "concurrent-queue", - "fastrand 2.0.1", - "futures-lite", - "slab", -] - -[[package]] -name = "async-global-executor" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" -dependencies = [ - "async-channel", - "async-executor", - "async-io", - "async-lock", - "blocking", - "futures-lite", - "once_cell", -] - [[package]] name = "async-io" version = "1.13.0" @@ -293,53 +264,15 @@ dependencies = [ [[package]] name = "async-ssh2-lite" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89f6414df0399afb863951edded534013e4c74cc4d7ab3b4a92b608986027ec7" -dependencies = [ - "async-io", - "futures-util", - "ssh2", -] - -[[package]] -name = "async-std" -version = "1.12.0" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +checksum = "6cb43eaa75050ebe27dfd16e6de7078d9796a251f03c77d7a24c05aa9037c29b" dependencies = [ - "async-channel", - "async-global-executor", - "async-io", - "async-lock", - "crossbeam-utils", - "futures-channel", - "futures-core", - "futures-io", - "futures-lite", - "gloo-timers", - "kv-log-macro", - "log", - "memchr", - "once_cell", - "pin-project-lite", - "pin-utils", - "slab", - "wasm-bindgen-futures", -] - -[[package]] -name = "async-std-resolver" -version = "0.20.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf3e776afdf3a2477ef4854b85ba0dff3bd85792f685fb3c68948b4d304e4f0" -dependencies = [ - "async-std", "async-trait", - "futures-io", "futures-util", - "pin-utils", - "trust-dns-resolver", + "libssh2-sys", + "ssh2", + "tokio", ] [[package]] @@ -980,12 +913,6 @@ dependencies = [ "ordered-float", ] -[[package]] -name = "data-encoding" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308" - [[package]] name = "dialoguer" version = "0.11.0" @@ -1021,26 +948,6 @@ dependencies = [ "crypto-common", ] -[[package]] -name = "dirs" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30baa043103c9d0c2a57cf537cc2f35623889dc0d405e6c3cccfadbc81c71309" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "dotenv" version = "0.15.0" @@ -1089,18 +996,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "enum-as-inner" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "570d109b813e904becc80d8d5da38376818a143348413f7149f1340fe04754d4" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "env_logger" version = "0.10.0" @@ -1421,18 +1316,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" -[[package]] -name = "gloo-timers" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" -dependencies = [ - "futures-channel", - "futures-core", - "js-sys", - "wasm-bindgen", -] - [[package]] name = "gpucachesim" version = "0.1.0" @@ -1587,17 +1470,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "hostname" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" -dependencies = [ - "libc", - "match_cfg", - "winapi", -] - [[package]] name = "html-escape" version = "0.2.13" @@ -1727,17 +1599,6 @@ dependencies = [ "cc", ] -[[package]] -name = "idna" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" -dependencies = [ - "matches", - "unicode-bidi", - "unicode-normalization", -] - [[package]] name = "idna" version = "0.4.0" @@ -1836,18 +1697,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "ipconfig" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7e2f18aece9709094573a9f24f483c4f65caa4298e2f7ae1b71cc65d853fad7" -dependencies = [ - "socket2 0.3.19", - "widestring", - "winapi", - "winreg 0.6.2", -] - [[package]] name = "ipnet" version = "2.8.0" @@ -1907,15 +1756,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "kv-log-macro" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" -dependencies = [ - "log", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -1979,12 +1819,6 @@ dependencies = [ "cc", ] -[[package]] -name = "linked-hash-map" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" - [[package]] name = "linux-raw-sys" version = "0.3.8" @@ -2012,30 +1846,6 @@ name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -dependencies = [ - "value-bag", -] - -[[package]] -name = "lru-cache" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e24f1ad8321ca0e8a1e0ac13f23cb668e6f5466c2c57319f6a5cf1cc8e3b1c" -dependencies = [ - "linked-hash-map", -] - -[[package]] -name = "match_cfg" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" - -[[package]] -name = "matches" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" [[package]] name = "matrixmultiply" @@ -2589,15 +2399,6 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" -[[package]] -name = "plain_path" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f1b940aa8e0562ece01eb12a9731bb8f6f0325c2c97c8629f852504f01d4537" -dependencies = [ - "dirs", -] - [[package]] name = "playground" version = "0.1.0" @@ -2808,12 +2609,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "quick-error" -version = "1.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" - [[package]] name = "quote" version = "1.0.33" @@ -2909,17 +2704,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - [[package]] name = "regex" version = "1.10.0" @@ -2959,10 +2743,17 @@ checksum = "c3cbb081b9784b07cceb8824c8583f86db4814d172ab043f3c23f7dc600bf83d" name = "remote" version = "0.1.0" dependencies = [ + "async-ssh2-lite", + "async-trait", "clap", "color-eyre", "dotenv", - "ssh_jumper", + "env_logger", + "futures", + "itertools 0.10.5", + "log", + "ssh2", + "strum", "thiserror", "tokio", ] @@ -3007,17 +2798,7 @@ dependencies = [ "wasm-bindgen-futures", "web-sys", "webpki-roots", - "winreg 0.50.0", -] - -[[package]] -name = "resolv-conf" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52e44394d2086d010551b14b53b1f24e31647570cd1deb0379e2c21b329aba00" -dependencies = [ - "hostname", - "quick-error", + "winreg", ] [[package]] @@ -3431,17 +3212,6 @@ dependencies = [ "syn 2.0.38", ] -[[package]] -name = "socket2" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" -dependencies = [ - "cfg-if", - "libc", - "winapi", -] - [[package]] name = "socket2" version = "0.4.9" @@ -3489,33 +3259,6 @@ dependencies = [ "parking_lot 0.11.2", ] -[[package]] -name = "ssh_jumper" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fce1cdbdcecdd805b9a59de5d2889d81a091cd92b4a55978a0e9107efa4186da" -dependencies = [ - "async-io", - "async-ssh2-lite", - "async-std-resolver", - "futures", - "plain_path", - "ssh_jumper_model", - "tokio", -] - -[[package]] -name = "ssh_jumper_model" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e272e5de556b6db9291f189f06ab11d3445b0d9ef6317bcece364e02aae87bbe" -dependencies = [ - "async-ssh2-lite", - "async-std-resolver", - "plain_path", - "tokio", -] - [[package]] name = "stats" version = "0.1.0" @@ -3933,49 +3676,6 @@ dependencies = [ "tracing-log", ] -[[package]] -name = "trust-dns-proto" -version = "0.20.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca94d4e9feb6a181c690c4040d7a24ef34018d8313ac5044a61d21222ae24e31" -dependencies = [ - "async-trait", - "cfg-if", - "data-encoding", - "enum-as-inner", - "futures-channel", - "futures-io", - "futures-util", - "idna 0.2.3", - "ipnet", - "lazy_static", - "log", - "rand", - "smallvec", - "thiserror", - "tinyvec", - "url", -] - -[[package]] -name = "trust-dns-resolver" -version = "0.20.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecae383baad9995efaa34ce8e57d12c3f305e545887472a492b838f4b5cfb77a" -dependencies = [ - "cfg-if", - "futures-util", - "ipconfig", - "lazy_static", - "log", - "lru-cache", - "parking_lot 0.11.2", - "resolv-conf", - "smallvec", - "thiserror", - "trust-dns-proto", -] - [[package]] name = "try-lock" version = "0.2.4" @@ -4058,7 +3758,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "143b538f18257fac9cad154828a57c6bf5157e1aa604d4816b5995bf6de87ae5" dependencies = [ "form_urlencoded", - "idna 0.4.0", + "idna", "percent-encoding", ] @@ -4149,12 +3849,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "value-bag" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d92ccd67fb88503048c01b59152a04effd0782d035a83a6d256ce6085f08f4a3" - [[package]] name = "vcpkg" version = "0.2.15" @@ -4312,12 +4006,6 @@ dependencies = [ "rustix 0.38.18", ] -[[package]] -name = "widestring" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c168940144dd21fd8046987c16a46a33d5fc84eec29ef9dcddc2ac9e31526b7c" - [[package]] name = "winapi" version = "0.3.9" @@ -4490,15 +4178,6 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" -[[package]] -name = "winreg" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2986deb581c4fe11b621998a5e53361efe6b48a151178d0cd9eeffa4dc6acc9" -dependencies = [ - "winapi", -] - [[package]] name = "winreg" version = "0.50.0" diff --git a/WIP.md b/WIP.md index 958eee7d..12708d57 100644 --- a/WIP.md +++ b/WIP.md @@ -67,17 +67,10 @@ The Maxwell/Pascal L1 data cache had similar tag stage performance but local and - TODO: - - overall results in parallel table - - parallel remove .0 where possible - - DONE: bold font for best results in parallel table - - DONE: remove parallel execution from exec driven simulate (only baseline) - - DONE: fix parallel plot table (multiple kernel launch ids) - - - DONE: compute blocks per sm metric for parallel, maybe we just need to scale up the workloads + - validate: add remote connection to das5 and das6 - make the basic plots look good for pascal pchase and write it down - run the same for older fermi or maxwell gpu on das5 and write it down - - GIVE UP: have another go at ampere - plot: compare mem only simulation and trace reconstruction error - plot: compute overall correlations for all metrics @@ -88,6 +81,13 @@ The Maxwell/Pascal L1 data cache had similar tag stage performance but local and - check if any line ever has different hit_cluster, otherwise makes no sense - this prob wont be interesting for l1, but maybe l2? + - DONE: overall results in parallel table + - GIVE UP: have another go at ampere + - DONE: parallel remove .0 where possible + - DONE: bold font for best results in parallel table + - DONE: remove parallel execution from exec driven simulate (only baseline) + - DONE: fix parallel plot table (multiple kernel launch ids) + - DONE: compute blocks per sm metric for parallel, maybe we just need to scale up the workloads - DONE: simulator: l1 latency should only affect HITS ==> l1 return queue - GIVE UP: research: try to understand the l1 tex cache hit rate nvprof metric - DONE: connect to das6 diff --git a/gpucachesim/benchmarks.py b/gpucachesim/benchmarks.py index 6593aa91..d7c98647 100644 --- a/gpucachesim/benchmarks.py +++ b/gpucachesim/benchmarks.py @@ -303,12 +303,16 @@ def construct_playground_simulate_target_config(self, node): class Benchmarks: + path: Path config: Config - def __init__(self, path: os.PathLike) -> None: + def __init__(self, path: typing.Optional[os.PathLike]) -> None: """load the materialized benchmark config""" - - with open(path or DEFAULT_BENCH_FILE, "rb") as f: + if path is None: + self.path = DEFAULT_BENCH_FILE + else: + self.path = Path(path) + with open(self.path, "rb") as f: benchmarks = yaml.load(f, Loader=BenchmarkLoader) self.config = benchmarks["config"] diff --git a/gpucachesim/stats/__init__.py b/gpucachesim/stats/__init__.py index acbf8ba9..13ca9425 100644 --- a/gpucachesim/stats/__init__.py +++ b/gpucachesim/stats/__init__.py @@ -20,7 +20,13 @@ import gpucachesim.plot as plot import gpucachesim.utils as utils -from gpucachesim.benchmarks import SIMULATE_FUNCTIONAL_CONFIG_COLS, Target, Benchmarks, GPUConfig, REPO_ROOT_DIR +from gpucachesim.benchmarks import ( + SIMULATE_FUNCTIONAL_CONFIG_COLS, + Target, + Benchmarks, + GPUConfig, + REPO_ROOT_DIR, +) # suppress scientific notation by setting float_format @@ -42,6 +48,7 @@ def main(): # ctx.ensure_object(dict) pass + def aggregate_benchmark_results( sim_df: pd.DataFrame, bench_name: str, @@ -118,11 +125,12 @@ def aggregate_benchmark_results( (128, 128, 128), (32, 64, 128), (128, 32, 32), - (32, 1024, 32), - (32, 2048, 32), - (32, 4096, 32), + (512, 32, 512), + # (32, 1024, 32), + # (32, 2048, 32), + # (32, 4096, 32), ], - columns=["input_m", "input_n", "input_p"] + columns=["input_m", "input_n", "input_p"], ) # print(subset.index) # print(selected_df.index) @@ -158,7 +166,9 @@ def aggregate_benchmark_results( ] ) - group_cols = benchmarks.BENCH_TARGET_INDEX_COLS + ["kernel_name", "run"] + input_cols + group_cols = ( + benchmarks.BENCH_TARGET_INDEX_COLS + ["kernel_name", "run"] + input_cols + ) # print(selected_df.index) # non_numeric_cols = sorted(selected_df.select_dtypes(include=["object"]).columns.tolist()) @@ -170,13 +180,16 @@ def aggregate_benchmark_results( **benchmarks.NON_NUMERIC_COLS, } aggregations = { - col: agg for col, agg in aggregations.items() + col: agg + for col, agg in aggregations.items() if col in selected_df and col not in group_cols } pprint(aggregations) # print(sorted(selected_df.columns.tolist())) - per_kernel = selected_df.groupby(group_cols, dropna=False).agg(aggregations).reset_index() + per_kernel = ( + selected_df.groupby(group_cols, dropna=False).agg(aggregations).reset_index() + ) # print(sorted(per_kernel.columns.tolist())) # selected_df.groupby(group_cols, dropna=False)[STAT_COLS].mean().reset_index() @@ -191,7 +204,8 @@ def aggregate_benchmark_results( **benchmarks.NON_NUMERIC_COLS, } aggregations = { - col: agg for col, agg in aggregations.items() + col: agg + for col, agg in aggregations.items() if col in per_kernel and not col in group_cols } pprint(aggregations) @@ -208,35 +222,647 @@ def aggregate_benchmark_results( # stat_cols = set(averaged.columns) - set(["benchmark"]) - set(input_cols) per_target_pivoted = per_target.pivot( - index=["benchmark"] + input_cols, columns="target", # values=STAT_COLS + index=["benchmark"] + input_cols, + columns="target", # values=STAT_COLS ) # per_target = averaged.set_index(["target", "benchmark"] + input_cols) return per_kernel, per_target_pivoted - def different_cols(df): return [col for col in df.columns if len(df[col].value_counts()) > 1] +class ParallelTableRow(typing.NamedTuple): + metric: str + threads: int + serial_value: typing.Optional[typing.Tuple[float, typing.Union[float, int, str]]] + det_value: typing.Optional[typing.Tuple[float, typing.Union[float, int, str]]] + nondet_values: typing.Sequence[typing.Tuple[float, typing.Union[float, int, str]]] + + def values(self): + values = [] + if self.serial_value is not None: + values.append(self.serial_value[0]) + if self.det_value is not None: + values.append(self.det_value[0]) + values += [v[0] for v in self.nondet_values] + return values + + +def build_parallel_table_rows( + df: pd.DataFrame, + num_benchmarks, + all_benchmarks, + thousands_round_to=1, + variable_precision=True, +) -> typing.Sequence[ParallelTableRow]: + interleave_n = list(itertools.product([False, True], [5, 10])) + table_rows: typing.Sequence[ParallelTableRow] = [] + + for threads in [4, 8]: + threads_mask = df["input_threads_parallel"] == threads + det_mask = df["input_mode_parallel"] == "deterministic" + nondet_no_interleave_mask = df["input_mode_parallel"] == "nondeterministic" + nondet_interleave_mask = ( + df["input_mode_parallel"] == "nondeterministic_interleave" + ) + # print([m.sum() for m in [ + # mask, threads_mask, det_mask, nondet_no_interleave_mask, nondet_interleave_mask + # ]]) + + det = df[threads_mask & det_mask] + # print( + # det[ + # bench_input_cols + # + [ + # "input_threads_parallel", + # "exec_time_sec_parallel", + # "input_id_parallel", + # "input_id_serial", + # # "dram_reads_serial", + # # "dram_reads_parallel", + # # "dram_reads_rel_err", + # "dram_writes_serial", + # "dram_writes_parallel", + # "dram_writes_rel_err", + # ] + different_cols(det) + # ] + # ) + print("===") + nondet_no_interleave = df[threads_mask & nondet_no_interleave_mask] + nondet_interleave = df[threads_mask & nondet_interleave_mask] + + assert len(det) == num_benchmarks + assert len(nondet_no_interleave) == 2 * num_benchmarks + assert len(nondet_interleave) == 2 * num_benchmarks + # assert ( + # len( + # df[[ + # "exec_time_sec_serial", + # "cycles_serial", + # "input_id_serial", + # ]].drop_duplicates() + # ) + # == 1 + # ) + + # exec time (speedup) + serial_exec_time = df.loc[threads_mask, "exec_time_sec_serial"].values[0] + det_exec_time = det["exec_time_sec_parallel"].values[0] + det_speedup = det["exec_time_sec_speedup"].values[0] + nondet_values = [] + for interleave, n in interleave_n: + nondet = nondet_interleave if interleave else nondet_no_interleave + nondet = nondet[nondet["input_run_ahead_parallel"] == n] + nondet_exec_time = nondet["exec_time_sec_parallel"].values[0] + nondet_speedup = nondet["exec_time_sec_speedup"].values[0] + if all_benchmarks: + nondet_values.append( + ( + nondet_speedup, + "${}x$".format( + plot.round_to_precision( + nondet_speedup, + round_to=1, + variable_precision=variable_precision, + ) + ), + ) + ) + + else: + nondet_values.append( + ( + nondet_exec_time, + "${:>3.1f}s~({}x)$".format( + nondet_exec_time, + plot.round_to_precision( + nondet_speedup, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + + serial_value = ( + None + if all_benchmarks + else (serial_exec_time, "${:>3.1f}s$".format(serial_exec_time)) + ) + if all_benchmarks: + det_value = ( + det_speedup, + "${}x$".format( + plot.round_to_precision( + det_speedup, round_to=1, variable_precision=variable_precision + ) + ), + ) + else: + det_value = ( + det_exec_time, + "${:>3.1f}s~({}x)$".format( + det_exec_time, + plot.round_to_precision( + det_speedup, round_to=1, variable_precision=variable_precision + ), + ), + ) + table_rows.append( + ParallelTableRow( + metric=r"exec\\time", + threads=threads, + serial_value=serial_value, + det_value=det_value, + nondet_values=nondet_values, + ) + ) + + # cycles (rel err) + serial_cycles = int(df.loc[threads_mask, "cycles_serial"].values[0]) + det_cycles = int(det["cycles_parallel"].values[0]) + det_rel_err = det["cycles_rel_err"].values[0] + nondet_values = [] + for interleave, n in interleave_n: + nondet = nondet_interleave if interleave else nondet_no_interleave + nondet = nondet[nondet["input_run_ahead_parallel"] == n] + + nondet_cycles = int(nondet["cycles_parallel"].values[0]) + nondet_rel_err = nondet["cycles_rel_err"].values[0] + if all_benchmarks: + nondet_values.append( + ( + nondet_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ) + ), + ) + ) + else: + nondet_values.append( + ( + nondet_cycles, + "${} ({}\\%)$".format( + plot.human_format_thousands( + nondet_cycles, + round_to=thousands_round_to, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + + serial_value = ( + None + if all_benchmarks + else ( + serial_cycles, + "${}$".format( + plot.human_format_thousands( + serial_cycles, + round_to=thousands_round_to, + variable_precision=variable_precision, + ) + ), + ) + ) + if all_benchmarks: + det_value = ( + 100.0 * det_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ) + ), + ) + else: + det_value = ( + det_cycles, + "${} ({}\\%)$".format( + plot.human_format_thousands( + det_cycles, + round_to=thousands_round_to, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + table_rows.append( + ParallelTableRow( + metric="cycles", + threads=threads, + serial_value=serial_value, + det_value=det_value, + nondet_values=nondet_values, + ) + ) + + # l1 data hit rate (rel err) + serial_l1_hit_rate = df.loc[threads_mask, "l1_hit_rate_serial"].values[0] + det_l1_hit_rate = det["l1_hit_rate_parallel"].values[0] + det_rel_err = det["l1_hit_rate_rel_err"].values[0] + nondet_values = [] + for interleave, n in interleave_n: + nondet = nondet_interleave if interleave else nondet_no_interleave + nondet = nondet[nondet["input_run_ahead_parallel"] == n] + + nondet_l1_hit_rate = nondet["l1_hit_rate_parallel"].values[0] + nondet_rel_err = nondet["l1_hit_rate_rel_err"].values[0] + if all_benchmarks: + nondet_values.append( + ( + 100.0 * nondet_rel_err, + "{}\\%$".format( + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + else: + nondet_values.append( + ( + 100.0 * nondet_l1_hit_rate, + "${}\\%~({}\\%)$".format( + plot.round_to_precision( + 100.0 * nondet_l1_hit_rate, + round_to=1, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + + serial_value = ( + None + if all_benchmarks + else ( + 100.0 * serial_l1_hit_rate, + "${:>2.1f}\\%$".format(100.0 * serial_l1_hit_rate), + ) + ) + if all_benchmarks: + det_value = ( + 100.0 * det_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + else: + det_value = ( + 100.0 * det_l1_hit_rate, + "${}\\%~({}\\%)$".format( + plot.round_to_precision( + 100.0 * det_l1_hit_rate, + round_to=1, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + + table_rows.append( + ParallelTableRow( + metric=r"L1D\\hit rate", + threads=threads, + serial_value=serial_value, + det_value=det_value, + nondet_values=nondet_values, + ) + ) + + # l2 data hit rate (rel err) + serial_l2_hit_rate = df.loc[threads_mask, "l2_hit_rate_serial"].values[0] + det_l2_hit_rate = det["l2_hit_rate_parallel"].values[0] + det_rel_err = det["l2_hit_rate_rel_err"].values[0] + nondet_values = [] + for interleave, n in interleave_n: + nondet = nondet_interleave if interleave else nondet_no_interleave + nondet = nondet[nondet["input_run_ahead_parallel"] == n] + + nondet_l2_hit_rate = nondet["l2_hit_rate_parallel"].values[0] + nondet_rel_err = nondet["l2_hit_rate_rel_err"].values[0] + if all_benchmarks: + nondet_values.append( + ( + 100.0 * nondet_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + else: + nondet_values.append( + ( + 100.0 * nondet_l2_hit_rate, + "${}\\%~({}\\%)$".format( + plot.round_to_precision( + 100.0 * nondet_l2_hit_rate, + round_to=1, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + + serial_value = ( + None + if all_benchmarks + else ( + 100.0 * serial_l2_hit_rate, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * serial_l2_hit_rate, + round_to=1, + variable_precision=variable_precision, + ) + ), + ) + ) + if all_benchmarks: + det_value = ( + 100.0 * det_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + else: + det_value = ( + 100.0 * det_l2_hit_rate, + "${}\\%~({}\\%)$".format( + plot.round_to_precision( + 100.0 * det_l2_hit_rate, + round_to=1, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + table_rows.append( + ParallelTableRow( + metric=r"L2D\\hit rate", + threads=threads, + serial_value=serial_value, + det_value=det_value, + nondet_values=nondet_values, + ) + ) + + # dram reads (rel err) + serial_dram_reads = int(df.loc[threads_mask, "dram_reads_serial"].values[0]) + det_dram_reads = int(det["dram_reads_parallel"].values[0]) + det_rel_err = det["dram_reads_rel_err"].values[0] + nondet_values = [] + for interleave, n in interleave_n: + nondet = nondet_interleave if interleave else nondet_no_interleave + nondet = nondet[nondet["input_run_ahead_parallel"] == n] + + nondet_dram_reads = int(nondet["dram_reads_parallel"].values[0]) + nondet_rel_err = nondet["dram_reads_rel_err"].values[0] + if all_benchmarks: + nondet_values.append( + ( + nondet_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + else: + nondet_values.append( + ( + nondet_dram_reads, + "${} ({}\\%)$".format( + plot.human_format_thousands( + nondet_dram_reads, + round_to=thousands_round_to, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + + serial_value = ( + None + if all_benchmarks + else ( + serial_dram_reads, + "${}$".format( + plot.human_format_thousands( + serial_dram_reads, + round_to=thousands_round_to, + variable_precision=variable_precision, + ) + ), + ) + ) + if all_benchmarks: + det_value = ( + 100.0 * det_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + else: + det_value = ( + det_dram_reads, + "${} ({}\\%)$".format( + plot.human_format_thousands( + det_dram_reads, + round_to=thousands_round_to, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + + table_rows.append( + ParallelTableRow( + metric=r"DRAM\\reads", + threads=threads, + serial_value=serial_value, + det_value=det_value, + nondet_values=nondet_values, + ) + ) + + # dram writes (rel err) + serial_dram_writes = int(df.loc[threads_mask, "dram_writes_serial"].values[0]) + det_dram_writes = int(det["dram_writes_parallel"].values[0]) + det_rel_err = det["dram_writes_rel_err"].values[0] + nondet_values = [] + for interleave, n in interleave_n: + nondet = nondet_interleave if interleave else nondet_no_interleave + nondet = nondet[nondet["input_run_ahead_parallel"] == n] + + nondet_dram_writes = int(nondet["dram_writes_parallel"].values[0]) + nondet_rel_err = nondet["dram_writes_rel_err"].values[0] + if all_benchmarks: + nondet_values.append( + ( + 100.0 * nondet_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + else: + nondet_values.append( + ( + nondet_dram_writes, + "${} ({}\\%)$".format( + plot.human_format_thousands( + nondet_dram_writes, + round_to=thousands_round_to, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * nondet_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + ) + + serial_value = ( + None + if all_benchmarks + else ( + serial_dram_writes, + "${}$".format( + plot.human_format_thousands( + serial_dram_writes, + round_to=thousands_round_to, + variable_precision=variable_precision, + ) + ), + ) + ) + if all_benchmarks: + det_value = ( + 100.0 * det_rel_err, + "${}\\%$".format( + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + else: + det_value = ( + det_dram_writes, + "${} ({}\\%)$".format( + plot.human_format_thousands( + det_dram_writes, + round_to=thousands_round_to, + variable_precision=variable_precision, + ), + plot.round_to_precision( + 100.0 * det_rel_err, + round_to=1, + variable_precision=variable_precision, + ), + ), + ) + table_rows.append( + ParallelTableRow( + metric=r"DRAM\\writes", + threads=threads, + serial_value=serial_value, + det_value=det_value, + nondet_values=nondet_values, + ) + ) + return table_rows + + @main.command() # @click.pass_context +@click.option("--path", help="Path to materialized benchmark config") @click.option("--bench", "bench_name", help="Benchmark name") @click.option("--nsight", "nsight", type=bool, is_flag=True, help="use nsight") -def parallel_plot(bench_name, nsight): - # load the materialized benchmark config +def parallel_plot(bench_name, path, nsight): profiler = "nsight" if nsight else "nvprof" - if bench_name is None: - stats_file = REPO_ROOT_DIR / "results/combined.stats.{}.csv".format(profiler) - else: - stats_file = REPO_ROOT_DIR / "results/combined.stats.{}.{}.csv".format( - profiler, bench_name - ) + all_benchmarks = bench_name is None - print("loading {}".format(stats_file)) - selected_df = pd.read_csv(stats_file, header=0) + selected_df = load_stats(bench_name=bench_name, profiler=profiler, path=path) selected_df = selected_df[selected_df["target"] == Target.Simulate.value] - selected_df = selected_df[selected_df["benchmark"] == bench_name] + if not all_benchmarks: + selected_df = selected_df[selected_df["benchmark"] == bench_name] + + num_benchmarks = len(selected_df["benchmark"].unique().tolist()) if not (selected_df["is_release_build"] == True).all(): print(color("WARNING: non release results:", fg="red")) @@ -246,7 +872,9 @@ def parallel_plot(bench_name, nsight): print("====") bench_cols = ["target", "benchmark"] - bench_input_cols = benchmarks.BENCHMARK_INPUT_COLS[bench_name] + bench_input_cols = ( + [] if all_benchmarks else benchmarks.BENCHMARK_INPUT_COLS[bench_name] + ) input_cols = benchmarks.SIMULATE_INPUT_COLS print(bench_input_cols) print(input_cols) @@ -263,10 +891,13 @@ def parallel_plot(bench_name, nsight): aggregations = { **{c: "mean" for c in sorted(serial.columns)}, + **{c: "first" for c in serial.columns if c.startswith("input_")}, **benchmarks.NON_NUMERIC_COLS, } aggregations = {col: agg for col, agg in aggregations.items() if col in serial} - aggregations = {col: agg for col, agg in aggregations.items() if col not in group_cols} + aggregations = { + col: agg for col, agg in aggregations.items() if col not in group_cols + } mean_serial = serial.groupby(group_cols).agg(aggregations).reset_index() metric_cols = ["cycles", "exec_time_sec", "l2_hit_rate", "l1_hit_rate"] @@ -302,14 +933,20 @@ def parallel_plot(bench_name, nsight): print( serial.loc[ serial["input_id"] == input_id, - bench_cols + ["kernel_launch_id"] + bench_input_cols + benchmarks.SIMULATE_INPUT_COLS, + bench_cols + + ["kernel_launch_id"] + + bench_input_cols + + benchmarks.SIMULATE_INPUT_COLS, ] ) print("parallel input", input_id) print( parallel.loc[ parallel["input_id"] == input_id, - bench_cols + ["kernel_launch_id"] + bench_input_cols + benchmarks.SIMULATE_INPUT_COLS, + bench_cols + + ["kernel_launch_id"] + + bench_input_cols + + benchmarks.SIMULATE_INPUT_COLS, ] ) break @@ -318,28 +955,37 @@ def parallel_plot(bench_name, nsight): # join based on input_cols, NOT based on mode joined = parallel.merge( serial, - on=bench_cols + ["kernel_launch_id"] + bench_input_cols + benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS, + on=bench_cols + + ["kernel_launch_id"] + + bench_input_cols + + benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS, how="left", suffixes=("_parallel", "_serial"), ) - print("joined={} parallel={} serial={}".format(joined.shape, parallel.shape, serial.shape)) + print( + "joined={} parallel={} serial={}".format( + joined.shape, parallel.shape, serial.shape + ) + ) assert joined.shape[0] == parallel.shape[0] assert "mean_blocks_per_sm_parallel" in joined assert "total_cores_parallel" in joined assert "cores_per_cluster_parallel" in joined - + if len(joined) == 0: raise ValueError("joined parallel and serial dataframe is empty") PREVIEW_COLS = sorted( - bench_cols - + ["kernel_launch_id"] - + bench_input_cols - + benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS - + [c + "_parallel" for c in benchmarks.SIMULATE_EXECUTION_CONFIG_COLS] - + [c + "_parallel" for c in metric_cols] - + [c + "_serial" for c in metric_cols] - + ["input_id_serial", "input_id_parallel"] + list( + bench_cols + + ["kernel_launch_id"] + + bench_input_cols + + benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS + + [c + "_parallel" for c in benchmarks.SIMULATE_EXECUTION_CONFIG_COLS] + + [c + "_parallel" for c in metric_cols] + + [c + "_serial" for c in metric_cols] + + ["input_id_serial", "input_id_parallel"] + ) ) if True: @@ -360,7 +1006,7 @@ def parallel_plot(bench_name, nsight): # ["benchmark"] + bench_input_cols + [c + "_parallel" for c in SIMULATE_EXECUTION_CONFIG_COLS] ].drop_duplicates() ) - + group_cols = sorted( bench_cols + bench_input_cols @@ -374,7 +1020,9 @@ def parallel_plot(bench_name, nsight): **{c + "_serial": agg for c, agg in benchmarks.NON_NUMERIC_COLS.items()}, } aggregations = {col: agg for col, agg in aggregations.items() if col in joined} - aggregations = {col: agg for col, agg in aggregations.items() if col not in group_cols} + aggregations = { + col: agg for col, agg in aggregations.items() if col not in group_cols + } if set(joined.columns.tolist()) - set(group_cols) != set(aggregations.keys()): pprint( @@ -392,7 +1040,7 @@ def rel_err(true_values, values): true_values = true_values.fillna(0.0) rel_err = (values - true_values).abs() / true_values rel_err = rel_err.fillna(0.0) - rel_err[rel_err == 0.0] = 0.0 + rel_err[rel_err == 0.0] = 0.0 return rel_err def rmse(true_values, values): @@ -454,7 +1102,7 @@ def mae(true_values, values): # grouped.apply(lambda df: print(df.reset_index()[ # bench_cols - # + benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS + # + benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS # + ["input_variant"] # + ["dram_writes_serial", "dram_writes_parallel"] # ])) @@ -536,20 +1184,26 @@ def mae(true_values, values): dict( name="matrixmul", inputs={ - **{"input_dtype": 32, "input_rows": 256}, + **{"input_dtype": 32, "input_rows": 512}, **functional_config, }, ), dict( name="simple_matrixmul", inputs={ - **{"input_dtype": 32, "input_m": 128, "input_n": 128, "input_p": 128}, + **{ + "input_dtype": 32, + "input_m": 512, + "input_n": 32, + "input_p": 512, + }, **functional_config, }, ), - ] + # print(aggregated[bench_input_cols].drop_duplicates()) + def compute_label(bench_config, df): benchmark = df["benchmark"] bench_input_cols = benchmarks.BENCHMARK_INPUT_COLS[benchmark] @@ -567,7 +1221,16 @@ def compute_label(bench_config, df): assert bench_config["inputs"]["input_cores_per_cluster"] == cores_per_cluster assert bench_config["inputs"]["input_num_clusters"] == num_clusters - print(df[["benchmark", "input_cores_per_cluster", "input_num_clusters", "total_cores_parallel"]]) + print( + df[ + [ + "benchmark", + "input_cores_per_cluster", + "input_num_clusters", + "total_cores_parallel", + ] + ] + ) assert total_cores == int(df[["total_cores_parallel"]].values[0]) match benchmark.lower(): @@ -607,384 +1270,527 @@ def compute_label(bench_config, df): ) return label - interleave_n = list(itertools.product([False, True], [5, 10])) + def write_table_row(row, bold_values=None): + if bold_values is None: + bold_values = set() + + def bold(v, formatted_v): + if v in bold_values: + formatted_v = formatted_v.strip() + is_math = formatted_v[0] == "$" and formatted_v[-1] == "$" + if is_math: + return r"\boldmath" + str(formatted_v) + else: + return r"\textbf{" + str(formatted_v) + "}" + return str(formatted_v) + + is_first_metric_row = row.threads == 4 + is_last_metric_row = row.threads == 8 + + table_row = "" + + # metric name + if is_first_metric_row: + table_row += r"\multirow{2}{*}{\shortstack[l]{" + str(row.metric) + r"}}" + + # threads + table_row += r" & $t=" + str(row.threads) + r"$ " + + # serial value + if row.serial_value is not None and is_first_metric_row: + table_row += ( + r" & \multirow{2}{*}{\shortstack[l]{" + + bold(row.serial_value[0], row.serial_value[1]) + + r"}} " + ) + else: + table_row += r" & " - table = "" + # deterministic value + if row.det_value is not None: + table_row += r" & " + bold(row.det_value[0], row.det_value[1]) + else: + table_row += r" & " - for bench_config in selected_benchmarks: - bench_inputs: typing.Dict[str, typing.Any] = bench_config["inputs"] - if not all(aggregated["benchmark"] == bench_config["name"]): - print(color("SKIP: want {} (have {})".format( - bench_config["name"], aggregated["benchmark"][0]), fg="red")) - continue + # nondeterministic value + for nondet_value, formatted_nondet_value in row.nondet_values: + table_row += r" & " + bold(nondet_value, formatted_nondet_value) + table_row += r" \\ " + if is_last_metric_row: + table_row += r" \hline " + table_row += "\n" + return table_row - print("==> {}".format(bench_config["name"])) - mask_cols = ["benchmark"] + list(bench_inputs.keys()) - mask_values = [bench_name] + list(bench_inputs.values()) + table = "" + # thousands_round_to = 1 + # variable_precision = True + + if all_benchmarks: + # mask_cols = ["benchmark"] + list(bench_inputs.keys()) + # mask_values = [bench_name] + list(bench_inputs.values()) # mask = aggregated["benchmark"] == bench_name # for col, value in zip(mask_cols, mask_values): # mask &= aggregated[col] == value # print((aggregated[mask_cols] == mask_values).sum(axis=0)) - - - mask = (aggregated[mask_cols] == mask_values).all(axis=1) - test_df = aggregated.loc[ - mask, - benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS - + bench_input_cols - + ["mean_blocks_per_sm_parallel"]] - test_df = test_df.drop_duplicates() - assert len(test_df) == 1 - - table += "%\n%\n" - table += ( - r"\rowcolor{gray!10} \multicolumn{8}{c}{\textbf{" - + str(compute_label(bench_config, aggregated.loc[mask].iloc[0])) - + r"}} \\ \hline" - + "\n" - ) - - def write_table_row(row, bold_values=None): - if bold_values is None: - bold_values = set() - - def bold(v, formatted_v): - if v in bold_values: - formatted_v = formatted_v.strip() - is_math = formatted_v[0] == "$" and formatted_v[-1] == "$" - if is_math: - return r"\boldmath" + str(formatted_v) - else: - return r"\textbf{" + str(formatted_v) + "}" - return str(formatted_v) - - is_first_metric_row = row.threads == 4 - is_last_metric_row = row.threads == 8 - - table_row = "" - # metric name - if is_first_metric_row: - table_row += ( - r"\multirow{2}{*}{\shortstack[l]{" + str(row.metric) + r"}}" - ) - # threads - table_row += r" & $t=" + str(row.threads) + r"$ " - # serial value - if is_first_metric_row: - table_row += ( - r" & \multirow{2}{*}{\shortstack[l]{" - + bold(row.serial_value[0], row.serial_value[1]) - + r"}} " - ) - else: - table_row += r" & " - # deterministic value - table_row += r" & " + bold(row.det_value[0], row.det_value[1]) - # nondeterministic value - for nondet_value, formatted_nondet_value in row.nondet_values: - table_row += r" & " + bold(nondet_value, formatted_nondet_value) - table_row += r" \\ " - if is_last_metric_row: - table_row += r" \hline " - table_row += "\n" - return table_row - - class TableRow(typing.NamedTuple): - metric: str - threads: int - serial_value: typing.Tuple[float, typing.Union[float, int, str]] - det_value: typing.Tuple[float, typing.Union[float, int, str]] - nondet_values: typing.Sequence[typing.Tuple[float, typing.Union[float, int, str]]] - - def values(self): - return [self.serial_value[0], self.det_value[0]] + [v[0] for v in self.nondet_values] - - table_rows: typing.Sequence[TableRow] = [] - - for threads in [4, 8]: - threads_mask = aggregated["input_threads_parallel"] == threads - det_mask = aggregated["input_mode_parallel"] == "deterministic" - nondet_no_interleave_mask = ( - aggregated["input_mode_parallel"] == "nondeterministic" - ) - nondet_interleave_mask = ( - aggregated["input_mode_parallel"] == "nondeterministic_interleave" + for functional_config in functional_configs: + mask_cols = list(functional_config.keys()) + mask_values = list(functional_config.values()) + mask = (aggregated[mask_cols] == mask_values).all(axis=1) + + # df = aggregated[mask] + # test_df = aggregated.loc[ + # mask, + # benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS + # + bench_input_cols + # + ["mean_blocks_per_sm_parallel"]] + # test_df = test_df.drop_duplicates() + # assert len(test_df) == 1 + + label = "Average @ {} SM's".format( # [{:.2f} CTA/SM]".format( + int(aggregated.loc[mask, "total_cores_parallel"].values[0]), + # float(aggregated.loc[mask, "mean_blocks_per_sm_parallel"].values[0]), ) - # print([m.sum() for m in [ - # mask, threads_mask, det_mask, nondet_no_interleave_mask, nondet_interleave_mask - # ]]) - det = aggregated[mask & threads_mask & det_mask] - print( - det[ - bench_input_cols - + [ - "input_threads_parallel", - "exec_time_sec_parallel", - "input_id_parallel", - "input_id_serial", - # "dram_reads_serial", - # "dram_reads_parallel", - # "dram_reads_rel_err", - "dram_writes_serial", - "dram_writes_parallel", - "dram_writes_rel_err", - ] + different_cols(det) - ] - ) - print("===") - assert len(det) == 1 - nondet_no_interleave = aggregated[ - mask & threads_mask & nondet_no_interleave_mask - ] - assert len(nondet_no_interleave) == 2 - nondet_interleave = aggregated[ - mask & threads_mask & nondet_interleave_mask - ] - assert len(nondet_interleave) == 2 - assert ( - len( - aggregated.loc[ - mask, - [ - "exec_time_sec_serial", - "cycles_serial", - "input_id_serial", - ], - ].drop_duplicates() - ) - == 1 + table += "%\n%\n" + table += ( + r"\rowcolor{gray!10} \multicolumn{8}{c}{\textbf{" + + label + + r"}} \\ \hline" + + "\n" ) - thousands_round_to = 1 - variable_precision = True - - # exec time (speedup) - serial_exec_time = aggregated.loc[ - mask & threads_mask, "exec_time_sec_serial" - ].values[0] - det_exec_time = det["exec_time_sec_parallel"].values[0] - det_speedup = det["exec_time_sec_speedup"].values[0] - nondet_values = [] - for interleave, n in interleave_n: - nondet = nondet_interleave if interleave else nondet_no_interleave - nondet = nondet[nondet["input_run_ahead_parallel"] == n] - nondet_exec_time = nondet["exec_time_sec_parallel"].values[0] - nondet_speedup = nondet["exec_time_sec_speedup"].values[0] - nondet_values.append( - (nondet_exec_time, "${:>3.1f}s~({}x)$".format( - nondet_exec_time, - plot.round_to_precision(nondet_speedup, round_to=1, variable_precision=variable_precision) - )) - ) - - table_rows.append( - TableRow( - metric=r"exec\\time", - threads=threads, - serial_value=(serial_exec_time, "${:>3.1f}s$".format(serial_exec_time)), - det_value=(det_exec_time, "${:>3.1f}s~({}x)$".format( - det_exec_time, - plot.round_to_precision(det_speedup, round_to=1, - variable_precision=variable_precision) - )), - nondet_values=nondet_values, - ) + table_rows: typing.Sequence[ParallelTableRow] = build_parallel_table_rows( + aggregated[mask], num_benchmarks=num_benchmarks, all_benchmarks=True ) - # cycles (rel err) - serial_cycles = int(aggregated.loc[mask & threads_mask, "cycles_serial"].values[0]) - det_cycles = int(det["cycles_parallel"].values[0]) - det_rel_err = det["cycles_rel_err"].values[0] - nondet_values = [] - for interleave, n in interleave_n: - nondet = nondet_interleave if interleave else nondet_no_interleave - nondet = nondet[nondet["input_run_ahead_parallel"] == n] - - nondet_cycles = int(nondet["cycles_parallel"].values[0]) - nondet_rel_err = nondet["cycles_rel_err"].values[0] - nondet_values.append( - (nondet_cycles, "${} ({}\\%)$".format( - plot.human_format_thousands(nondet_cycles, round_to=thousands_round_to, variable_precision=variable_precision), - plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, - variable_precision=variable_precision) - - )) - ) + # for threads in [4, 8]: + # threads_mask = aggregated["input_threads_parallel"] == threads + # det_mask = aggregated["input_mode_parallel"] == "deterministic" + # nondet_no_interleave_mask = ( + # aggregated["input_mode_parallel"] == "nondeterministic" + # ) + # nondet_interleave_mask = ( + # aggregated["input_mode_parallel"] == "nondeterministic_interleave" + # ) + # + # det = aggregated[mask & threads_mask & det_mask] + # + # # det_preview = det[ + # # PREVIEW_COLS + # # + ["input_threads_parallel", "cycles_rel_err", "exec_time_sec_speedup"] + # # ] + # + # print("===") + # nondet_no_interleave = aggregated[ + # mask & threads_mask & nondet_no_interleave_mask + # ] + # nondet_interleave = aggregated[ + # mask & threads_mask & nondet_interleave_mask + # ] + # + # assert len(det) == num_benchmarks + # assert len(nondet_interleave) == 2 * num_benchmarks + # assert len(nondet_no_interleave) == 2 * num_benchmarks + # + # # exec time (speedup) + # det_speedup = det["exec_time_sec_speedup"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # nondet_speedup = nondet["exec_time_sec_speedup"].values[0] + # nondet_values.append( + # (nondet_speedup, "${}x$".format( + # plot.round_to_precision(nondet_speedup, round_to=1, variable_precision=variable_precision) + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric=r"exec\\time", + # threads=threads, + # serial_value=None, + # det_value=(det_speedup, "${}x$".format( + # plot.round_to_precision(det_speedup, round_to=1, + # variable_precision=variable_precision) + # )), + # nondet_values=nondet_values, + # ) + # ) + # + # # cycles (rel err) + # serial_cycles = int(aggregated.loc[mask & threads_mask, "cycles_serial"].values[0]) + # det_cycles = int(det["cycles_parallel"].values[0]) + # det_rel_err = det["cycles_rel_err"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # + # nondet_cycles = int(nondet["cycles_parallel"].values[0]) + # nondet_rel_err = nondet["cycles_rel_err"].values[0] + # nondet_values.append( + # (nondet_cycles, "${} ({}\\%)$".format( + # plot.human_format_thousands(nondet_cycles, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, + # variable_precision=variable_precision) + # + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric="cycles", + # threads=threads, + # serial_value=(serial_cycles, "${}$".format(plot.human_format_thousands(serial_cycles, round_to=thousands_round_to, variable_precision=variable_precision))), + # det_value=(det_cycles, "${} ({}\\%)$".format( + # plot.human_format_thousands(det_cycles, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * det_rel_err, round_to=1, + # variable_precision=variable_precision) + # + # )), + # nondet_values=nondet_values, + # ) + # ) + + table += "%\n%\n" + + table_rows = sorted(table_rows, key=lambda row: (row.metric, row.threads)) + for row in table_rows: + bold_values = [] + if row.metric == r"exec\\time": + bold_values = [np.amin(row.values())] + print(row.metric, bold_values, row.values()) + table += write_table_row(row, bold_values) - table_rows.append( - TableRow( - metric="cycles", - threads=threads, - serial_value=(serial_cycles, "${}$".format(plot.human_format_thousands(serial_cycles, round_to=thousands_round_to, variable_precision=variable_precision))), - det_value=(det_cycles, "${} ({}\\%)$".format( - plot.human_format_thousands(det_cycles, round_to=thousands_round_to, variable_precision=variable_precision), - plot.round_to_precision(100.0 * det_rel_err, round_to=1, - variable_precision=variable_precision) - - )), - nondet_values=nondet_values, - ) - ) - - # l1 data hit rate (rel err) - serial_l1_hit_rate = aggregated.loc[mask & threads_mask, "l1_hit_rate_serial"].values[0] - det_l1_hit_rate = det["l1_hit_rate_parallel"].values[0] - det_rel_err = det["l1_hit_rate_rel_err"].values[0] - nondet_values = [] - for interleave, n in interleave_n: - nondet = nondet_interleave if interleave else nondet_no_interleave - nondet = nondet[nondet["input_run_ahead_parallel"] == n] - - nondet_l1_hit_rate = nondet["l1_hit_rate_parallel"].values[0] - nondet_rel_err = nondet["l1_hit_rate_rel_err"].values[0] - nondet_values.append( - (100.0 * nondet_l1_hit_rate, "${}\\%~({}\\%)$".format( - plot.round_to_precision(100.0 * nondet_l1_hit_rate, round_to=1, - variable_precision=variable_precision), - plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, - variable_precision=variable_precision), - )) - ) - - table_rows.append( - TableRow( - metric=r"L1D\\hit rate", - threads=threads, - serial_value=(100.0 * serial_l1_hit_rate, "${:>2.1f}\\%$".format(100.0 * serial_l1_hit_rate)), - det_value=(100.0 * det_l1_hit_rate, "${}\\%~({}\\%)$".format( - plot.round_to_precision(100.0 * det_l1_hit_rate, round_to=1, - variable_precision=variable_precision), - plot.round_to_precision(100.0 * det_rel_err, round_to=1, - variable_precision=variable_precision), - )), - nondet_values=nondet_values, - ) - ) - - # l2 data hit rate (rel err) - serial_l2_hit_rate = aggregated.loc[mask & threads_mask, "l2_hit_rate_serial"].values[0] - det_l2_hit_rate = det["l2_hit_rate_parallel"].values[0] - det_rel_err = det["l2_hit_rate_rel_err"].values[0] - nondet_values = [] - for interleave, n in interleave_n: - nondet = nondet_interleave if interleave else nondet_no_interleave - nondet = nondet[nondet["input_run_ahead_parallel"] == n] - - nondet_l2_hit_rate = nondet["l2_hit_rate_parallel"].values[0] - nondet_rel_err = nondet["l2_hit_rate_rel_err"].values[0] - nondet_values.append( - (100.0 * nondet_l2_hit_rate, "${}\\%~({}\\%)$".format( - plot.round_to_precision(100.0 * nondet_l2_hit_rate, round_to=1, - variable_precision=variable_precision), - plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, - variable_precision=variable_precision), - )) - ) - - table_rows.append( - TableRow( - metric=r"L2D\\hit rate", - threads=threads, - serial_value=( - 100.0 * serial_l2_hit_rate, - "${}\\%$".format( - plot.round_to_precision( - 100.0 * serial_l2_hit_rate, - round_to=1, variable_precision=variable_precision) - )), - det_value=(100.0 * det_l2_hit_rate, "${}\\%~({}\\%)$".format( - plot.round_to_precision(100.0 * det_l2_hit_rate, round_to=1, - variable_precision=variable_precision), - plot.round_to_precision(100.0 * det_rel_err, round_to=1, - variable_precision=variable_precision), - )), - nondet_values=nondet_values, - ) - ) - - # dram reads (rel err) - serial_dram_reads = int(aggregated.loc[mask & threads_mask, "dram_reads_serial"].values[0]) - det_dram_reads = int(det["dram_reads_parallel"].values[0]) - det_rel_err = det["dram_reads_rel_err"].values[0] - nondet_values = [] - for interleave, n in interleave_n: - nondet = nondet_interleave if interleave else nondet_no_interleave - nondet = nondet[nondet["input_run_ahead_parallel"] == n] - - nondet_dram_reads = int(nondet["dram_reads_parallel"].values[0]) - nondet_rel_err = nondet["dram_reads_rel_err"].values[0] - nondet_values.append( - (nondet_dram_reads, "${} ({}\\%)$".format( - plot.human_format_thousands(nondet_dram_reads, round_to=thousands_round_to, variable_precision=variable_precision), - plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, - variable_precision=variable_precision), - - )) - ) - - table_rows.append( - TableRow( - metric=r"DRAM\\reads", - threads=threads, - serial_value=(serial_dram_reads, "${}$".format(plot.human_format_thousands(serial_dram_reads, round_to=thousands_round_to, variable_precision=variable_precision))), - det_value=(det_dram_reads, "${} ({}\\%)$".format( - plot.human_format_thousands(det_dram_reads, round_to=thousands_round_to, variable_precision=variable_precision), - plot.round_to_precision(100.0 * det_rel_err, round_to=1, - variable_precision=variable_precision), - - )), - nondet_values=nondet_values, + else: + for bench_config in selected_benchmarks: + bench_inputs: typing.Dict[str, typing.Any] = bench_config["inputs"] + if not all(aggregated["benchmark"] == bench_config["name"]): + print( + color( + "SKIP: want {} (have {})".format( + bench_config["name"], aggregated["benchmark"][0] + ), + fg="red", + ) ) + continue + + print("==> {}".format(bench_config["name"])) + mask_cols = ["benchmark"] + list(bench_inputs.keys()) + mask_values = [bench_name] + list(bench_inputs.values()) + # mask = aggregated["benchmark"] == bench_name + # for col, value in zip(mask_cols, mask_values): + # mask &= aggregated[col] == value + # print((aggregated[mask_cols] == mask_values).sum(axis=0)) + + mask = (aggregated[mask_cols] == mask_values).all(axis=1) + test_df = aggregated.loc[ + mask, + benchmarks.SIMULATE_FUNCTIONAL_CONFIG_COLS + + bench_input_cols + + ["mean_blocks_per_sm_parallel"], + ] + test_df = test_df.drop_duplicates() + print(test_df) + assert len(test_df) == 1 + + table += "%\n%\n" + table += ( + r"\rowcolor{gray!10} \multicolumn{8}{c}{\textbf{" + + str(compute_label(bench_config, aggregated.loc[mask].iloc[0])) + + r"}} \\ \hline" + + "\n" ) - # dram writes (rel err) - serial_dram_writes = int( - aggregated.loc[mask & threads_mask, "dram_writes_serial"].values[0] + table_rows: typing.Sequence[ParallelTableRow] = build_parallel_table_rows( + aggregated[mask], num_benchmarks=num_benchmarks, all_benchmarks=False ) - det_dram_writes = int(det["dram_writes_parallel"].values[0]) - det_rel_err = det["dram_writes_rel_err"].values[0] - nondet_values = [] - for interleave, n in interleave_n: - nondet = nondet_interleave if interleave else nondet_no_interleave - nondet = nondet[nondet["input_run_ahead_parallel"] == n] - - nondet_dram_writes = int(nondet["dram_writes_parallel"].values[0]) - nondet_rel_err = nondet["dram_writes_rel_err"].values[0] - nondet_values.append( - (nondet_dram_writes, "${} ({}\\%)$".format( - plot.human_format_thousands(nondet_dram_writes, round_to=thousands_round_to, variable_precision=variable_precision), - plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, - variable_precision=variable_precision), - - )) - ) - - table_rows.append( - TableRow( - metric=r"DRAM\\writes", - threads=threads, - serial_value=(serial_dram_writes, "${}$".format(plot.human_format_thousands(serial_dram_writes, round_to=thousands_round_to, variable_precision=variable_precision))), - # serial_value="${:>4}$".format(), - det_value=(det_dram_writes, "${} ({}\\%)$".format( - plot.human_format_thousands(det_dram_writes, round_to=thousands_round_to, variable_precision=variable_precision), - plot.round_to_precision(100.0 * det_rel_err, round_to=1, - variable_precision=variable_precision), - - )), - nondet_values=nondet_values, - ) - ) - - table += "%\n%\n" - - table_rows = sorted(table_rows, key=lambda row: (row.metric, row.threads)) - for row in table_rows: - bold_values = [] - if row.metric == r"exec\\time": - bold_values = [np.amin(row.values())] - print(row.metric, bold_values, row.values()) - table += write_table_row(row, bold_values) + # table_rows: typing.Sequence[TableRow] = [] + + # for threads in [4, 8]: + # threads_mask = aggregated["input_threads_parallel"] == threads + # det_mask = aggregated["input_mode_parallel"] == "deterministic" + # nondet_no_interleave_mask = ( + # aggregated["input_mode_parallel"] == "nondeterministic" + # ) + # nondet_interleave_mask = ( + # aggregated["input_mode_parallel"] == "nondeterministic_interleave" + # ) + # # print([m.sum() for m in [ + # # mask, threads_mask, det_mask, nondet_no_interleave_mask, nondet_interleave_mask + # # ]]) + # + # det = aggregated[mask & threads_mask & det_mask] + # print( + # det[ + # bench_input_cols + # + [ + # "input_threads_parallel", + # "exec_time_sec_parallel", + # "input_id_parallel", + # "input_id_serial", + # # "dram_reads_serial", + # # "dram_reads_parallel", + # # "dram_reads_rel_err", + # "dram_writes_serial", + # "dram_writes_parallel", + # "dram_writes_rel_err", + # ] + different_cols(det) + # ] + # ) + # print("===") + # nondet_no_interleave = aggregated[ + # mask & threads_mask & nondet_no_interleave_mask + # ] + # nondet_interleave = aggregated[ + # mask & threads_mask & nondet_interleave_mask + # ] + # + # assert len(det) == num_benchmarks + # assert len(nondet_no_interleave) == 2 * num_benchmarks + # assert len(nondet_interleave) == 2 * num_benchmarks + # assert ( + # len( + # aggregated.loc[ + # mask, + # [ + # "exec_time_sec_serial", + # "cycles_serial", + # "input_id_serial", + # ], + # ].drop_duplicates() + # ) + # == 1 + # ) + # + # # exec time (speedup) + # serial_exec_time = aggregated.loc[ + # mask & threads_mask, "exec_time_sec_serial" + # ].values[0] + # det_exec_time = det["exec_time_sec_parallel"].values[0] + # det_speedup = det["exec_time_sec_speedup"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # nondet_exec_time = nondet["exec_time_sec_parallel"].values[0] + # nondet_speedup = nondet["exec_time_sec_speedup"].values[0] + # nondet_values.append( + # (nondet_exec_time, "${:>3.1f}s~({}x)$".format( + # nondet_exec_time, + # plot.round_to_precision(nondet_speedup, round_to=1, variable_precision=variable_precision) + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric=r"exec\\time", + # threads=threads, + # serial_value=(serial_exec_time, "${:>3.1f}s$".format(serial_exec_time)), + # det_value=(det_exec_time, "${:>3.1f}s~({}x)$".format( + # det_exec_time, + # plot.round_to_precision(det_speedup, round_to=1, + # variable_precision=variable_precision) + # )), + # nondet_values=nondet_values, + # ) + # ) + # + # # cycles (rel err) + # serial_cycles = int(aggregated.loc[mask & threads_mask, "cycles_serial"].values[0]) + # det_cycles = int(det["cycles_parallel"].values[0]) + # det_rel_err = det["cycles_rel_err"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # + # nondet_cycles = int(nondet["cycles_parallel"].values[0]) + # nondet_rel_err = nondet["cycles_rel_err"].values[0] + # nondet_values.append( + # (nondet_cycles, "${} ({}\\%)$".format( + # plot.human_format_thousands(nondet_cycles, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, + # variable_precision=variable_precision) + # + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric="cycles", + # threads=threads, + # serial_value=(serial_cycles, "${}$".format(plot.human_format_thousands(serial_cycles, round_to=thousands_round_to, variable_precision=variable_precision))), + # det_value=(det_cycles, "${} ({}\\%)$".format( + # plot.human_format_thousands(det_cycles, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * det_rel_err, round_to=1, + # variable_precision=variable_precision) + # + # )), + # nondet_values=nondet_values, + # ) + # ) + # + # # l1 data hit rate (rel err) + # serial_l1_hit_rate = aggregated.loc[mask & threads_mask, "l1_hit_rate_serial"].values[0] + # det_l1_hit_rate = det["l1_hit_rate_parallel"].values[0] + # det_rel_err = det["l1_hit_rate_rel_err"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # + # nondet_l1_hit_rate = nondet["l1_hit_rate_parallel"].values[0] + # nondet_rel_err = nondet["l1_hit_rate_rel_err"].values[0] + # nondet_values.append( + # (100.0 * nondet_l1_hit_rate, "${}\\%~({}\\%)$".format( + # plot.round_to_precision(100.0 * nondet_l1_hit_rate, round_to=1, + # variable_precision=variable_precision), + # plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, + # variable_precision=variable_precision), + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric=r"L1D\\hit rate", + # threads=threads, + # serial_value=(100.0 * serial_l1_hit_rate, "${:>2.1f}\\%$".format(100.0 * serial_l1_hit_rate)), + # det_value=(100.0 * det_l1_hit_rate, "${}\\%~({}\\%)$".format( + # plot.round_to_precision(100.0 * det_l1_hit_rate, round_to=1, + # variable_precision=variable_precision), + # plot.round_to_precision(100.0 * det_rel_err, round_to=1, + # variable_precision=variable_precision), + # )), + # nondet_values=nondet_values, + # ) + # ) + # + # # l2 data hit rate (rel err) + # serial_l2_hit_rate = aggregated.loc[mask & threads_mask, "l2_hit_rate_serial"].values[0] + # det_l2_hit_rate = det["l2_hit_rate_parallel"].values[0] + # det_rel_err = det["l2_hit_rate_rel_err"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # + # nondet_l2_hit_rate = nondet["l2_hit_rate_parallel"].values[0] + # nondet_rel_err = nondet["l2_hit_rate_rel_err"].values[0] + # nondet_values.append( + # (100.0 * nondet_l2_hit_rate, "${}\\%~({}\\%)$".format( + # plot.round_to_precision(100.0 * nondet_l2_hit_rate, round_to=1, + # variable_precision=variable_precision), + # plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, + # variable_precision=variable_precision), + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric=r"L2D\\hit rate", + # threads=threads, + # serial_value=( + # 100.0 * serial_l2_hit_rate, + # "${}\\%$".format( + # plot.round_to_precision( + # 100.0 * serial_l2_hit_rate, + # round_to=1, variable_precision=variable_precision) + # )), + # det_value=(100.0 * det_l2_hit_rate, "${}\\%~({}\\%)$".format( + # plot.round_to_precision(100.0 * det_l2_hit_rate, round_to=1, + # variable_precision=variable_precision), + # plot.round_to_precision(100.0 * det_rel_err, round_to=1, + # variable_precision=variable_precision), + # )), + # nondet_values=nondet_values, + # ) + # ) + # + # # dram reads (rel err) + # serial_dram_reads = int(aggregated.loc[mask & threads_mask, "dram_reads_serial"].values[0]) + # det_dram_reads = int(det["dram_reads_parallel"].values[0]) + # det_rel_err = det["dram_reads_rel_err"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # + # nondet_dram_reads = int(nondet["dram_reads_parallel"].values[0]) + # nondet_rel_err = nondet["dram_reads_rel_err"].values[0] + # nondet_values.append( + # (nondet_dram_reads, "${} ({}\\%)$".format( + # plot.human_format_thousands(nondet_dram_reads, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, + # variable_precision=variable_precision), + # + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric=r"DRAM\\reads", + # threads=threads, + # serial_value=(serial_dram_reads, "${}$".format(plot.human_format_thousands(serial_dram_reads, round_to=thousands_round_to, variable_precision=variable_precision))), + # det_value=(det_dram_reads, "${} ({}\\%)$".format( + # plot.human_format_thousands(det_dram_reads, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * det_rel_err, round_to=1, + # variable_precision=variable_precision), + # + # )), + # nondet_values=nondet_values, + # ) + # ) + # + # # dram writes (rel err) + # serial_dram_writes = int( + # aggregated.loc[mask & threads_mask, "dram_writes_serial"].values[0] + # ) + # det_dram_writes = int(det["dram_writes_parallel"].values[0]) + # det_rel_err = det["dram_writes_rel_err"].values[0] + # nondet_values = [] + # for interleave, n in interleave_n: + # nondet = nondet_interleave if interleave else nondet_no_interleave + # nondet = nondet[nondet["input_run_ahead_parallel"] == n] + # + # nondet_dram_writes = int(nondet["dram_writes_parallel"].values[0]) + # nondet_rel_err = nondet["dram_writes_rel_err"].values[0] + # nondet_values.append( + # (nondet_dram_writes, "${} ({}\\%)$".format( + # plot.human_format_thousands(nondet_dram_writes, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * nondet_rel_err, round_to=1, + # variable_precision=variable_precision), + # + # )) + # ) + # + # table_rows.append( + # TableRow( + # metric=r"DRAM\\writes", + # threads=threads, + # serial_value=(serial_dram_writes, "${}$".format(plot.human_format_thousands(serial_dram_writes, round_to=thousands_round_to, variable_precision=variable_precision))), + # # serial_value="${:>4}$".format(), + # det_value=(det_dram_writes, "${} ({}\\%)$".format( + # plot.human_format_thousands(det_dram_writes, round_to=thousands_round_to, variable_precision=variable_precision), + # plot.round_to_precision(100.0 * det_rel_err, round_to=1, + # variable_precision=variable_precision), + # + # )), + # nondet_values=nondet_values, + # ) + # ) + + table += "%\n%\n" + + table_rows = sorted(table_rows, key=lambda row: (row.metric, row.threads)) + for row in table_rows: + bold_values = [] + if row.metric == r"exec\\time": + bold_values = [np.amin(row.values())] + print(row.metric, bold_values, row.values()) + table += write_table_row(row, bold_values) print(table) utils.copy_to_clipboard(table) @@ -994,27 +1800,46 @@ def values(self): def flatten(l): return [item for ll in l for item in ll] -@main.command() -@click.option("--path", help="Path to materialized benchmark config") -@click.option("--bench", "bench_name_arg", help="Benchmark name") -@click.option("--nsight", "nsight", type=bool, is_flag=True, help="use nsight") -def correlation_plots(path, bench_name_arg, nsight): - profiler = "nsight" if nsight else "nvprof" + +def load_stats(bench_name, profiler="nvprof", path=None) -> pd.DataFrame: stats = [] - if bench_name_arg is not None: - stats.append(pd.read_csv(REPO_ROOT_DIR / "results/combined.stats.{}.{}.csv".format( - profiler, bench_name_arg - ))) + if bench_name is not None: + stats_file = REPO_ROOT_DIR / "results/combined.stats.{}.{}.csv".format( + profiler, bench_name + ) + print("loading {}".format(stats_file)) + df = pd.read_csv(stats_file, header=0) + if len(df) < 1: + print(color("WARNING: {} is empty!".format(stats_file), fg="red")) + else: + stats.append(df) else: b = Benchmarks(path) benches = flatten(list(b.benchmarks[Target.Profile.value].values())) bench_names = set([b["name"] for b in benches]) for bench_name in bench_names: - stats.append(pd.read_csv(REPO_ROOT_DIR / "results/combined.stats.{}.{}.csv".format( + stats_file = REPO_ROOT_DIR / "results/combined.stats.{}.{}.csv".format( profiler, bench_name - ))) - stats = pd.concat(stats, ignore_index=False) - stats = stats.sort_values(["benchmark", "target"]) + ) + print("loading {}".format(stats_file)) + df = pd.read_csv(stats_file, header=0) + if len(df) < 1: + print(color("WARNING: {} is empty!".format(stats_file), fg="red")) + else: + stats.append(df) + + stats_df = pd.concat(stats, ignore_index=False) + stats_df = stats_df.sort_values(["benchmark", "target"]) + return stats_df + + +@main.command() +@click.option("--path", help="Path to materialized benchmark config") +@click.option("--bench", "bench_name_arg", help="Benchmark name") +@click.option("--nsight", "nsight", type=bool, is_flag=True, help="use nsight") +def correlation_plots(path, bench_name_arg, nsight): + profiler = "nsight" if nsight else "nvprof" + stats = load_stats(bench_name=bench_name_arg, profiler=profiler, path=path) print(stats.shape) stat_cols = stat_cols_for_profiler(profiler) @@ -1024,7 +1849,6 @@ def correlation_plots(path, bench_name_arg, nsight): stat_cols += ["instructions", "l2_accesses", "dram_reads", "dram_writes"] for stat_col in stat_cols: - stat_config = STAT_CONFIGS.get(stat_col) or StatConfig( **{**DEFAULT_STAT_CONFIG._asdict(), **dict(label=stat_col)} ) @@ -1038,7 +1862,9 @@ def correlation_plots(path, bench_name_arg, nsight): print(bench_name) bench_input_cols = benchmarks.BENCHMARK_INPUT_COLS[bench_name] - bench_df = bench_df.set_index(["target"] + benchmarks.SIMULATE_INPUT_COLS).sort_index() + bench_df = bench_df.set_index( + ["target"] + benchmarks.SIMULATE_INPUT_COLS + ).sort_index() def gpucachesim_baseline(target, memory_only=False): # "input_mode", "input_threads", "input_run_ahead", @@ -1051,7 +1877,9 @@ def gpucachesim_baseline(target, memory_only=False): **{c: "mean" for c in set(bench_df.columns) - set(group_cols)}, **benchmarks.NON_NUMERIC_COLS, } - aggregations = {col: agg for col, agg in aggregations.items() if col in bench_df} + aggregations = { + col: agg for col, agg in aggregations.items() if col in bench_df + } native = bench_df.loc[Target.Profile.value] native = native.groupby(bench_input_cols).agg(aggregations) @@ -1059,14 +1887,24 @@ def gpucachesim_baseline(target, memory_only=False): accelsim = bench_df.loc[Target.AccelsimSimulate.value] accelsim = accelsim.groupby(bench_input_cols).agg(aggregations) - gpucachesim = bench_df.loc[gpucachesim_baseline(target=Target.Simulate.value, memory_only=False)] + gpucachesim = bench_df.loc[ + gpucachesim_baseline(target=Target.Simulate.value, memory_only=False) + ] gpucachesim = gpucachesim.groupby(bench_input_cols).agg(aggregations) - gpucachesim_memory_only = bench_df.loc[gpucachesim_baseline(Target.Simulate.value, memory_only=True)] - gpucachesim_memory_only = gpucachesim_memory_only.groupby(bench_input_cols).agg(aggregations) + gpucachesim_memory_only = bench_df.loc[ + gpucachesim_baseline(Target.Simulate.value, memory_only=True) + ] + gpucachesim_memory_only = gpucachesim_memory_only.groupby( + bench_input_cols + ).agg(aggregations) - gpucachesim_trace_reconstruction = bench_df.loc[Target.ExecDrivenSimulate.value] - gpucachesim_trace_reconstruction = gpucachesim_trace_reconstruction.groupby(bench_input_cols).agg(aggregations) + gpucachesim_trace_reconstruction = bench_df.loc[ + Target.ExecDrivenSimulate.value + ] + gpucachesim_trace_reconstruction = gpucachesim_trace_reconstruction.groupby( + bench_input_cols + ).agg(aggregations) print("native ", native.shape) print("accelsim ", accelsim.shape) @@ -1074,16 +1912,23 @@ def gpucachesim_baseline(target, memory_only=False): print("gpucachesim (mem only) ", gpucachesim_memory_only.shape) print("gpucachesim (exec driven) ", gpucachesim_trace_reconstruction.shape) - targets = [ (("native", "native", "o"), native), (("AccelSim", "accelsim", "o"), accelsim), (("gpucachesim", "gpucachesim", "o"), gpucachesim), - (("gpucachesim (memory only)", "gpucachesim", "x"), gpucachesim_memory_only), - (("gpucachesim (exec driven)", "gpucachesim", "D"), gpucachesim_trace_reconstruction), + ( + ("gpucachesim (memory only)", "gpucachesim", "x"), + gpucachesim_memory_only, + ), + ( + ("gpucachesim (exec driven)", "gpucachesim", "D"), + gpucachesim_trace_reconstruction, + ), ] - assert all([len(target_df) == len(targets[0][1]) for _, target_df in targets]) - + assert all( + [len(target_df) == len(targets[0][1]) for _, target_df in targets] + ) + plt.rcParams.update({"font.size": fontsize, "font.family": font_family}) fig = plt.figure( figsize=(0.5 * plot.DINA4_WIDTH_INCHES, 0.5 * plot.DINA4_WIDTH_INCHES), @@ -1115,14 +1960,17 @@ def gpucachesim_baseline(target, memory_only=False): stat_col_min = all_targets_df[stat_col].min() stat_col_max = all_targets_df[stat_col].max() - + if stat_config.log_y_axis: log_stat_col_max = np.ceil(np.log10(stat_col_max)) - stat_col_max = 10 ** log_stat_col_max + stat_col_max = 10**log_stat_col_max log_stat_col_min = np.floor(np.log10(stat_col_min)) - stat_col_min = 10 ** log_stat_col_min - tick_values = np.arange(log_stat_col_min, log_stat_col_max, - step=int(np.ceil(log_stat_col_max / 6))) + stat_col_min = 10**log_stat_col_min + tick_values = np.arange( + log_stat_col_min, + log_stat_col_max, + step=int(np.ceil(log_stat_col_max / 6)), + ) tick_values = np.power(10, tick_values) xyrange = np.arange(1, stat_col_max) @@ -1131,7 +1979,7 @@ def gpucachesim_baseline(target, memory_only=False): else: xyrange = np.arange(stat_col_min, stat_col_max, step=1) tick_values = np.linspace(stat_col_min, stat_col_max, 6) - + ax.plot( xyrange, xyrange, @@ -1149,8 +1997,10 @@ def gpucachesim_baseline(target, memory_only=False): zorder=1, ) - tick_labels = [plot.human_format_thousands(v, round_to=0) for v in tick_values] - + tick_labels = [ + plot.human_format_thousands(v, round_to=0) for v in tick_values + ] + ax.set_ylabel(ylabel) ax.set_xlabel(xlabel) ax.set_xticks(tick_values, tick_labels) @@ -1165,10 +2015,12 @@ def gpucachesim_baseline(target, memory_only=False): filename.parent.mkdir(parents=True, exist_ok=True) fig.savefig(filename) - # create one plot for all benchmarks if bench_name_arg is not None: - bench_df = stats.set_index(["target"] + benchmarks.SIMULATE_INPUT_COLS).sort_index() + bench_df = stats.set_index( + ["target"] + benchmarks.SIMULATE_INPUT_COLS + ).sort_index() + def stat_cols_for_profiler(profiler: str) -> typing.Sequence[str]: stat_cols = [ @@ -1220,6 +2072,7 @@ class StatConfig(typing.NamedTuple): grid: bool percent: bool + DEFAULT_STAT_CONFIG = StatConfig( label="", log_y_axis=False, @@ -1412,7 +2265,8 @@ def compute_label(df): label = "Transpose\n" label += "{}\n".format(df["input_variant"]) label += "{}x{}".format( - int(df["input_dim"]), int(df["input_dim"]), + int(df["input_dim"]), + int(df["input_dim"]), ) case "babelstream": label = "BabelStream\n" @@ -1432,7 +2286,7 @@ def compute_target_name(name): return "AccelSim" case "profile": return "Native" - + per_kernel["label"] = per_kernel.apply(compute_label, axis=1) per_kernel["target_name"] = per_kernel["target"].apply(compute_target_name) @@ -1521,7 +2375,9 @@ def compute_target_name(name): stat_col, target_name, target_idx, - str(inputs[benchmarks.BENCHMARK_INPUT_COLS[benchmark]].tolist()), + str( + inputs[benchmarks.BENCHMARK_INPUT_COLS[benchmark]].tolist() + ), inputs_idx, idx, target_df[stat_col].fillna(0.0).mean(), @@ -1590,7 +2446,7 @@ def compute_target_name(name): if stat_config.log_y_axis: assert not stat_config.percent ymax_log = np.ceil(np.log10(ymax)) - ytick_values = np.arange(0, ymax_log+1, step=int(np.ceil(ymax_log / 6))) + ytick_values = np.arange(0, ymax_log + 1, step=int(np.ceil(ymax_log / 6))) ytick_values = np.power(10, ytick_values) print(stat_col, ymax_log, ytick_values) ax.set_yscale("log", base=10) @@ -1606,13 +2462,17 @@ def compute_target_name(name): ax.set_ylim(0, ymax) ytick_values = np.linspace(0, ymax, 6) - ytick_labels = [plot.human_format_thousands(v, round_to=0) for v in ytick_values] + ytick_labels = [ + plot.human_format_thousands(v, round_to=0) for v in ytick_values + ] ax.set_yticks(ytick_values, ytick_labels) ax.legend( - loc='upper left', + loc="upper left", bbox_to_anchor=(1, 1), - edgecolor="none", fancybox=False, shadow=False, + edgecolor="none", + fancybox=False, + shadow=False, ) filename = plot.PLOT_DIR / "validation/{}.{}.{}.pdf".format( profiler, bench_name, stat_col @@ -1647,13 +2507,25 @@ def compute_target_name(name): help="target", ) @click.option("--verbose", "verbose", type=bool, is_flag=True, help="verbose output") -@click.option("--strict", "strict", type=bool, default=True, help="fail on missing results") +@click.option( + "--strict", "strict", type=bool, default=True, help="fail on missing results" +) @click.option("--nvprof", "nvprof", type=bool, default=True, help="use nvprof") @click.option("--nsight", "nsight", type=bool, default=False, help="use nsight") @click.option("--out", "output_path", help="Output path for combined stats") def generate( - path, config_path, bench_name, input_idx, limit, quick, - target, verbose, strict, nvprof, nsight, output_path + path, + config_path, + bench_name, + input_idx, + limit, + quick, + target, + verbose, + strict, + nvprof, + nsight, + output_path, ): benches = [] @@ -1681,7 +2553,9 @@ def generate( if limit is not None: benches = benches[:limit] - print(f"processing {len(benches)} benchmark configurations ({len(targets)} targets)") + print( + f"processing {len(benches)} benchmark configurations ({len(targets)} targets)" + ) with open(config_path, "rb") as f: config = GPUConfig(yaml.safe_load(f)) @@ -1774,9 +2648,7 @@ def generate( if verbose: print(all_stats) - stats_output_path = ( - results_dir / f"combined.stats.{profiler}.{bench_name}.csv" - ) + stats_output_path = results_dir / f"combined.stats.{profiler}.{bench_name}.csv" if output_path is not None: stats_output_path = Path(output_path) diff --git a/plot/validation/nvprof.simple_matrixmul.cycles.pdf b/plot/validation/nvprof.simple_matrixmul.cycles.pdf index 2a4091cc530bd710ece11711eee61353606692cc..032104c0d3f710d5f3dad679f91c3271735abc77 100644 GIT binary patch delta 4402 zcmZWrc{r5q7f!YsHQCBK3}Y-6vu_AxkPt%Ig)EaI%V*8JsVrleWMtpcV#&Tm6j_Qd zS+eF!wj!Z0wov^>{jTr&;`!sf&Ybr?=iKK$=XtMLxy{xX!&Z{U23Q#|-c|8EWuBW) z5^v`e9nopijgNa()H-xn@(f8nwJ#j=h-_(b_D8U-BauuXV&ZU)g%`~dDmUf_cS(rF zxX0G5^(=3kJ2#-N7D%LS_*A>r+YEz!H$?!uXlK?FfXuD!0qpj1)erBxdy{r%Ps{i~ z%x2$1;sbu~U6*}l0q+AdP8GL` zJ~Z#3aow9xF*nJ{6|+}g#6%+}dLPR0f_|B??>miCCdnHRi4!;M!U^n#Ofh|zC3p0L z!BOi7rwS)8xP{n@N2z0DAFnJ3sXSa*3^1Esv}+o9slGWs3-oQwENr-ZegABI{yQR7 zo&Hl8A$b6JCY~W*PAMZLIe6jRa!#-@m0g!>?sa#EuaB@X*klckqD&doMPYAMw2{gW zZ)WBrK+nyr>bds2cS+tpXAmJ02ipTNZ}wMoZw6abFs9eoskoySRMEgA{LUXxA*Y>>_hS)A?y;c8oKA zV=oU@La-y8`0YcQaj9vRB#m;&-JbC{@BY;nSzNV%l&pZNS#73_bqTIuEc3?MSMkVpa&=qPHl=uAT# zp??BE!?kW4(3L`1-#1Due2_)Eq~z4j0k@WsEcDiiN7|W!+{B9uBlcTRq~ zz!c}%@$iO!vqe@bgnhal%1!Qx*Bt%vrW?*oe;;>Goa5DJ-cknxr?clFCt!(rS#?=i ztvtDHymw-yx#B)>`CcG+4wv1LE@+@fL!&rGEVOJJ=;Api4^i*6@(kYY0R*MoyPC`! z=xK7Mh~r}K?Nk0}8w6o$Rep8ym&k zESUf{6|_rKb#Y4FH3&(=$Zc8P*IXy2OQ0sxE1MP9SaeFTZ@)zeT{Pb^m%*!+d?vMgv)Blclf~pIK_Z@;yk_ z-D6Kpi)Xlcy6AK-$mzraOAS_Kb46O*icQGMl99ldHx_nX8T4RY^$+wdDk^us!X9e` zDFgT>VMl2LyDs{dQpv`g%CAc24NuShbyw_^rb=J~JwpBs$x$;%`K*4smIs`bvPgD0 zeHCAk1FMqe82XjT>Ykj?ljlcSyuS$N2^TuD;+nZAM&6%Nd%H3u@~1+7p*CdfyWLhV zvrzeC|Kxlxy$d<^)>8G8KA_7YV_Y@r>$k)?H$8~30V`MX=br2&VSa{rFR8vwNf%1l zcNKo-s66$6?9|AQuNS?8gC1UQ(ZA{+X(N1upYjYB!t+prQ<1tMP!Qws*w8@f$I9$* z?abq16~MzLSXj5_8z#6G z$Eor_+5|pmZ91H8E`GNBxWv#SFUROl;!kW0I3N(oxi$oM)LNcmW?!o z8S?=UojU0GV`rwVC&x~SFZ8_dc``M40qj$CK)uh;)T36xC!Ll&T=|ukKRco%HzR&GbFx=0nw!( znRh{Bep;vfB!s?)yk8m1d2(yE@b&T4hm*bo;G03w*+;l~TyU*?rF>ZrOI{RuH}^Nc zL~hD74yLFmZ875J|N8n0;sq>optuJ)yYJN99zP{XVD3l9N<&Vd;lP0E4KtU^&T@f< z=8=&hLe%VrJTB399mV6+DNn1Zw)4t{?&(ZHszXwlRgP+5sG;^A^}f(9miRM=l}`7) zl2zM~OB@h;swi5+CgV~8p@oX~V`P(Mxx_L9 z96+&TZ3LAv87s<_NYb%z;uNWu$*ur@&Td;*h3mt$P9ueQHDbrP}`D-p`O7*71Ddv(1RoJPIc>(L}#8`X4>2OwADr&?Tuxn|0iC70Us1Yg(0OPam9Nd6G;fJ|+d z2Rn9RvaNA6>pBPCS~8;7?lXiOS0ZuSdzPja6yWTa!4~=r-x4TI@ojP($kWtJ8iw(C zncjbB>a`Bh#%_T6MO_3GIoch*Rp>|sQ}L2-(x(&V!Y0l|xsis3C;AtGO((xK?+iBD zgGw(Ov(CUR3{xb}rFSda_VUr4pS{~bD4Wq%Vb zGrH#FVQ&lVir|lat2XaL03qO`oW|mvkl}D@N#$3!=c$@@JlE?xSAHPC@EQ@4^Tv3{ z2-R!N{~eW3^L;e7KDpVf>-?&;U&f`d%szxBXu*%MNvAasKvU(DH9{2ahR;ztA{)2V z`pcQSYITB|Rum4FbrDx@w$3H8HJS9UX-{khE`t+&Icza~A7}J|lQQboOUr$oan4iL z>vz}U{8FXrud6Fy^CQ{KZ#q5U zRR5JPeH(6tnz8hGk&jX0YR=c?`G!%#Ow*8OLR%0Rv+_cTUuvf2IVVH82|GV$)pm z^P{G}KgLDBHxUw1)oAeT2k9q`!)b8(pqL$+42ct~Cv#_ul?lFts!msjoh{eA43ouX zH{$fyM~DRWDo*qPrW*S=NQnNb$GFlR8@WFd?3n0Tm)v($&NAF01MBOgqx6_NZ%?$v z+gWoU>R_IPoS}=hMQKV(LBW2c6=aPHt)qJ^XuY0$W zQ6zW#Od5E4g|0;IX%wsci>Sn-CY-*(HZ0*q7Qa7KaTV+NCD`$B^mcccUuTvE3fywU=UTq#$EEli*sCauC&&Xm#OlR!9A ze4U^OZ;C$@hle7ZC@!u(P&^KbI1a@VKnx4K4hIzB0V=H%C_D;_C7~E-vOa*vlL%1c)`7Z1LB9+X4z-nVyAQtu zhsPsX1pmb0@wgqacsv2W4aW*%>48KtzLTZd(O4`)S_=-K(X8J7Oozdfc9g(iiQChm zFi7N%SS$gv9f!oA@hsK;sDZ>_f42+#M*}2=fM<_hj?;uDd{(l33|F9;ZQ9FW3n4JVk1Xi!x z$dE`|x8)ClDAtg-tBPW^{0HeR{ZU9F<`3tVkA>VmV-)L>?DP?_JA7y)X?w>g)}HUE zfyN?%?F)iJ6LwxO90s?&V-$|S`pj(W7?0X{vGD}le|E;l%bs${$qS^a3PqSu0-YEQ SCy(&3?i&^ahie&XgZ>9M=o0Y& delta 4966 zcmZuwX*iS()P5z&*s^a8V~{P(zSzFT8m2-+St9$`xAEE=vTs?&k}Yc#gRE_+Ms|r5 zrNtgXmJlVr@pfI`ReaBnxz2OXIp;q2Irl?LV%SV&xR=L3w(L$jd7rCgr8S?!UxUV1exvSU))!cq>Mg_)*85V$RJ+zX@XM&GBUc34uo#R z+VfE`f1w&(>$3lNjz95(9ZqFwqFOTi4t(SjXo_+H2 zWdWWXq8>VOw~_X6jsiEU;?`4fPj(H}Yg({gd!kD)SGb<4*hPH>VEwooS>&?Y`E_h$ z@5ewzFZGoo3R=bW29&Evm7;t3r9wYSJKRuub45u-+WMWH`XZ} z?L8QDqux2$(ANEWPIS{THND&>V;NT&0#W3CNz%G7PYWEi&?4_{J{pjRx^u#m*4#4e zON4i@IOO4LZ&%?i=pv<-*q&9T+l{ybVXUulS_ItXjx0-`e^sYQ%MB^q2EPY|0Rn%>hirw zH#VRvpIW!Dl^qvfU-&{Ndp^4jJJqDc-_d;=TQ)3FO7s+H^j|?e(fIHWjFevAb=nso21jX~s_ZS=W&I z`j7r&ro24FRDY#+qI-n|e{yNNCnyJUKbUHsk`yu9;6C8@jl*?z>bIyR^;YFOI4QXIw#|Gc9M@3iE5 z+S$P|8exm=tUc`tnbu+sp26C>-u9P2Je2EsE)gG2OzrlY$BP&p7P>{$HRx%24+=HMDTaq>p`jQ}AYI;>Qv~NQ)$(nnzw^AjTD1gDcCov}S5?U+2tvtDTL^!5JK)?jO~iLvdubgs~dy#3eKhq#o-h#_48MVsjstMe!RozgHgNNvh^}LM|=pH=0H` zN-BVrXu=*Jy3E~+rA{9m)c`fSEUX0Uitzs%F}v2;qkyVmq;#pbAHtF z25MA!=hI+xb(f07Onx4!#F;q-!O?&`*CR_BbVVnlC<)tSs0p2-ee=5KEW& zVHa!>r_MbHr-ikJGyM02sK)ENA)Q4nDR0QnOvmf5#IQFN&>)wy z-_7JRPssp8?T%F32BRuN#%ryM|L~JGXG=ym9vi}CBbo)dL4|3afWqii6f`8A z{_wWMDDS&*$L=X>vO4p9)33`c`SThiYahP{$ej>@FEJL5P_|1>{@wyG<@mF!_O~H8 zan~bvPr?^RwU}ay+WIYT65N5UwLHl00?vXnzjh-VVrpq$6kdsGX6cUTScJda1fkx- zVr&|e@An29#)AzpLaqoR&3M}Fq-#T2T$-{-#EaRAq$Z9r^4o}<$~>&@$Z*VS>s3ir z!;Kc4U>zy--~CbCq;1<2>II`K>mNVhd4>hWUsJsjJ;nJ>mYhBgxs0N`_yNTl za91-&D|u!F*tH1k8p7Qp#X0rezqoscYge%H`>2NWn})%B&a{dOj@?m|YQFsL&yiG9l#*s` zt7Tj#f;-J1;Ac)pE-*edD}5Yy8LiW!&)Q?&OpZd0-F721u7uk4O;=J^vNEV20v{=I z@93La<)&*s61=zeIj`b3fk~G$vz&cNuL?6oaDz=BsJk+*kO z4V8~)85TEW0X)zUqG`HHmR3%kOHi6zppdS7dht-vjL0;=K!?(O57Xpy-y)mbfMLtj{c+VLiHX%3MNP5iwq$QD^$R}z=#8(6pnZ%# zHP5HI*^a~H*kTW}>&$?+1SQ8mG5I~PV2Sf9O1Oc&FDZDStbd{)10A2=Gc8e+#Fn-c zA*-#NEg-P)L4U(gRDi%SW&8RBUc2k@>%rR_0tBtq~plg=SUD{%)PpuxBOy!$)%6hg7fj9cBc& zXr&%;3lWOrj^E@T^TPmT6-ydigdhC0E!oso4-CdD9IF$tzo-ZRiso7CtR2^AzZ%dR zyBwtYw1^|1vizAZ_Ht_^3oo)Dwqq@f4;cF46RsEY4wwANo*WPwO0Eyy{M@wX)!-(u z@)Da(bQ+&qa6_E6#}k^JXiPqLm^?QSTQxStiM}~;p4K5D-{#9^pEcfC5qf;IYWrr< zka$psMtCsy>CiQ=a0@BET_o?xW2xn`0UHGc)z+3zTDJ#&w)qe3{gXHPWe(urVed=w zaj=Lj5531^oi8ynn^f*yZAA{!E(NkexB ze!BA};nq*j&d)7o>1U26x>r=xo=8%9e8=zJ2u&>b^pRl!ayDO|0F}GkX118qD!C>} z*~{r{fC5uyjXX={eQ}pI(|XaIbDBPVFH$4fzJ{oECtc&PMK_D2E(%g1lfE6C@})%; zlMB#dS)Oy{k>uVOc~g@-G2Am@%}s9O64hrT$_5{ozBGBQUrR{yTF$u>u4dMuyHSiMd>giyEOV7z)kAB={hX=4gipVbhu3Oli{xPOoboAdqn=tv zkUiFZ?x@l1zEk` z{KZMK2pXfA@Sc7?AQ+Z@Tw{Qe69^93k3dR95FGkDat6U+zav)=9R53U1Hlo$BXLX|QPYp<%GteOv!U42AqZ z_Jts@2X738V4(+vK%sERUnkz*KOH>meJBck@a|9;1brAg><=gmMgR5>3=Kmc=73>Q zL99l`(r diff --git a/plot/validation/nvprof.simple_matrixmul.exec_time_sec.pdf b/plot/validation/nvprof.simple_matrixmul.exec_time_sec.pdf index 2bc59486059501c6eb8b48947574b5f184fdb2e4..7d4049c79a513418ae1eb8407e3fc4e62feca239 100644 GIT binary patch delta 4488 zcmZuzc|4Tw*KRCf$~u+^J~ov90TiEivlaKo;A%XkH1l!6qE#6qB7D5lZ%f7&xMEbcmy%rkEP_#==-Wzf)qNhb>y`zsO3>T1;Ovi4!+JlWo-I{vd4z6wb?k>sHtn$XnkQP+OJVv(I zxF~W)xjB;a9p`G8aN@u~c#?Wk^nGiKk{aNwm3)9e${p{-gW82NUS2Bwc&5a|>4YRj zQRfHM2+y9IA7noa+nwmHxQMjM14?GTxW~plGDoyjhS7C zUE>0mxV@{4Ot?EI27kohnYUx*%j;E-EQxtn$ELLrY^!J%OL18iydZ10f+_q#3Fqj* z3aug8iwMGD_6-IZ)#}+B2lAG94rw{J!A;{a+Xg9!6u&TpP2h>{{A&8+y%Nv*g!A9} zgi?vno5`7wZNvNxT8X>O)+_-(rM5aG0aM~fXvi|puJPTf;ztPGehMgfwcYNm&la@! zqE=Dh=_^B!9D`egv>MK#PMR)9D`S;=y6UM#i51l1H}}lew=*6Jv+IGCwqBQS35w+Q z;SmzC3m+-6tq!Y(H0Ijm4I>{mQJGsDkNbOz+xlsQMkJ(q)$SD==1(d%8>r+Ie=HMv z^YJ!0YnZ`47FLx;^QC!4R6_=cpO3q0Jmsk56gezqdbZHEo)zznw6$^`U%c4NR<8Fl z1H+FWHr=~xhNJEY)Y_RfmwLM~BuI8pzedNdyZ$G||JT^W-0RBs9UuBysNd%auXU&w z_cE_u)(CIm zC~>-zjiw85N@|gU(tgl!F__w9dsgh%ThDpBlj3KeSB!DtEck*|PNjC;!1ro8YLxIJh8F z4^{EW=Z*tW%Kl1E$bpPkUutPyp`YJq2cHi=PK&U~5hcuCB~&idWX64Pw4O248sHcW zD!1mk>Y7MPz4Wiam1l^s=~u-Z%;tXN!*6#-AK~IhOGEuoGqU%ur6s4R(z;}3pZdy( zbWnI7mitzzhU^M!()(s_L@gSx3zcsexITV?c_(Yn&SwGgCb@%|j9FW&@5a@NMqpaG z2cid-clrDMtF#QYH=oUB3WnY-$tiK1x*w7ArB+=x6a(*_a=KZlpj?F)*D1cyk{&s0 zYaBj#&XS>kNbY$4kY|E*SdW&Gz_FOqYgySct7j~WY88~C{ercbQ(%|xw(Q(&_CzQ|!0=0qrd#xj^@+XT6GP_}1oQQWj8WwQ3Pv|??Er-fM$(u(gD2|0G5+RF;j z`D0NG`y^KzuY%*H6UYo_#$vT?z&*ZB7eaQhdeEWun0}HlU^| zgS^Mgx@}0Z`0dY@j9s_dAzyB5{2~tX_-RL{C0k#fLhSei%cJ{RXohoWUJ`s zN!e{7esed?VCK%g?un6YM&x2=rGH`l*5BotU&w_Q834=o575fad~%$dlR8eBQ;1Nn z)l4=EcHus^@0x_hDx)Xylk9VU^N%9=7-bJ-xpvg+Xz6#;@i9GoIm{=DjhH8fb6bi9 zbM`A#8~0PW6Dz3Q9AD$|N?$eHPze@gk|>#3^sMx%h=i;F2#q~4J{pS=(|13@$&aJ zZMDJ+rS1v#Iv367UvqacmgS?Dhd7q^bKH3|KCRSyG)>q`^D<`agFIhwm&sTb)G}r9 zqy0lkZrfdiN&U+UpHhd^-)P>JnrKyZa=Mr05hE*qPN1E@CsSACA3hd#Rs(uVe|%1E ztJ14cB`drW;|o5Z5U~gCrOlK*nQ39 zJ1Ao1;5c;Xdm6IrML>4+E~gR4keb5alQO*2lODI3tzzYt))PIp)eShlJ+n+7XYW%H z_2FU6J&Q!{(%(PFRz56e#ZIZqhDiV7k7FOkbBXP!t+K0&4~u zizLy7`7c4heS)>%4nY|BQ}7x%DF~zA5XyppSA}47eUW(xh!TO(O+{59AVE}&9w(*@ z0ndrS=+DGYL%>JkET0`cET0`PaIe%bC?N&oMB|ZQqQGvd#UV2vZWQm(N71?lut_l~ z+vDE7*iWmKxVRB(2}#ZU4MF`-^YnM{Xn4Y#ohQ)xGATRj^?7JJ%SBrO%}-OfU!`%(b~TsflQ)eow*{yH)0#ciE<9!%-8L({=fkS|ZooE`x6e ztIFm3a(W7T>z@B>5dVTlOiCu~TK|;z*Qxo!rSFoXR4%7zr}OlADK8%Sg1iC*9FvEE zs_-OG5-v*bhGQUL9UKNqC{BP}iejLM(tB`KQ557;?ghUo!RTL<6CfZ{S(Ki#?;->Y z*#`pwRU?pJRTR9aItY5G!swoAc@Xf38jKDg@vOf^q$qd>H4fUMV4yQP1U!g_fph32 za1;%rU&r`z$UzbAr-R%8M8N6u01giz+)jIVQUE*-Kxt0# zKVR+uny_K9=z#p1j#OU&vjO?~xB(dApKU(?vyt4-ne67{<>pQSup1=S5%PLm>^d7L zf(&3c*aQIBbvA!W#cuEk@N;%`1F##MSfMufP&~EBZmf-U22cd$oG*agU=RpkHy8u~ zxQ)y#?KaB4;70ZZaA**%^$~&r|Irx&ziC&2lG?*y?0zO#sxu7gA83`>U}B60a2rI7 zF#rw+?$!$eJ$1K(cl9J8D6mdX7lKOBm*hp^QCKVqg#qRCcR~oDmOg@t$C6ll))wFc zK-UcvmV{x^-;Cke{P0*LOO(I;@K{#&*)RgD{=fWKAFSCxBEczrIW9C2M*xKk_E6D8 z_QV+Mx+Ckre+Xh&Pr#;tC^P}jlJ2i0C@c=O*$;^(vXj9e*P4Q){&x~2290F(=dXZB z3>L*!2?j^ltO62?WK#x-#jOqb-|?`-wVC^0Y%M=);ZV5EQX_G=wZUCm|EB^Hk6gPA zn=mwHq0FiNLNZiA4G@`Pci8M6sH;xyL9Z5xH3z z7RDwk3WdUKt{8>F5!f*jk_|(nsB4%1FS$@?G7Pnb`6l-SK&f;)5wnA_u_Un$v zZ72ga8cVQVD#T%5UkbnqxAyp&__(v)Ay)CM4{dE|0L7V1IZt+ThobNV6b=f98(11b F{|DJI>YV@p delta 5176 zcmZWpc_36__ipUQR`wd@+P9c{XCJ#P*;SM!DOs{KLr6kywh#(e$Wk&bk|Klbg+dEb zN!Ex^(PAk@^&92)eP6u)%$ak~InQ&R_q^w95-KSTRho-}ovjeeoelM!C0TCauN?Ec z7Bn9vFw%DgKd3AUh!5OB@;$!U{ER~H%g#>U6e+^Gv{d81+6vX3veHilo)|K^1fD#Z zzetB0YO8;~e|K$G`tF5WA4+oWAbEFNC*B?&|Mca%N9C+-xQjK{T^B3e#n^&xy>BK= zewRaAp|~+r&N$q2?D^bu&`%EYp>31z2vnywip0{?R8xj*qG!Q_+YceTAjVEiULn7@ ze|LV@KEra_q(V)yg@dpLTRR$e+@-+aPL=m@P8E&gJ)h@P92Pu&97*G=Z{loE&peQ( z@S)C%*`D~g%SrEbUAFEC-F?4%JB~#BqK&-!e)k&u1YURpKN#%1GdqDDs;>_F{oN+V zW*qM5yl8b%=VHSPpS-=>JV$L+eWueMib(hTxXI}4TAcY7Z(eDE*447Nn`vs;an$Zg z-tUUMBoF`QUdeuNPZyHZyY3OCta+hJff_w@KkNEatRD_YA^Sna^EW+8rwB z-4S^)T}rX~pw%TM??A5dE5H!OFvK8^t6n9wQB<83Pl7*7oq0aa6PoqY`UAE6_K3py zEVmcsDsu!HU{0!)tE621bR*H$ z?vj$D8<(%BWl&XNIQ3;#mrrr1URK56L{{Gc@#wdK*S@kBt4jeNTfggHjy`|aK_Z$s zw0ACc!>~M@={~$K_w_&rd~ttgk4-KIM9@>)V2io5`%1zHtnTcO(c3L)_+@_X?b2vn zpd`0~N@Kp(T>iz1M77M6r^4IB!8}{t#)R>0nqM$XnZ%Icdny${%1g@9$%hp#i*3&Q zP<4QwuyHR%=fnLRC+m+KO5@)YA1Lr=5O0sV$DXwZV7w8>`pPKAAW$w!H#NUW~a?)_a2p$;js$1nE1SL@p?gnBJ}r2qWs;BH2vk7T32 zEvx8JmtiaQZG%^Toc`L~oTe??R*4szzM*7<^7gGP`oRiC9A&dBrj>V>mz<%-9M7#` z6|&#ngh6Pv4N?2vTJjax8cAABv$ExIsDQi~n3bY%j?iLEr|CCFuak2A+JI5w6k>}= zc$qqzcMmPZmgU_No2wNmwuNKnpmyA=?$?YHM}6JfPrvAkisD9H=A|~71+%FQ@U3rB zF#26RTG$k=ZxoR0Otu#t;L8`T3;y&>RGVkLEajCwxr0>%oTW70*IX_K<~ttjgsl(6 zaz|FZI+;{~PeOlVlM?sj^SG%W8-|k4UhtoCDEbVX-0&!!c4!kT9zjhfHEkYBm3z)ER{TfKg1uYoJf~{xT*cViW?+R# zeJ4vxt;~;z0tcR&8=t2a28Y43&ueR?T{=H~bQ$b;y)-gSzUZvFBT9OEQ~Ik*7Zrb@G{HKK0-TLD?Rvl(0qN(%FJ9%74!leV5 zF;AO`+e6ZBxiie4nn?(Y`8g83=d><;EF{A%~uw~qyH$gmRBmI|GSB@NBEyXl+7Pu;l^=grtmGb=c1RKV^lEXeiv z7Pfa%_-tz85VK0-ig~lp>9U6tKdZ}waILwIuJTJqkq>>X^W}G)3J!yRuM4HyW`dkX z<2)w|Y))h)RIP)KTercj8!sn1(4E^Z$+tP>v`A-AP20+}j%K=;jCn|zFNN^v^#j@C zA!fR9WmeA!cisp!DGQ&1w!eYYE|oFZKlMt`vSl&inIdoUgpKylHo*r>QfGg)M5!f~ zV~}$v63z&XOkiT^f);RjvgHdhE5A{>O^Kt#prf3N5ifZhcewSwg@nSGuvBM<3|U!U;Q1-S<9B{I?|i`47|vXM%O@IVF`PARS2(}Ks6_1aeZIhsKb`gM z*pGA(P*+&vFwgp ztYkwMtfd7HN54pGvj}d;P2O#c4&PDfkE$%5T72BtpVIIUGMJ&~>Q=~ehnrnbS}Zb+ zKAWtbHtSLS&UK8hAimT=5IF@@iZ|A)H!hf7ch|mA$pynf#m3G`j@53i2h<}sT@EJD zg$rdZB4pAQAq(?TAK%ZdE!oIm1PMChSdOr>d4hzNwhB}0ZaD&n{lSUdd!zM8FCBsx zqg{Kh2|3&z3kRy6xMg2IjXTKh>zt$z>U_?7q3eiZ|4&|MFe~|Xu8=&pph<^MsseX# z+m$i_7nn8C%?Fdm4+VIgd?oS@GRDSnxqef6)Voc`df~nIMf4-j;t8+ro!@Wp1$k8p zbCp21Iofb(;({8<^(k73;ER{<7!wvjh9SLu>zy(ja)TQwomLND$yLi;q#ms5zWp(_ zCLOIJxmBtD)ird(lCoi&m+z$T!xrk)IfgE-eg~YnM8CX^8h~3AJUyS@afHC_5K!iq z^Qxfu!giOU?OK5eW}bIvIJ9c+d$tNHn0?AkakN>~Ao?u+`V2O{Y_7e%eM;eu|5l;J z7f3nZQ8TO3hQY+i9zloR$hJ?vlkCG&Gm;gk?R!))RL0q`YtKF?Yv9r|#CX%R4r9*~#ba z-B&nxHXzAb{Ig|QfBsAc-~8mLl%g9dv$d0Hj@EcmD;(PW<$shI@gVbtBs71M4FmYa#YSY%w$6N3`xHB!+~RJ&+Gy~ z8?!CeeY)$6QwEMnzQn!B5kCtS>fD-Fu5;i2p;l5qHB{qKpwrm9_mg+HYAS8{=O%A< z+0f}QoxJ~by^nM%E)@EerE}s0=Y_FTt<#2>AE(uZa3c)K7lJ*ZqQ88z9Qvv4_a(4H zgFnOjXND_e!i{#_sy(YVCu%$C;4qTGkr5|ztaQ(=k^P2^4I2WpqcVU0d^i4}{n#D9 zy^?38n+tE})3y@z*2s0wM@aARb`|i71-)u+fp5 z9goK?(}j0goKb*y`g>HuBQu)F504KOb|+p~^lKfjcTCskNTL-L-Cm!Jtt~uS+S4Q* zww3Lr1bklXiA1sRHP?v(A(yiy$(sI$OheQ;GfLEWGw#UvH1A<<$+j{Kc+#F4!~Zc{ z=V5Y$;6ZX7CUr`(7+Ct8YKIu@7jo;HsG@Q*=LPtyojBP&d(D`txDEj5|8+~CRU9s9r7L)wnlocI6 z8=rq?ITMf7HnB_czx{RtO=$6VsLPA5KZSz!hQ3B1Sw-d+8ATK_DJz25%cdae@}kT| zSv>1j6vPli`7uOVQ54Zq7(=*~L=c=}H&Uo9%ETxoqY$#H2(wl>9EGH+iXs-OCJ3x1 zf?QVZN66|T%rvztC?rxtglVo$LLtYnqDZ>N7{UvRAc@!rq?q+91*V|z%$r~&n<_vg zps|78ApnsCU_HDKc?AJjx;NbeAQ1tq0YIVxSYv?9vMmrByaG2GdkDaK0n9MG9p`=v z0HU(ydKZAT0&w78%NoEze=S=8w>zXAnU6LjsNc;|cgvm$1Sn z03iHf17HII!U~&Z0SGI60{vaxJOIL9PUcuvtv~2!Y>-!g2kXNMgP>!60AYo}QGmF@ zAQ&L7w9is>1u)bjzz-ne5i`vRq(yrG;n%J~VBGz|9=661bBF&L4Zg>>%LkwX=syNTBvDwzSNX`$nm97FEXTib zWE{9AmyE;VkPikF4m_UKy3#gz7|+@vt1}5W7Kc^U@Kh>GfPWG21j-szB5wJ{{)q$e zc=Bq-AfC8<%OK2({7<_eo(!%{rjXVo6F~Cn)>sNJ5A46d1QP50tnv{d@-i^%EaNWY zvks9!{U7Iwj{>b$1|&h$wLTJ=HGIqGzm1Tn&}wBsGDulNh71$IwSna2G5@z9nL=HC zB_IV{p5=ezC=hEato9K>){9sbM<)OAvHJfnYbr!pqXLycT|g}l0B z2oJ6W!9$eQQbR-vbxk8AJkn?&B0*mJy2*GDkK8qu5~l#zU4DK+087>7&1^>ZWUVGL T^2Jz$k2MP*T0z0s#svL;4O$|5 diff --git a/plot/validation/nvprof.simple_matrixmul.input_id.pdf b/plot/validation/nvprof.simple_matrixmul.input_id.pdf index 7366bec45db9ec991cb6176ddf879886ecfe40ee..4e4180c397a8b72132f8cd128564140dd4297d7d 100644 GIT binary patch delta 3136 zcmZuwX;@R&7JV}06A+3)8ALG5Gu)fp+zZA|^=9s>r|&|J zdqv2%pc5y38*-N|!lLvUChGG|S}skY2Lux|B_zC`*u3lylQIrVOeC>v+kUpva#S5i z%HI7h5gYOYNn+l7_0_%k6@BS>l4)VeqePg6P``B*bPq|b3RK?V>y09?m z`T+V&r(6d3^X_=sw-q{8kanEN?RTQ;Giq01l`A2$2|;>zCt2YFx45?6si|_-nPT5H z9fsrZNEKTNm7(2kM|F*K-0BWyZ!{Ti_N-Ts62SUsju+c*r~8`_nU$ffm0_fI(wnjK zX?bxw z#^-!9Np@j^VOQ3Mgv3sp+qfsOR0kUf6OFlDHh)aZo-RqpbivtC5q71;?3@|60`2hL zajL?7g8G|1y_=i?^|kLmwLtAas5{NJ4m~r!nSA3y z8k7&qyfeQ2g8(fkJ5$+tKWA?cExprA96ZIGq_auvI!?h6x~tf_Q>uyE{KMxOzp-N3 zw~&{02k(Jd$~5y7yoE5fd_0ZaI~kbW;e75}e>+qo$9V2vBRTnnMP&33G4f(_9qClE zhsua-`pk4wM&v$3Z>hO&^!d%Bao*SfVCm1;+6N`ml%PUE4Q#dl<)E00-<7`&3Gg%0 z5%{Fva&;xgR@gk#KPQcVml6IwJB!S`Z*3ZBtT2?He&UG~R*5oyF?TSv-oMwQo3K&x zt|q?FBIIy&lJSOQ+QkP+v$dfHM}7&ceQnS(_Q{K>nK~4G3=ur{pK0e~)|anE!Eri= zI6ha(8uRKJ?}X~MF^o=AT?Wn3<;?@FG3)H-i#xs_&tlHz-!~2T$?vt`f1On zHyJ_r750(*i#>>HK)3sLp6X+zv0{h0Ra6Fw*ve22-z)X8BTd=1DuRNptv(Y_2XM%;^Y4ul7_6)xyd`Hyzsp zt1}kSY8G(ERIr-AHOZhXFF{u$sw|9_FO&Kz1Q~gzT)1blIqX`#`u^y+GSjPmT6ue; zet(he(cIbd*21;v^cLl|1qZsK5Vc8_e%WzptNqVYO$WR5=ro%Jn%l@z;iIEfbJT%V zlu>1k?R|wCHF0gHZoFy~Us+Th+WJMG(y?XKzxRwnm(Ni>Y(!R3pcKQ0X6y6*36tJt zCvpxcyu0W#Rzw?s8FjbwikoXu-2I^QxDq|Z`Sr6C_C=*~in$kZTKfmpj=KRVX~sJK z{mXMU4}#?<#(kc;dbta7_nfLlNtUwahZ+sv^6cI5-zQuMIhNv`B%R`|G+is&8fBMI zGuxQXTGXDQxQ?pdt-F5EL5glxi9U4l0z#AT)bZ?+VVuQGen2a>t5(@CcT9>LX|ihJ z7bYGZDf0{5n0H%P8ORgMiqo~KIUDZXwW4Tu-S&~NcED3HXIbrb8%q(A?oXN+PxV7h z&l)O9Fr_2T?%jh~RK{f)O25}qN*X%R!vyS4yqx=}S~@8Z(7yEZaU0>X0M@eFE97*k z`ImSh;W}}G(Fslb0fA_@H{sS-&QPDANUHTVmcodmiSrH=@fb6_#ayen_Hb9wN0C@k z$VAuXn+FeH*f4*rd2EcFzdYLJ9clmMh`!b8VCyT&izcOV%EmsA>cmA1Jxfurz4dj) zcOxnZ>kp}>g%P*v|8pk+|Kxot0%%Gk`;{3!uh;L3j*(S&%9AJTnTeNEH7q7*XH>tF zM7<&Be5`K|8GjvICud*aC%;D|GhN!^<}F8wfZ=`z^TA&k9`AIjRiTYAimO6U^H_6V z53Q{Es66Y%gbXp+tef^M^4Zyo?K>{|0uk=9!aZx8%IVw75-dsGr%}g$L9Eq{ZM@|w zYmSaDp;-p2)@CxVZ|a}PQHWLHIQB|db$B0}2xn$v9BQog(SL0@FW#}$Lklc z@b6I(Q!dWc7lIl?P3nDpuW!H2uKdvLy*u0|nmC_d*#N$1u62rYS6hDv5Z(>Le>JF*AH;<`>A`2N%O*;CCM+aIexq=&q`x2|sU zNHq9l(CO%?V1LTzQ$zq+6vScx%$EZZHoQF|DoNM{7R;ELDS=v|1{PdqbXdemOaJI# zjHaa#7Qo^_5QISpB7y~$<_-u1z!>23!kom2emQ`UAq4Sb1U?>&0|8zhjDz`NVVohJ zhsR9>f58eQEtCWy5H5f$)DRqzmj~i-upv(@hy!69FNTE)d|Ds~!J}ng2O0@%PYx WFa*N|Q`Xj^Fcv4Ore@`2E&5;TE63yj delta 3404 zcmZvbc|2768^^DRW~9WF$TrsDTF&e<6vl)Kg*%q)NhoXfY}3e+u^VgY%2FZSL``Vy z*%A{eAyRS?GK5iCF8$SS=6C)$uk-qxXZbwu@AK_QqVWRdwnU|-uzcGDvl3;7(c`!Bv3hK@#f zVd{8TpC5|JXfidULRTMeb$5St-I`f157>CPc4c&A(|UbP(RktOP<~F|-l6Ea5XDV$yEVHqU%0a3Du9rW>`xx4tgD4w1o- zzmn(96oinq=N(Vj%JEJd@7w>l7b0_%S=}J_fwqNOzgg(&UKKJ{OUV!tcqQXEu(CC@ zzOdeCo#FH9L4foy#7N^*Ghdk5qg`$Ol>K;CSTugtM9x@c|KpH@YD>yf-z1ei5ch=T zDc`-|>s{SWEH#NkI8>B)O` zDH;zM4{f`_RHdo^J4aOy?#bjIlN1NZ?L-SD==63QHjtObL8Ck2>Vrcn=~N6wI$NDU z7qNFaYV~KiRe=(?ER(W&x(He zEHLRCyPTjWTev3Zm{=JSH5jM_{ zEUis{?tOi%xTs+8T1tfg4Qdoh^__my;l7eaWQUbf1eTp&HBZJ2&A(yNv=SaM{j)#=EOAhAB9e?t$Zs7Y4D79?6t zg(V+a^*%k4xBy94BZvms`4nH^?1!aVb&m75pqew&UzQ| zAT{I#E|c{3`SWvZZ#$SSIO~41NA&l-R1%yo!nc2wb>!b^53W-h`zs~k- zGA2%5r(1z?lES5U@mVHhhJmw!XQbImir$5DFxb}zK;}&YTDtcUM@fx4DDO#r2F^ak z-80#5k%wJZlS>L-1W_n~g>Dd@7oTi|-`ibQd)lcaKZ0SPY$DW5?FxA`g|g*`ZRKq` z-3QpyHI_cmV1Cn3xhy*cRXaYyWncB^_Y(O@UL&l2Kj>|By9V-hLe6%AqCNO4x|roG zt|3b>CRM1~Lu!0!kmh4gPKFT!QB~fp#@AxKFAf#;U(Mm~KH&YJpV#31tgC35w^r}< z#F)cowFX61XCV;wTeTNG<6S!g@5>?Gpu6yH&(TpGdwb7o|6MJ+A5HV}heZ$uDUgvq z{bwEteBtCFfhT;lXqB;?QY+^h&ga(#>7|t#^>T7n>CqoN}^_T<$8U|+X-amm!@IEqHA)^cWvT+T=`_SHeo@X zO(9ffFYDWnI_|0HE%jEoA)?h+H+3{KWv6@I>7Z8GLbla`wxAHGb`an#Z$7{zIswy< z@1?N@OK1xPgAe#ihH_`~SaolNJ9Nvwo|wZzqe6qLr~XumE3#^T+C4hspzeNen0}~k zh1^?fVLjseef&#dzvS}LEZC7`9t`DeeNwV|-n6Iw$G>!9**{uPp=#SC+t$LPu;`!1 z-^9;`>F$+@d)CoB8r(V9nzo{5ZZ0V8iG#G?gv3j!hR-_+-E2miyb_A-Y=%Y{RoH&9 zl$79AocxiW=pHpd{fj7DJ{xmKI@;P{M3o#eH!Rh#VsEM)`BdtYSt?U`fKsk$(+XX* zAALOh-q&^$mH2+7tfncnZu50$(y-kTPY>rw(>$$73B@ac8mNrXm!H13Uigr=dsu$z zLtap0e0L+#(|CUl+1*N0T4U>NP82KFUMQkz&JJ|G+3Aqf0@2OMNo?`xx$KNvllAw= zPEND@B~HRr%sdYgcvHs{)6nTN4gCu!_Un8*_+%z>WHC)8H^8Cn{z6H@k0^hS5Y@FG zw_Y(Dt@^ZrV9c@h07M&ewc<7arx!p6s7>vQx~t>_>RGN|Al?GBEP!PRaZ}i zN2hzvR|Wpj7DP(D(4v^yw3Q}Oe2WQN+!*@VohoJfSFZKRtw5W&!Ny3OuJ+dCr&Arz6TfZ^U-wd^odg}>Lm(0OY7dYwmIt6+ z(IafXpXrGU05~`r0ib9jdZzAJ3|1D{d1QTL!M{Fs{-L=wBv%{;gN1X(VXz3UTnrY4 zp`{Y=yjU!j)=f|(W6>DSJR}x@=hDy!6qg1dxq70oSk5>AfyHzB1rT@)Zl@mkw_&%- zAvnzdNFqu{@s|Ig{*(O53Rcs%!d0&p(sZ~%?w92*Yc(VV8?Z~*nYw%s29;BW+vYtP_V z1edmbRk$?-o|7OPf%-qWXbk7-!4X&s?u&p%Avp2F(KxQXg>SC~Cw@2vhrrT83=F|| N915(cXmG|5{2yCkZ{z>~ diff --git a/plot/validation/nvprof.simple_matrixmul.instructions.pdf b/plot/validation/nvprof.simple_matrixmul.instructions.pdf index e1a7dab1fdb7191db79c416cf7628b31b6d2f1d1..aa71fdf6c97a4e9d14986ccf01299c44a562cdcf 100644 GIT binary patch delta 3554 zcmZWqc{r4N8~$V;XK)&1H^>s2dEc3LHfveNzC_j#Stnc87-c?5_H{amW)vx8Dj_5) zSqpJc)+3QRvPM+r9ADqab)Bos@1OUX`+c6@b3gZUzt=3Ja^)p+RWrC~Zq|_#c+r-1 z=jl=TJzOql_p^=ecRXbc{0y2|5?c@FtX0Njno*a%!dCz4-mlZSP`Wp^;f@vHPVza|KB0Y9 zlR1ZS3GTIJPjXA!lN&>An0P z`&re@lWp!^gN^@L4i^-VKKL8l9j10xqLH9+Ep*1owzakEgNHbJT@aN*=}QKjTxa*A z@TXv3+N$T8s8pwe372~B)jGp11HK)l)vOdVk0-d?8(N=O=&>zdp)IV89ba55`)i&> zEpDZv&OuHYd`@fEG6!cX?wW!2SBq%MxqP|#P%@u=WJu~_cACZJ>8+TrVd<|1oXmq_ zR|jLSto`P4dao+Zu30plRY2A6EK@~YtR8P@{CuPLgRT8n(9Xy`f8E!QXG@pTq_T9= z*V?s1uUNqLKu}t=8Z8Jm7()(HanFs>eB>UE}8y@H0zCi!gia9n#HA1`e?D z1Sj*5SMy)VoQ@!uJ#d_|+;{$$_3#TlGgBLlpWg+qODCCV2)YgOZOV&O$lb7(>A5}d z!mnB7)gL6dPbN|YzC6P)hd1J5lb#PuPh}9#K4hH+x0HS6o~Z z>P8rh_KWc9yc2fOx{8v#5sTE0|9LqFha2CkQ;!)udawCp%;nDTEM3Dm^95ET99{K( zQEs9;!Ao+O6*=Kmddj{1jNj!%1_Mw}cB_=rurN=>j%vb-8(ZNHBseoV^MbF7sgZUC z3zJyoRO=a$_=zTG`QWA;BoJ<=R3;EBk(5V5w8UkmM|pLb;92|IA`wEht?y-Cl$*4^$WKAdPB}yp74_3GrmiZl-&Gho^rf)k0O0Pal${lK3^Hw?S zHoc;A=F{d8{lOAFg|R30igjDE7F9}f%xL>B%$y!dagH*rmo6YV(Mt}##y#e#e2`zJ zvm~LV=|M}vZ}3?8?5cpQHt zT-|}r|3m9;wS-{!+O>r=V-2AHZxe+s^z8M!?k9R!;OiT+; z;qAXrqs(h2X^rZPbJe3p@#PlI%%+)|L&t7QC#nV(ZpkRa;h2$e*SkW4Cxn*j4thj? zFbuqNu`{gh^Y^z|W~b~eVqJ5|<0WqjZdBQL44P>@5FIgz&$%@A7#xJg4Sl`qvUn-d zJ;HKU^S2zF$u{RUQ_T^rZRbp^_FFe-nGf3~b{?W#d8K1jbke+5(w`w$!G%I~hRjzY zY%eXvSLjJ3CUj{C*2v>WhW@>K$l;v& zu6pDct;?#EWxD9VTgH;f_P@Xmq6PvZ1yb!UChtqAFsYGucoI>zn5%6&h1_t7XKid{ zhwk;&3oh1>H+&6h|r zzZ(cX{2{SN0pP`>EVZK%N3^a?N?MCNJaw7g7boF@w zjf{S~`ippnaOKEEY-Z`UmUolbS}sgOFT`}xj#>UG$7*<9*m&QUV1Qaav~k4-oxS4Z z1!cR$Ou81^7K_mDWz4%3jlOQ5P)`zn#5g}3XedU77u;R*FS#Kus?_CV5kfuChp`2f z_wOy6{c_*uQSfCS@}oCA4e2xc>$7OZlqrZHDrZPiR_X^8>7Uoud%J^|=^-&H9We<< zwEHIO5ZtmQ=5wA`rvH)mulrS~`s1{kxmVkz7;dqw#ge{={`#TcJKA1`tO)-}^$qaP zmQoRMR-DS)_~%o(wt2Uss&HUIy#e>QoTuW8!%1dgRtsysBx0K z6W7J&n4jJtz+2ah_Kc=@DO^s~3)h^@D>S92p8e;YYWG>+mXtZ~_e57rT*hEuh2HII zJ-kkYzoOnXLX_}67o(K85ucQ}k6-^*5U??rOj>;EDLcPuyeb%!gKK!7IGg96neIo4 z+|v;`uZSIDL^yqP!Acd5FZ3>-pe0hSmjP)com__MH|Fc5XMH>?XC6D_;F=P^WN1#R zdgi2!q{V@y=+Lg)D~ktgPK3F=TwaCLE&9c;e{%@$B{Mu2WXAO_{p?FUj$3OHOa)Id zN>N<5gcFsQ9FE8-ner=bcyHC76wTD@Fkd|W)OcS20nz^=3~j)Qkhju+&hxXp2IpsC zv#N}iHp74y<^zqRw@JM7ea8)3@u{QMtsS(Lg7-ZmRae|a`m4Tifp;u4=>sqs9*~3w zYFA)y;M^EIKvxt}qFEoY4CMs{CfUe10R$OY0jf!~)wtkmvv;7Tk#fkyy-5 zL_uP)J5dl43++U~NGxtA3PED=+Yt;MN|z$)z;`G>-z#8rZ=#M65*>qtcAC|e6PbKm zAl<~^up)Q>hBk`u^Nyl~g&KQD`61PfwJ`t&3xXgXgdh?~FE_9X#NjdQmfHsziiCX+ zKnTFIsqMx{9CMS4&-ct!xJI8jIoIjjtGG;6%`X(GVK3Z9Z~iG delta 4015 zcmZWoc{r5o8%~Wd*(X`cltb3$`)1~w#hPuBH7ffWGO|^c3N_isu@r+OlqDnzgCs*T z4YDUhI0;cmWhqg9)b%^psd@kSuKT*L`+J}JdG6;;isG1|a+GCqQ0z?nB_9egy6&gD zp0(yCV=uFMI%5!pbt3nVr|K$lMJ4(882Bi+Ih1%~;F>jSAb|big@8UPjN_{D;se%4VzmF>~$f2*W2UPVNACf-? zI$VMp?+P*q)WD@VihpmYcc&xA#IDp)Yyux#AFS?|$L%C=AQakEx8J-yV{t0nggSb# zj!Qm@Au&|E5=qM2X4uUsZ<5UN(kH2{)gH-=g1vC`Tab`i8w&Nbz*7#MvzoVb?mw~<;(cfz6Wi~l#!<+9erQUISn-Ut@mX& z`1}YMt-J4>hfc~1%!z;$lV!3=-^q8X78$E}j#h1(`KQzT$s%fA-;<&(^Mc{Vm3VDR%D=#!OJ?VYrD&hu^QQ}fYo@ZR&BdOgC67v;x2 zpXJKmZ+&>goRW6CSs{X`R_f~JdeQzW=JmJf(&hRl2FjC!Fq6y>Dhx-wGoP}SSh;*b zx$|Li^~U|U6NV#kNdZWk#Og?;BxmNm4Yg0cO@XIOv!ZI+ocy=1z|}s11iCZNCx}pitIxVr4NG+TTq~Oo%vYsXkl1PsQj;1&jY~3Gfy@%!z82 zpv!w;J}_)%w2qc>!|i#8n?{@Z*SOC(4Y}NE3fBkIJMV~NIrnUrx#G^HD>8JWZCQFQ zFod%FtN3soiD}U{F?=wDh{0tcx zw63-o#uxs@c@EU9ykr&f7Pg?fb?5B|)!S7=cgn?>Q+tn481;-tE48(czD<4$+M5hf zqW5%ORUwJS$(%hT6#jrJKQ_01xqsCA=rp~-X?Vn8)ZP^~L~lr64hl?wiD|;__1gwK zJCUmGtSel*a_TA-`4r@moylcdN5bV`H<;qZ!W7eSpgCEw_g%u+J8s?g37JPjLRyRr z4~5N#lT?EZY|P!#MbeW@GaC&*8RaQcU`$}NcQ5Le@jkTR-K0c*`8NHl3ItVjYpV!3 zbuB81lv!Qc(z_R9Kyo-1{RlR8wKBJb;3g!jZIt%k?u-uli!+au!alT=etxjST&)Hjt zDy7e|xV6LiGGgu?G*3C1ql=pr#*3E-YoVS6?92)+HA?xECKF+8Qb-;4F=|%GQ*ocp zuXejI-Fg@o<`EgcW>G`j_?sp5V99w>;6w0Jup`Rsju#IdOsUR; z@dt}(yiAeC*s8|wIcqL*+P$oqvZLm2*Vqy_%zpT&iJ->IcH=xk8v*>IemjmELj_^p5@Ak=5Xe_X6>DwQPH{Z7K6SCC)uS(29Os zPO#lg%xWrERlJpaB|Uy8S5Q=1M`Y+d4x7KY$A?%G{(RHHz#2E>pE+`xE-j*2!DTE2DA~dROdZN%14lVxQ6fQmXm7nM1%i(+zgi|_UBiGZUA4or{l13TU?eYEL zCy~yrNavNgX>TEWRs6lU^p5t)Hf!^itT42(MV6_Li}>K1PW|J_8I9TVcOKib3<|d$ z*3pXTsfsn=hE=!(WW^QR=Nt@S#cZV$cf5O&L&hg3aZ?Jelg3Uf<;xdilH-@-yI^n6 zo*Y!@Dj8|{P~D{7n;|eYTPuIZ^<(@}ya4PiniZXjw3Yhs0OZtD;}(pL-^k>}L#_~3 zAqQQYRw+FflLmX1YqjN|F_f`n7+*x=z>l-!e zRk7PkWVs@&sC;SprLanJqA+i&Ny5w9hre!jhr+S(|XRs zH+Z1lP1WI*pxBkuZn3pq4!C}WwS6k@eK`FQhz82(tr~VxB>4&<(+=9pBU4dvyhHai z)YBbSd)tx*QxC+S_-te4Az@Tmlx`n$YCnJ5TeJ61_Tp2|JPw-s(ADBCp%q~h&gqcY zrn%cJZQsT5Vd}t*p+SnOmU@*<=!rw>Rg+%>vd^Con0Y7l=|yKtFYyG{z*Xpe*W}G^ zfkL9`E{b-MLujp5JKxdTQ>P$}C*E3bq7HMPOl+uFOu2MRvbs}#@4oqKH-`llnv6-F z?c}but`A(SCa^TickRt-Z0Rpt_ZcZM>$Y7zX}t2c_S>}2)%Y8}(ki03Hv zZsEg8Ud>>>_C-GR8hF3=JS1efJi-Xn>2QeXwR;>E>d>ZJ`?aslC{t&2^5Uuo^Hxz! zccQGed2z4gl%ttPChc60Q>o>(83C&s-_E2SxHii*@=-B(%l(*LVa#oE6?G?{M7fzU z>X-_oiH4|HM)tVz+W9l*mgXwnHl{{@ygYZgVr4of5BcMA6r)BitYsPe0PhBvkY4b?J&=`nFcjg)6qa})^(L%%!bi8;n2kkF$ z1Z`MinAR$RptVU}q}50wXm(OpXa-UU+IJ~3?Y$I&etx$a2hDvqg2vjjN~7;VY{OvC zv_{^26v)CTQ5NKWHJ}h0mi7eSdSSGvFD`r4ulX&*CgmV3-lw>@TrdIks3@>(;E8mI z^45M)%5{-;(GuZ8m#JI<$1C>}R8M;toLAvYyRXEP_F(U+CQJIB3=`e6?Hx&xydOd} zS`#kupTO5iCe4bK0B`(R0!sNg#cyW7Tm^(lAtkacM#=JcrnGjWgzmDiMV@-&fU42R z-23X&i`UR_gnP2rqJc$w^qt#qXH=s$o(j$K zFV@DUGGyOr>q->WvKi7{$k-3RG&*4g5S{@%MoA?JId)YpurdB4cZ*?iQ$ z{?7O_6{Su(7Js^YQpiVObv@Ye#qxK70K4FSXhPx$dZV^F_>S-8;QaGLOw_g^b7eSF}O_x|5=Wvg;V2$|1^UA^%E@yr?vx*3PNIk z7DLiIa9P_q0GgME_QtnVL!1jBfDjtM(mFJZ$vA*WKmwZw>5GK_1RxgLtnep?W7FXP zV8ik6IvlVu4>k^ur-f+}w-Eq5jiD(^!4udt7=Xxj7n(r8|9KZ03vA5c_nKG`^QR6# z6VU7&@mH^u|1=9=K=eiwzg-Am(D*-_1u$4P(*OpC-3agZf3X0bor4G)S^TTpsK0Tr zMBu;VPYyz}yM}N8TPGW-ZG^pf{!If_hz(Rkz1_21m&f(d10R++DpZfuV5SE?86WNw^V+ihSMbIGdXPzJy|GVo=(+~*# eIerj=*k3RXga|Zk9sM0R3<$wxW%bQS@c#iIb6Z0I diff --git a/plot/validation/nvprof.simple_matrixmul.l1_accesses.pdf b/plot/validation/nvprof.simple_matrixmul.l1_accesses.pdf index 7f0090eb8077d2f21df8c8e4c3ed532521adcd2f..13d5469a3cd40395e646c24df4412ce2ac13187a 100644 GIT binary patch delta 4244 zcmZWo2{=@38~$x!qDhtvgF*H%XErm8HA_iILUv({eTx)2c4JqAC`+F~k~Kml%Ag`; zO=Q=GBvDk#KkB;vzt6nqy3TpdeeUOZ?)!P)Q=7`NPGPCdX94YwQU&htwT{%3(;{us zvbu-nxV%Ly7LhODOa}!!#c_5Pf!8%_JC#3UYWC3n~$> zl>4lC?z(z)W~FCl){L|^*JI6Zrn-gQblseQULKLC=>k12OH4X}=fC!RU7yo_blEIX zyA@t|nKY`nSX^glM&k7|p=8)ax_dUPd43P>D6ui4Hg}1;R z`9!wRHtvU>583zfWQN4>I=&+R-1z0S`C}urCCu#v>7SMPtgeNIyfpRkYBUS%rm+{S z%c4cNFk`vv{;H~%1;mK=t}u!#oc*Dv($2))X&(>9yvn%VWg@B>F$HB@Z%C7V{)yui~CkV{yO0saZgEF8NEe-FA7n8bYOr%xePwP=iN`;F0t%%tv zj}7coo|Bc*gYtNenc7gc!#%p;XT(9@UpxUvIK{KseDuFsH z5e2i4kMuBng76U0iy>dq7<8ir+|%O=Z?=8fqC;Z8NVzxchaQgWWjqCq=1N?@JIQEm z>(}C_P-k^x@F#UU*4?~hcQ>D?Esxe58!~iB(BSBB_dzX+#KS%cuzTKj*QOOTAL5tK zvHivXhy$6yz2^l3ZaSB>x5r-@Wc?`62kM7g?kzi>J7fA#@}97Shy{9Ffj!HzPiX1FlEiqI1agzJ0>T1O*oMUQH`_trt0NSQejIS1_Y_sRKp7lQyCv(EAF_+(+l+U<99#uAD5z6#gGk3 z9H>##uU17 z0`+whg)kNMwxPAi%VTZ-c`o2H2_QU@`TAuIXtc;Fr{1tTqxKV%juX|xg5GcV{Q87W z(`eK;)^Rkmu#pzOTl??soP^lDc6g@=NbVE#g~`JS`ahW?l8G^(?5VO+u~Ed z(3|9CypoCxTk2u0_|bOTcrdtu@2en9g8TmU($y`>1b48qZ)d5cB>RV33VDJr6gRKZ zK;BWjSfWkGwH(|TZ?4(-mTFp6vm0K_{G0IEsnBP?IlKi5ANL%JOUIFJ!c(QyZ#M{D zRy+^!-Wq9`7mmujs4u!4hMJs7UL9;QYrbnDxGXm)D=*RGdp25CQH& zEjvqxc#=9pv`c$NZR2Y`Xmzr2bt2V_KGloKXIG}spX|HPxO-wbz#GpuwuLEPjeOY^ z*FTp#q#kOZ*P3O@QGA<-&5geeaT-l}D~C$f^%Lr^7o+r@ROXSly6ek=%5kz7FCz>YJQsk?i7^KJ9TYJ>=`-fTJ6uAA{TI zRWo|2R&@4)^n>><@qLM-KCWI~Og=jQ(vX4~v@~W+F2Y?-QM^wf;SM3DXh}cyS|f3s zp_ZSaHb5}Xju(Giu{w?h%G%d*t_SsKxt-N|T@Vy!usmq(7Dj#@UncEzL-DT9g398% zdb{*s7`ia~V7^u!*^(#cE~Qmjbr?H0IpiRe3VpW!o7+hEl#xPKk0QS2m_%vv(UAD8 zstNb_wdu-JHG=1)$2CNDop5Y`Mta!US1iW*EUD~{AvKqgX-K4EMT*L#<%alFN)q!2FH&D^ZJa@6LunTx24r1t+8DhnC$hAiaxtVG z!S>oumw3r3mnNAeUJyy(?Y8cje=j4UPy=LiJbjjeJX29gcCuZuahNaDpHH&W&l`I7 zZf-2QOhDGE);=^Jw5@iSksD0(1C6p^v5C zcXUp=zML`IQcZw8+Bp19)l(#$NUc{%9db3~%OnfV?S6)E#S zo3N>qC-%dLIuAl$KumIHWs+plUy3@R^u;cTw&`;gi{27=gdom0Cs@@V@|Bjw6t`nd zzE9!R*cv%d&{ZwAS;+qMp26d{{V>VAMSEN?2e#x5AC$9AJXMGZbvIIJ;=I9?dTR88 z4VWT$LtM_x!|+tyl`EB%yskeqJhjswM#UI$|BBR?+?GkLj5Gp zPaP4Fp%TQKs8JI9R5po7suY5sN|nf<7BHSz1ev-h$zNZK&|s;Ll7{USS0#Cdx+7GB zJ;M-K975II(}R2lfx{wHbrEV<2=y-7hywut7)bpt%ry>^Z7zy{Kq4$4swDrAAOsq@ z{en>SK%jv?5gCC-{fRsgX!M`R8-d3BiF^=f?4QWD{xd;|lLerP>*_M@&bly800&?& z1b`Z)dkh4&D{n7^9|H0_05Dh_o(aP-7&~}4*21CR_W|Kc%T7~UCo zs)?Q?8-OQJ1N3A-fVnRkVD5n;Y>(z&^#BwG`@7!XS}=gxP974CV%CC2<9~Yt|4K%p zvFM#fkZ7FR&LJStcr_*-2D!Z<{}PDAU~tSB0ngaX|M3|884HU>{tf=YZ~&7NEP=p; z;V}4}Q$XU>P&;=CsfNU={nhw)4>c5s!ZCq33^NgrXX4?JOvea`2MCM!cRB`;jBAG&0EuNt`%B>V{23tOU*K;HkJ`D% z07DBih9xj#c+5`606<{wfda4!I}ZqeQNuG~*zLci#6X4(ARwvJ28IwcMhzk(V_;?o F`7bj`*T(<= delta 4673 zcmZuwc|4Ts+fV)K9AnFxWnvJT!aTFjSd)DXF_yAr8M0HzHnL~u(a2u*oy_| zL`5V*(J3J*iug^Z_w&9d-utiXb3Nbd{w~-3Y{W9vB{0?HFi|e*`a&By+lFgz+4}&o znunaBZxzyw#mpQk`(sSlA*zpkv?RD3CzO?KhGqX4wt6~gF)E>L;;noIALc2TxGb=A zar@Kz`RvD)wfUjM-P!EF#VCKZI{-EAx7Co|`<-{9; znrCaaez-yU4#KkpVPkNNg7eL>32m%fUz>TVyf72;<^3R~r}EpkN{*G8?d|QJ4>QBN zIW|h!JL^7yMwt?CFJ1ch<n54Z*;#~)wzfb|`h(eWHBYH^Z$s|PKEXTAVJ$_c~j-BrSid{YQwPd-N+u>wxzI$iIU-G|Li%#gMp$()jOSlLq=wwM$V>mzXNBk$ET8q1R6G?jcXgNfimbD4E0C%9$%hh>jvwW@p z4%^ioMQyU&kuK-ad(&IgnBaIscuAbpc428c0~5L7MY)BEIo_Vd^ObZ?G!X-TXlWL; z%PK4-BYC*fV;gDMz) z#vz;?hbeN{2H?dWDmulJAXoZx-Ssn&!th47oE|y3&i>DGo6(nfUW{czQ&|sQ)ryg| zKl#uiRaJ|AZ!-=zm-b!_HEl8{;QlBv&T?yV6VlL0@?O1hpK3iqnu34rb}@^ubv|wI zj&hUVwT{b%(G)N1(Q4zf8PS+>eS4^3AuA)HVUE2oIldw9vc?5(hvR)Z;TLW&W`yg6 zkA}D&>t$5DRI!#q#t10~33m%B|5;t+3->j#DM`T1)zi9dg~ZU4afKJto)tSu7FGwS zSx4J^D0iasY7)?8oA2+E>7110BjqMVCe1+_7d}aJcowg;lU~^Alx=)B1~ z^QaG54;dD;gs+~tYtAr>NK)*@4&59fDU{nn{n{;q)rCGqYEc{uT&Y$J@;{YzqW{s> zBOKLFcy2%JEG~<8OEE26sCmP3|NNOJ{;TXyPgZ%A+NSE*vOp3$A^s5d5Hx-+sE%9m zZ4PtpCZnP4ro@sc;U1N-1WL%;m;8&t8$4IE%fNYe5@nz@< zbJo(+pR;JMKDZ{gJEvoj+G>7Y}1 zxHD5Dg!8PRs)uAQBcdSTQBaai=5;|$dfFNbT-JgrPhA->Hswza%Q`D}(R62UpjH-9 z>)~1J(OA)hJs6u*CZD=%;L|f8Zl-RL`Q&W#jIDd>ySRXA1A)x)wTo^OKfMlYh-4rkgi!{RUX%A(9eVrhj< zLCi7jp|eKlq_1RlYch$efJ!gWZfJgY*1Vy)(=v(@O20EXk`*lYq`NnjV259Fd943V zP(>?GmN~E5bd#)^{U|R-nkzmJciJCn2sb)+VdRAPQ|i1AMdV%`X*U13yTMvu%&B{& zgQAL8{hHXGjgm0h?^S#*xG}p={Byk>I$roQ^a(XwAu2I~?T&MrWrs`tiAdgVolTl} zVhYz0p1}yV>t|C+>+I@8dGwzfOIQ6U$ta^P(siYo-0Abo5`30N{kCha>Se!rN(CtC zXhqjvTm;45J#f?~-lSD4@LG8wtsF9atNx3d@krm`(N?9@X=di{WtB{UA#+fXx=NWG zrzU?+E$X`<6 zn3#}U?yBPYqxZ{1dbe#s>gOX?xSQFmcMyr0BWaYJnz2*Ct{WD^RlKcJ3A5mlwpaqvbzPG{`gnGBP1Q6#8%_~aS`d6=0gT2go_5$&kN|W*psK|Yvp68 zmgFZ-tu&LkHmstA6<>J^SDhxr*myd$Q{3yOW%@X7|IHw18x~(Orx}Le%-frDDQHJ z#J^58ufbS0REEA~c7W*x z=a9OUDj5#;DL3HugjRpc!f=}2W6IU^m0E9Qh`raf+=jQln=%@r^d;z@$8{$?Hm=DJ zn(8XUs4I#Z=M}$CJFB%A)WS=hCw$-RN}KTvmqG51Ddw|7ucZ~L;_ zg!1nXp>T15?Wwf;Wo~tS$2goC1xKdM#2$Ad!rSv?E@pLv1rUp6O2)}~7bsS==2^qW z`(v87A|Aa(!oI!l{hCo_Zs}e-Pj<{+w~U_e#J}DIwEDip(qr`RW%QZ!wN5sW#KVfsB*7?PSC-R{%Wx7(Rk)cHHAe0vRDbmjq z5{O%|4xk5}OKDm9({kY{s)Zmk)4bN#&yt*T0x$5OX7%KC+i{AnOG#MeG2WZ5 zo9|oznux8hn44w4axz40DJTtG{SY=Qtr%)GzuL)*_{g(q{Jk1sdQkn*My}#rXt%D< zn!&d3NAjoPh@PkU7p<+WD4!QfcHYxu{Xd?&wrM!jej$p}VjPwFDT_J1n?6^ZdvF)4 zSM>u*o~twynGSBLxVB0t77y)n-}$7UHbHm}BZRxf>5{;E3j3N&C9`VSfj;A!rxAK1WRAu5{)2FNI&zR1ow)HyGIid&1F~X#ydrVqz{fAqh#_9 z9IN6gwOcIUuuiT`l=C8M1;`wx)}CgjHi};EBGH7i^i_QZ22&$hmxC4i5<}T9;ah}K zm-wrIIp1!M(?xeG<~|?`;N1G9l*q?1r%9$cA}Sq%s$aQ`3l%!X<#b*^Pb0x0uPuh638(VK_W_syKIeKbs ztdZ=@p7TBhT~(6n7>dW**0zFu`r#FBSAD+7KQo+!SjuPnV|5=h*DKxKFxN8>clLN^ zPDjL&da$aWv!o!QAAQ~&zh*w%w^R`^_Ox;`C}1Q?VO#ed>&8h*i&gc^zxCE)5egb6 z30`;Ka=LSD?FL#u`}%|9%7wrgTBE4+QBf=ghVXR>fMIbkgrf`D`3el-;o{*4!(m|v z6$l;+1DQ~ohapHL21e6I-9D`6dDRDoBEkTeF$6(|p@E+j4B-qzBY#ycFf{5{z`M6dH*H z2uLgnz<@o<`eZDgLG$y#JYbN23M3W)7;^UZF#B*g48zSH91i=_%I~u{EEWKR&JYh_ zu~@M2j06SyQ~d83G3fyjv9_Z|j^+KU7JpZfqb9)tTiOZk^s08Iq; ziUBYH`XAuGaTpY4KMqaYYZkx|_5}bi{Nn#5n!(RFSRC%(&0jqtVQ-=U4$XKu|9t)5 z4FNa;cJDR-JV4x+0v?6kClHUL5cUI!$bCQpgL*G90fpKphJeNp_Tg{@#+&_l{!cU! zAp9~+`F9h;fb0cgar>es685=60!Y+e5+oz*y+)7#5&fIMBMd&qPXKzW@>2Hh8HvOL z`+BJT?}0&0*qaU#g+cvB^pl6NfxW9@{CueYF$f-36FCO#{{?Y4G!m3j(}3U!1c-!$ J`Z*2Ae*w~>xU2vG diff --git a/plot/validation/nvprof.simple_matrixmul.l1_global_hit_rate.pdf b/plot/validation/nvprof.simple_matrixmul.l1_global_hit_rate.pdf index 4846703c613bf2538976fda72bb6ebe64a49af65..545abcbb4c9e1989f8beb6b14555990eb1ca944c 100644 GIT binary patch delta 3622 zcmZuyc{o&k8&1fcFi3-MEMt&m&MaqUwAp1X6GF0%y&=6MN~gThVt0y?HCd}SyDTXx zdnh53rHF{imhBt)MqSr?mHB@Eoac96&vWkkxu54;r#hBn@ES)^DhFiWc50wPbZ~xm zQI&OzJYttuZPH@ca?)<&<4uLF@-4czqUmF?skT&U=)47tr|Icg(1g)dif3ELrLvG@ zuZ1Y`O5f_*iqG0$Y2J8-=52V^JE$jqMLeUgF~wYp+Ce zow=U*>VbUU?$8)-H$Hl!e5Osj(kfV^Yu^;H^|())O~5=tjoDx8OUrU{(P{hyRv}uB zv<@Bi4l~xTzps&!m-n&-YF7MfwQzKDA$go<{zawdr*D1Zw@as>wt3-UX7S2vCe0#) zw7#G2=v6$;Fx$_C+*ABULQ=ZZG;Azw_N(4fup#XveeUA%3H?)T^#azV@-58T75aBb z{XzDFtb_DLglX?Nl?9Z`M;Fc^*8q)YJ0#qqi^M4X2)gD{#dU)p+8UjFV*^R4{CSGH z>*<8A#x)Gjml0c`^KU8U3|`rf?IGJdH6lNncC-bI7fDKmmrjihUBedzb#+KLRUq1m zxI{t)J@+eiiHRM(tFkcjY9Z~#5!L#nv4itN4NBXJE5}d#_ZHk*SDxys`CTFp-%-*i zcOhd$-nSPK8c%GTOFYyvqAgjOX(YyYDgy+a@_%tGvm@_A24q)|gKncFTYMg#-?l{3 z8Q6mt348RmH!+IFskFuEf>Vp^{zl1+_A$me|DhBZ z{^leJxxI~4u{$$%hK0on=T%7N3tff2wlRf)<=SL;imuK5@$VSiO zK=;LuVu`#=KSPt94)vsgb>Y|T?oR7z%x}}Hp$;ifCojM_;!2&csaDTP9iE>gbh_jhCmGO4y9vJCXFLjH6uE2WJ55CSc3+ z871PSRybAWtAw$54g2ecSthPB&+u|l<<_Ei4+EJyOa$#_EDa+LXq9A23JB$Vb$0%c zi7qisQHu~|?p1o0GnFPNPhd{uyGj;!vk8Cky_0;p z;!6Vu_Xrd?jdVeK%x_E;=+^U=m|9;_l+Lz#n>g;x#do!$6V#%3JX_{Jelxe&Jg>~b z?AdJvW4dd)l|=6QmkCp~N8L?t3UJ4{>7B3Xe4e z?&SXa@v90w>edap9&p{i^|5sX1CwYL`ksl6za;UrCf!wScIper3`8&6_XPHxF+zR&RF|3Yq|SYD1}O}jW?3@!^yBuoT#b-vzWQa`Rb5$6@$`}Upk ze$$rxnWjc@BkxJXnJm@Tv93s0dJvaUQO&G2sbvn8ckfRx8-XTS&na4GDu3F4k zXW)n8q*M(3&bRbILFl1H_4+HGp3pj5XWd9eqSPG1*q*kOuWvg#q;>^8AQB`&2@FcU zd3i@Sa=Vpj_{Rz<(c7PIKUSesS&Sw;79ZSm+{>iOjk{*dU807m9w zIIUZ&C-|6`p-at?VqY|4koi2cx+J z*YwYm%uOA3q|2e(seYeeW`GCaT-BBJdX(p6vz^IN9!&m`&dWi~c1)!_)!DPQP^zN7 zVsctXwrsB)h7S^P#!p;wze$A(oP*bvB!bIx{SkVMH|T$&s?N47uf^5yfjNuu3ovG2 z*zdgGswrb_(4#6d#*+N8MnIt|N6<}RFpZ8$E(=4&_^Z~|q<8tgaz!{C|EKrj`}glC z42zjYWOXj|<>M)2a55zx77~jrpB}xv>(Gd8<)ftR8Au~UMkD-h`7onhh6<9$sS`qh zUENX>(`(U~dp)7^bIdz+QQqa1D|aPN`fw-yEh7EeT*DkBnpAm(1kPVnOK?{Dflr(Q9xq@y0clPwo?NOeU@pXo1F>+Hqyl7V>cQ%Wrmb1br19`xfbh)0as^KGEm7gDgQbq4rKbinJ-!qbn#8?lch1 zR)@(X&I)dv@ce$x5q`7IE-2S`o(r{vk@S*j%Tx;Td%wFTdLeXuQ{!`EC6?}fLG!9?m zxjvP;eR46@;5=!r#jQa@bdUOSJmH>uW%?Oc0S&9IwbALFVXrmfxbAiue7C4BQRqqU z&VN(a`q?0+okx6^O!$@FDT&#s`TV72sbR>G^Qz++c!!h{9JBKV+_Vc(3Q8q%z?Dcb zI9(E*h^&N-q{ZM?WEecjdfH3J!892$m?`ZAm&l5h2FvW>C^bR}A`qG$-lyGBntyqp zK@mZern|SNS0L;|P~=7dSb)D2COqXf+sMF?QCJierRm`xbP9!|Y+U#?JyCegvb?D*T#yr`%lvz5V_4T>{-v%KBP3 zEDjF<006$@!9E)S)p!`t$KLdb3BC_Oc#wcugAP&dk{WA~5k=SZM9EH`< zA9<_`NZi0+4Lvzd0vUv)>%He&=lZgU{ugA>lU`^iN_Efw*}+ zup|HfQrJkyKMRM*u(&=$^Aa#4S@ha0@yJM5!$?50FEVRGYO8}8Gt|`v56+)Nt?$6 c5Xo#y2!Q_zgl7y72!O0Qel*(PkRkv70Bu*C^8f$< delta 4049 zcmZWpc|6o>7r#W7ES04$GmIE(HNV-1kYz9!$r@R*b!}rwMudiJ*S?HBTlSEBxe1}N zrV>hsq`A1+71f}3s`vB0H*@~@ea`3moacMa`JVIq-o&u9Ca~1zuu!b@uY&IJc5K%y z=vVmhPo5yGksCL?2h!feys8+GxD{re?BSt3ZSmsz?&g8o!0b-qHr}rVRh_J2hMeJH z*^uO9#cwVdUJ$Q=)!YbLZ`+;w@ORaX);9No-3VJRtC2m8ni(sXOV2-eI4+DB%r&f% zKB^uFp6W|Htayo>j;5aYQZGHXaTJge+V$y%%vD>`wvkV7|2txAz9@YQ7}rYd=g6gD(0c#r=wZ`|VmldwmP zB{+hOgF6OLUjXzc*slANl}d+t1c`A9rw!9@c@%zG(^}DVZ8$ih?D3Kcy0{)T3T;&E zWn(}8wDBf~t|U_O#q+WS>ZL0Y^;0C*kRApzG#(V3DKkW|lpcBqA#=I9Waek;#@QJZ zM7hQit-R$#MP9`;4m;?c#)y$?@vu_bwQQ92J70dANG9?~I2rue>HNEqhXS;9 zk>y-x_|vG=h08%XGEikjIYH?AkdIzbsWH!}CnmX#O-D3~lVqi|bBcIXo>5csas)T% zg0^h#*9dn$E6dg2g=nGQ$UG)@ad3Ng$-IaZBt;UqmUqrM#!WX`3|u=2BRi#mwzNiG zlO&8O!pHe)=~%r1?1R0aI@cMSnUx z&X##oJAP(e`DDHHWz{ehP_i_Ddg=tJT;d9$;3R4sp@u?iQlrL53{gUTwoJ$DeP~6W z_)!yw+%deFAkW=D5<5sCRxI4girFVOWWl1LJi;nhHS%+u^a_F&4isnbzYpi+$_JT^ zruHm`8AT^I5cT?7+00lC#x3=poa8vH5gu-sSn+5s&gEl()*r?Wjq>CpvUJbYt83Bg z6nV{A86Dx09Mp4?!Sh?LpFp?VM>H|y#u{1T5Rrr~y{WIS4qgso2 z{0!xcK^nMRuWe{V8sf{C-S zyz{POcO8_X%igpWPt~JIQWz0hI{ScJ4ezklC5J(y$nIB|+>2y+x1^hm1(inZhFyx* zUA{hf<{71Hw9LM78I2NQQ41&TPiYDl=I&wYBhkXKM$ZMl;x&R4RAaT#T%uY-N-x?IeHzg;tn zFH-8LbIMyJT&W%ag$$iwDR<Q8?xyw=KGt4}p@|bnwkCBg)wE}sV|%9RorKd9_=jaj%|dQc9yC9UDNznp zkXcrgpXafDKuG0XL@vZ^ObSxrw~LDz^5lzLXf}nC;=T&DrUHcU4Q*E1=(zt{$xR6- z*`4=)$&0vP#(LuM z_Xyhkl`XgNuYXWeE?Bq;a1~Z}rE6$+S0&z77*L&(iiIi)F)sU9r%m9W8bLqzrNxm< zhe(fNzDf7A!H$fL+sQ zD)^SO^ZEBKWN6ywo$2c;-nCVW9^|EM6^^%SV>2_R-3Sz<&Kp0@`*mhBOWWn)fliko zhXxhMaze%~^`LEsTY{fAZWAT0kFTcCMgsMM!cKmu9+f;xG?o1XG%SmY8ZL@zBh^zB zqPp`sgfG8nx{k6qbV}voio+s5$v&r`FRiF&A*{lv8tp#GH#^NLsvk?Y>Glav1Mk+R z@0kCSY;lQl-_2RisbpF>G znqNmBcvWnEtE}H}HR;nAI|?eml%jm?G#StMPG7-CJ#jvQIU|GIa{THj+uQitIohDk zr{3mp5fxqZZctDZNnb5=_>SOerr7H={}cWRt!|WVU(>`mXh8u)ho+$Osjp9hU8LSA zi#5*XA91_sX;N@!d(z@yy_Cm4-F-t-X;xo%=iex083z)+zD6#=Rys&@dhc$NO-CT{ zuCmFCvpIQ6vfJweWT(OH#9WSOFF{+eh?OP+oYj0gl4h*>c~qDOJQd(mDd$?ya_7(_ zM=#~ng;P;{)TX65(RiJ$r30sFoNdKnb71E}g0kTXoV;^B_+xu$!u6QYr>ckcmNqur zHotz=o}nXrzU`&>+yt?3^w~u}0!zzRXRRgkx^osOB=@r4o*(QBlVQ8`h7fuu!Lu3s8B4q15{#Ez~d(0cwP3Fx5v?fGPw^ zq_Tnps4XBDYB5Nl0RcY4LKO!K96%xwR6B?y#aKT93g?`BB^MN$-i_@Km@XfS&7*tu z{@rews>T-QQBhHMI39MtoLu{=TRiY2>&s)5ETJdIs*aY~&lmFBCf39&c{>yQPqC-h zC~&2>NVq*TZxG8i(DZpSlpM~p5~R`}e}m5f+a{X4bi4|9ceRIKq4;*iyASX}IRS&R zaLQnmg1K3axO$g}=GIZ;BIT}EO8Re>|7rIAcq2B&do{Z}RK>DK`>IEMr4oMI7+~+K zdeY)r_oO)*R--0i5q{yl(kLcyz6L3JoxF~uG@}o?ehCtU7w*Q zGX(Y!;}JZDMnPa#UHu_w3rr{@kqB7S-#2om|z^MD{xKfUkV4c$094ihf;9NTu3aMp}}|ie;r6Pn$hJx3_vmE zMB#w_aR5RQ*v|(b0W3pZ0HFLU34lbv_Y(q06#PfvFL7urL+D?M0Z;(IjG+;XA?=I9 zq5h3A@-qw#$1o=R-Tv1xfJUR3Fbsge?2p4BF#A;oFsSdF@GBn%$21K93m7Xh1!56M zCb_VT_VyEE8N0!R;m}xWvZjF1zkB>q2;+dMOdJ-;q^%;r+>Rm=j-t+L3Q+d5!U2G( zRycsb?ynUNU~o(;%h>(>V#48YEHj3J|5qc*|E2?n;}}o+SB2pSIF1=ZeSfjPOahKT xGbceXO94lt(7!eU0{}Q0$M}crBg7z>A1-6uSSqiU)*;5jL~%o*T87%({{e2!biM!p diff --git a/plot/validation/nvprof.simple_matrixmul.l1_hit_rate.pdf b/plot/validation/nvprof.simple_matrixmul.l1_hit_rate.pdf index dd80eb6057eff8509ac91ea5da73bd9d195e924e..b3b66dc0f31fe5e8a3387c48027d92646bd923cd 100644 GIT binary patch delta 3686 zcmZWpc{r4B8%=01m7Q!EF@`TP@9blV>`TI+i2N$X5XO>(;te6YOxc%@RI=B|k|k0h zvhQn3vJ^>KO8UmH>-w&mdH#9tIq!L%^PF?v??flF#V4_qXR?8(O)pAT@z-~iHNvge znqcB{FIz04J?>gPJp3s2l}u+y?kNEeQk3E8!LIe@j-?Zhmz_;rTk@lCRkjAz&h0rU zKEZn+^J;w{=-u#rdc<;dC}pB;V`=N2>?+!!EDo*t?pub;>fW-UtmfLH6VWy~x;)dL z?gpDEj#`};X)Fy`2k$L9Ttb+go!JwYed)~S!CJn&RITJTL}_w;Rmd45Z(S9vv7#02 z=B@-0%oD#Eh(TG8$KEFAy}PN0d-blS)ggMb4Z0C_gYmWZ&rVk@ojC>hF1*J{yYi;} zOlFLMXY@Wee-!cQLUy(vVk*gK_N#SQ-@Mtx)7PR?IcMtrNm~XxsbwlhudXk?Pnp%6 z{Ti_T+I3w!gY4-TmE<`0I!W0s-fM|hseqG*< zsT}MZw2;d{b~L7|gddcP)vf@XjAt^t+5!F4s9PxKpz(0y1#fDW6P_kt^t|5;V zNa@)oUbpT@wKh%D<}K?mA2R=Z-JM@#Mu~$0P4HVjKKdO?Ne;VoqIdM8)F34j9g|zK;+GzbdZU3K&N+n+ZIiMIy*hO@vDxPY z@UmAR@3XCbdZjGfqmMmPtJyY+_%2*d-ORY|GRb_?#A$0xhWz}4QIChPqG_K_expzr z|1AkXmad=r<30Dt1rW|Bs*^MwM?cK(NHovRaw|Id_>1-96#knvPh!26 z&gmCCXaBgp`sqbc%z)YKveTsfc={rJ?DpnL%+sxn_oVzgGX}m<><^!=HXhuSeg&EE!Ce9mGCo8n{QFusvCaO*c=UGT`EbR?+lNeYuRA4ldd(-8Rmx+w z<_gYBE3egsON6{02Cdn;=fE%g{jmz{IcSSgt!d$pmWM6Wkva-V=3o?g!#7<)laF)4 zK3~qYGk|2;WzTv0tt*F7UP-(10J@<^i(Tb-zdz}IoxN#uib3z_K)S~C@q2IDWm^Ku zI+M9l)6binIEqDG6DV$SclEudJL+*gd*|tkDOT9-di63V&9gtvgRXRc<&I8^LAEzOTv|$X zeUTc~&CT~}f8h)KE68Cj!ZAL~D~hh$7DXX9DNwHh=xKyP;#F%-?kL@1N9bz4qw;8~ zdMyo22!u2bW2$XnpuB|44Nh1T4U=x0dr2rh8XBfOKI-y0T!(TcP?OL$q8D`Uso$q} zyYEIX9&5c`6!dj2am0II9qbx#wrYupaj)XxN)bwI>+XIRR?eMIkWxN(BUw%)qea0q zU9ri>MgRoMNq;Xh0Au`y%(vBo!vM>NSY0!BJ>f365Th`8l6mFcQ@t>aeKOOjn zxFd24%r2(P8Ni*nB&$pFTFb2YGiGV^M>Yd#@}$J#uH&K$q$S%X_wdn}IA_|nsb-{7 z#T2}nuK3}!d?kbmy<|8jDEvcEw`$&DUv`6A!Hl3J=zTUr5jA^Z^I*vx&QLL^gYq`k zHdFpUI+4T9xfAzuy8hc^jhL((AJYVr^`7Jp4moUex_61L6kU86oiG}_isr9J|7g$) zocTKAzOL~Y44BYJGY(c$i+tj0`=9x%BwT0g#_O4Akrv*>czT&!o|4P-X8AlJtVYN% zbnr571{iNyIZ~6V+g=c9+RAM*L3#F!6i6PsJFVR``*vAs$<=fU9`X1A)c({(eA3F# zgj!DA`?bVxG=%!NTQD_ZaiqCqHV`XOQvMR3qW9=-tjnUbw9r3g6o^dRKRExw0q4(o zuNA6mPL@EMnj(EFmfm!R)lmH>XXtSmF_MW+W8nUwvF;dpy>^okKv? z)7!&fTDtl9VAnbM2F==f|ECuM_@uaU%Hw(_TDV@^m%MB<{7jkKei>PE&co@tW<+S5 z;N?`I(4k+yzHNQndcANjE>d)2`F5BSJIw+r$&SQg>0ZzfUUn1?Nz2`L2sE`Yp>hIT zV;_VX8fniEiJ6qb!kGD=RoK``OHna(twz5=p5v)+;ZgATVTpfGx>89JPjw-A5|4yh zVCwVL5hv+79+Zp?R^%HshH?}Ir|+`yz9~iln|#Q zZL=|cWP>ABkOy3`wj&>Hz!>2N2N7lzecOlW>BYtS9DitBpGpegHT#ut&l8 zNEivG;_P|J2ZjcI|DeFosNaxx&zx-<2B-js0D@!_c^$ibur*hQ|NKsC4wv zvk*3bc1!P=44}xPqT@q8=j-Z8)jj7+hAHT(qL3&w001}ugCPR6H9b=b3duNcXThj2 zo_`Afi^DK zlL-@s0SL?(79cS5kSHvQ`5s6Vj>t$6i3I;Ji$oDn%)&@CawqP;gpp`~z-$(Y#sRDt z0mpI~3>yC@kDX<;pvK&Zc3^KDY z5rbjn;fQ~=gyvwh2V{`|kQjhv#2E=PX8-^UkVzH*Fsx$_V6n_22mp*=m@@!SjBk?p zKopi`i~tk?$uj#WBC81F3un#+z!I>`dyxPR#p(oyXZ=IKqi|T-6JvcI3<|{qhwGW? G^ZXCG45^R+ delta 4198 zcmZuwc|4SB8$QYs$zFsQSu&=0-+5;hI+!dC6|zJ*MY1G&WI5g}gUC9fBptgXdqkE& z3)xdc$i5|8WC`Wt^eu<^oO+lp8jfw)348rmxepSHLX4LXVnAarSwvTxrGwv6}L|>gsebano5PwaMHy?%`^`iG8%Sdix(TueqS6Paq!$x=%EsR}-+fm`G zI)~Aov{dHs-2IfhDWwy*ptk*-yX;tO4|{w}HnYcYU#h=&5#^8+KKGUY!z7Ti`{7b2G)h_}GsxFl2I!HFbXUa?7vyrFwRta;9?U5MX!dR20p{>uA}2fPhcD5Oz}xs->fclOnj zt4n_vZxA<@pC0I3ps)Pqpe8M&drxf{ZB%1YUq%?ObqVwwe{tudOM#&GAGLh&d~#Dm zppee*fwJ}M8+X1c!It}boo=*EWU_DD4V-hLdVAYgsyTuyl80rNZ?ABHY}e!Q^sp*mP2&xL`Q< z`2|;cG1BeC^)ibWT2sWbD8aOSVUL~M&l$VSo)l5)?t6Gk?xhr$gg38Oh-HBBugizBZjR7TFhN^RMnb zy=v00w;}CYt(e<&YOKL2fw_HyVjEH>IgTEq%>lJVq=`65{*1vHv4_Wx$`)K?7rmDC zZd7RO^Pb3N&E%H&7`gK*8aLy(Ke>$5VS4K{ox9FnKt>eyz(vg83%<{FMPE3LG$e=A zS|0?jYo4<2P)@uVuV`<|azcL0Q~zBm)gNFo+x^oa8a=d-Hc=8P(BUSIb1{3=~^BSr7QQi9v;pMV$i#!^^c{c%JWomd6BV|q{Cou(wkVtcS zK{b!W&L7DoRq48cGZ8s7r?V<)!aDDT490yp1UnMJfK0Yhp9q@*HER}?A zyvD3tr&<5BTqU@x7(ZcP$!;gHTR$pgY)maKDpz}%B|@* z>v?Pcm4q5mJK5qh&v+}}cV`|7;DiHCU~{!j@!t{Zzw%Y;2Iti%%alppX|k0I@3F=9 zCB$al!%=bFBp02rc??-XQqPK7B6qF=Z{axerT^6CQxP@Re|SqFSH{c|X(dp*vd>tu zdu^^3o6D8Tc93v-FSbTJJF|uIf>u}gKNw0+>30vodE_Cj7 z%Ua}{;VQ^uu0_LDGRE%S62_iDO_JM3@7oT06i|(RS8tcqDNc;heTsS{%Pig|l0Uo} zrIerZ(g)5?(+-^r2)?LifbXL}ToEKWAyYceoYL&A)zf|-T znW9`JLQW?y+FU-s)K&kwH+PY>69K#uAin(r-m&%60`e#5I=n3z@PVh6uFxCSa0#{U)| z`#P&=IMr(eQKOMIaQ;a0AAQnn>_`_`;zZ!gJ(c@XDOt%U+JrhXQdh6ALi4nu$&ryP z`TNotvIER^U3xRtC5@;aeEHammAkJXPP>3>M2^{on!z$iR=>1SQf+UU#iQZ3zq2V< z`nB|N!tTDl&Wo$_-Br6>z1TJq?fk|3NcAeWULKRt=tcV}N3dPcYSdd%zbEu<%6cfc z=tX0sU|ew4zCnnK&21U16>`xwJIa|a?KjHuN|>&uu_7U0AN%}=K*f*Qym9OCS$T)2 z$`y7$*_-q#uAr@Vvj48u&XmF5=+okcWG*l4&qJ9OcYnG5msN~(@#~Z=~N_tnw~6i`sN!J<20L)<)?i9 zwTA2U!Y7KPWqp99KBuYFwqP+u=iCC_Y$)Rz@~bwUbbD>I$XF*fB2kA_#Tm%O?l|m{ zQkNt14FfyOudORt#CA9Joony$_GWGu*}Fh-hjqiMX2x;i!`&X_yFZ z%GiDngTri!+<{>r;!X@Aq8aAE;s`tY05oya{_Xz*AQ1icyzoE70uUO@a2dkjb_xR^ z9C~vz+Xe(M0G1I$6L$&$Fa+?QeVhE72Mcbkdh7hxc>s&SZbo7UhQ~AXZElDW!$E|d zcX0$fgAhCbV;HQ#{~yJ8JiypTU>uMDY*Y95_ylk(soRARFt}e%!~alBz%%R^Ac728 z0EieYgAyVRy)$b7k%-yZ2Ld2*n~*L4K>))*0s=(zPWwTS(dtd;-%9=`1l;7p5?;3xoh1BL*?PRtUpLvKiK$&VV@RKmWu1H*g@1F_!mh0yPE(2 diff --git a/plot/validation/nvprof.simple_matrixmul.l1_local_hit_rate.pdf b/plot/validation/nvprof.simple_matrixmul.l1_local_hit_rate.pdf index e0b782decd21d8a7d077eb3c593c46d7b13b6377..7cf27f0c3c1c16bf83246ca79b6dad7adc941958 100644 GIT binary patch delta 3410 zcmZuxdmvPK8*W0@uoffaQU^2c<(xBTZgLx!=&~+dkXtS@?grzsD5gl4DOR1L6~2U! zMGR#l<-@qN%4Meu!n&1gDaB^h?vB>?eKFtfulGE^=RNQ9KJR-@(f63L)0i8%7>@hy z2sMga(`?mfu_gPfwuCpawM&t{2pv^-ggpJi;jS6p`dEAwkKfU7hQFh{ z*F%jy{NUZK(Y%FT3Bw!u!*_yn9(QvlEYEs&UMalRd_}5eVeH?3D;w};Bz$;ca-a-x11Iug|9Z+@=b zxz!|5j6vH`1%z-n-@86-8y(-bCmAy-=$y$aMmaeag2KR(=l=?LIXyi;xtLwT+4B5* z!J(HE^^1!|3Er2|2iyWp`HzfVoKpH}l}P< z>x(f7FW12%1@i+EKkWQ@v8#vPl*w@#Tz9s&u=%ZuwR5E%lBFCM(e}d)QYplEbW98O zgKtZRGV-S6)Z23FWHh|9;1emzG;!E;>~?FZM#Zb)oVwS7z$|)$N_gOc(wY8wg?5*P z;SLo~2hS6-3hfNU=DoeMQh$6ap+PZ4v%TV(hkRhcv%ZPD-sSY&S*O=k2soD$tUa_} zrw%xnsUK96v46znJpCF!ejKmQoe)g!`BX4m)`=;uO?g@0?Zf0ZCYgkKlOGBs1>wpU zef92NZ@unO{|WhP%eq zqo=Tf7?XMnZc%MN+>sr;ol_^LkF@^>Y{Fr zwX;&+JIcli3mdqMhJtBiF}Z-uNK=U zz0;EaN6psKLLrkQq!v#0GOT2;e?cu4^D|~~S*P>%*Nn<>eGUuFsb;dr>3;;lzGbl? z|6yuvXfT+z)yX$?&)6_2=8o+U9I%qUSTv5aA*b12?RZFwNmISO zN;6Ls3OLDEC{a3WGyELQ!*x8Bils*5zI-hTxNj_q7>);ka zBXz*2qEE_dCm~?07WnC^He(DCL4EyM!~2R3Qv_snVXZ1j&aH39}HuZF6Bn;DF@DSH#gzxU+r6V?gO!>0nXdI{aN z=jx(sHhzQCt)s~nw)dZIJz4v2o?sYiYqD~j2SYAAb>Gn9VHgst;O-nf|HA)le2qjM z;c1BWoh6e83q#lB(l~lqSr!}HT-0ttRB$o($DC~Zxt_KF!v@bg#QM8llp<|+rDG${ zE7c~VcZT}D_j;YtP?;U;A0Blnsj8vUQBuuGJ%4zna>h%+Eo#@TjFL~R^a zziSl}@&`NL+}!N-EK7-7b6FuPHH-pZL#MhN4ZDX!WhMKk!~UH6c(t)QXztUb!w8G% z=;Msts;EG@BYIF>{NxE62DryRl@fobTmZ8+2jD6;Uyx0d&?7$9rQ`j6t6n;drIn$Qbm( zHbr#Kcmf?U#-Y<-GWrO_p@~o?8VKQd8t}RGXf2{G0YD&x6~+vR0f@9O5O#22Pz->8D-e_I&twUoq2*_+F&jYOe{P}y zgt!s~#{&qm5=Z*_GXaDIfZr}x6DEGvAdG_8OyR|66&|9RZMFv}^lw6bs9sGM2q)hS^VOwdoM5ZSf4XGao{SEZdb=wAjM{X+?@X z5g?J!+w?dL8HKFy7zlN-qH99>SYvB8(>EqKf@R|y!vyqfOkfZuLJ&lT5QGMy^;YT_ z3M#My1EFOXmJh%Ju%816AyI^~*I;BwghvLK9a+sIgEWy`G6d1kFl(9^5hSCf)&ver z6?(Tik0_k!S`4PrgvPGs!9=Ri-qjcaEe~K7CTtcK#Sq%(T+SEGf-o5tPG(Iqb$OH4 zU^JR=DOO`d1Qx|;B8GwpO#ae5VSVBGKQ4m^jkMN1kc5c143fxH;Vvwn|4t1ggOEta zWZ_3F!XtA)Q6hDDo!7jOMip^}0xj?CY90lqtX%_;LIy<|q0r!ea(92BnhJ_4q#}?= z5h{6=g`Xu*X(09d4jqn2-w+5A*ZKp22yyMOAdn<7KxkR}nie4l7CODUZ3rR}|G_x_ zYX^cTl(m~9Tz=6auxQ&5LZOL>CPB2dGk{3s)f+|-3WLb26vnW9gTt6?tf?trye~MB p36Lmgl%3)h;oOCPEOT>gbc`=MCZ5dYVds&ilvyt#|#NcdchVYweXAkI72L6!I|eZrdQ`ieLtcr^{GPZaXqkATx>1=Pple{L}g`>5tWwa7Hw`9-U*LSKiR!8bj>y;j`x z5@}63G;!p6b<4d=+)LM&H>PU3Bi6M$=fI=G)bN%&glK!q#F8Qp|2Vs1 z%gpes6#Sz`nt__Hg`as}S0w0LH=WjX>aQ#XN?PQRZjYlklLtXThE1obh%_$joTOC< z0u~|oxB;1#wCB;D%1O6>r!67=*6=SRPo#IrD=SYIB4Ur_IYl_KKwCXIggID>O&Ff zq)B@2aoE}AFSw;@#2_f>U^&Gkaw6{1os04Q-p&&WX36VU{uzpUEAil?@~%1c&K5ys zV!(&#{Hm(g0`UsP!IwAsp6MC~Urf21z1Q&D5hZ zs>y4{dE6xRirvmO!}1Du)0(Z-OqG=$CKHs_2EN$NoHWjW&lc9ptTXHOsjKa3TQpEg zO4*#VSwwX64si#Ao%#7mwDr8zwwjI>9-E6hxXjYg&-E%9-5kr-*qDhcTG8An3CH%u zL4~?LAKaEbvP46lB;=~xv+v(dWoa%}#>J1Udsx~Q8`35pq6K;%op5*#E@=*FdwDRc@>sxBRXwl?rMe z=xZpq#9lISi3%WcMtzFn{-9M65d zM5fJ?8A!6{Kd>k>j*lrC(?6A}AHBBYTMVtc_y_PE&z3OYP=_5fpL9CMYiVD9*49Q| ziVxqY(oNxoj5zeVtwdYnmD(72ziOiKSWxpeLyKf?W3FNRirO+NKyrUwYU9pL}-J`MsW>xA`tpX+EWTd9O^Lqrcu1f$5vzXh!=J?Vz%C zKTj}fR)EzpiuFj@unW)kFL8e{>g1s%fVCNO8JSsHpB)_=j_Mq1#G)B z*!&Zhmr4yaoME*0d0sy(TYGG!qQFF0m-e&pVSo2IhaS@)!> zD6{XnTViFnk3WTt1sH#Uz+-O}r1n4e9y#^qv!l!CO;`MqHf{EuRUBRXjoq&Fmwss8e{r)yWkUj$K zAJ$A9W{p<#JK66h3sgl3G}lvxxOltthUAI;`YcV z`*C((_pP~maf7~>J@Ti_>Sv2WtzM7ceRDM-wq2`d`QJX@%!aM`QTZX?^z-g))xI+M z^iY_Hb8&_GbO}syHoN*hp(gcnUafO^Y-#Ev8_dq}I_^lw$E^etn}G2H+3en&l*Htt z8cQyfTr7MuAm;{mFcMQ}VXU0$*>Lvk*!3;j4Xs!2V@|yYKQ-8CPZYF?XI#=zYVH^r znVvGRy)NDA*8YTe130qui`}p>*5H=cn!;gLwY7oRcf5_?HyiSYx{ltd9HzirGv+-t zX~g48U<`(F&q%2(z*gk#EJf61RacFhSxrOyh3x)!b5X9cE;@ZW9ABeb!%^b4h>bL| z6~rQA5)z9@5S`BdBJoH;luRZe(JE_Ts)O}OJS5TcL@OeiTT5>|)OW2TF@2VEfAF?* z%7#TJ{PXgPf99*Zz)Op226px;tHJ-VU*d>PartDTjR_i_Zv)WX{4z{i^7 zw0Ai2)J$tunSYXZxMEoIMBlpi-EdjJWK`mrfbsL!qK%vzEQ9=u^YtyJ9YC=M1uf;Q zqNz-q?Gv55UuOGieXyci#LvFw@3cud$+NMzBYjL}3FlQ{W?cbck^-|o&VDuYOc5K$ z`r0ZTlSuVamdbt6blkdmNwIO!w3Ch9S|9csrxN1v9rvl`6X!J~quvD$tUl!QcjIW@ z(a!e#?!#eiF@|5Q-$+ku5}e;}Jonb-<8h*)<@OT+MX#6n%g)Y)?|J;`n{4pz@aM>L zRRX_dH35Tg)v$<|+G*q&faT|^kub<(oD%W@7(fJTu*e(SYa~}4%kN*)h(S){vB)j; z4uqtkz`u%5!XP!8SpG(h2n?dErGRv3S|UradHVnjBJ%|yB&bumc@gaEz>NpUee z?*qs*00bN)i5!4TpSJ)aXWq;BX$Jsgysw1uKn_49Bd-`k$X_`l+pd=zqo870n~XaD3eWOfJ#H!jYBYWWV4An21Ga}rs%iGWTiOBfT$#pg0z{a zU>L}l2?#S7XkGIMU;~mr42Vi5p>!7ds6sdzoq-nlJC4SfS8$<^h8Fdk553S(x*&*L z+<+6Mk*G+`20Tmy(Q1CfL1YG5C?1(cLS`j{Wu)Br%{LIQSnT zfF!;7p8XCaLsX%FWU^3VkW3d60i=+qXy1R2N1@XH=|l6Qhr-ZXs47S$(S?0<+QN>2 zG!PV0nnorIRZOD_DGk!}V3H7!4$&621f-KeA!X4e_5D1R4LQPYee5Pq}`i(}T!VM8})}|BCUy5a@Og;t7G|`G5X5o)AiOAt!X0 zggOmDr1`)5_qPy*@lzjU&|C&yC>2B^QH6a}bZspNgHEMTXp(TX5S2Q=)tlc@ z^JmR}<%4MGf3pxs)1#0PXA2c{u89sDPB4JdnBNyxY%g@vpc0@LV`IsnVE4e_ut27l QBt(ZmP!f;dXm2U`Z{CMqcmMzZ diff --git a/plot/validation/nvprof.simple_matrixmul.l2_accesses.pdf b/plot/validation/nvprof.simple_matrixmul.l2_accesses.pdf index a075a901167818fa23ab2562245103a223a6762b..35596a21308df6495e2a815bdd85bf72546a3788 100644 GIT binary patch delta 4368 zcmZWrc|276`@fQ%$(k54VIsopONwNR>?M>4*-}IkA|~0h432CgyKE!d*t3*w*(KrH zBKwkEmqNayey{tx*PK7jJm)j-=Xt;1&-0wu>qJK~42CikCo>SQx44N^^VId5eCk$W zeOoUARd8$W3qTtM907^VV#>p{26)j)-yb!oc`zpC&L5Ac(Ur-Cy`O92Z<3M@flRumSXP;D8c#Tl zsU6^-(ng$YCzIz|3g*U1c6#dt{l?iQdr3c)_NHswJW`28(=+j@O7+{{zmxhKp6|gM zyfC|8dN(d4=xQ(U)@}AuRHR=2Xu4{tsHnJoe|MAtIFUcD1Itd@Xl0ohkM>LYp4z`R zutUCHP^x>9**01Jq-)(VuPd+#bby?^=i(T%{pI#CtprnTnd`NdGG-gfVDb?muB!vd zI^lldKT92mzOsXhMrg1Vd%wn)&z$G-h0QtVnF}LU@u~Xg1$Ny}4ikM#f_cn!kzPsP zP-n>zVYfE;d(9u8aw&wHql1ncci@XC{l@~irXFSB?Vl-hd=+%6*v8-L^%i)7yk1}l zQ=U-iJv&{qc`CTw*k*P$D#$6BRB<>s*Zo}rbM0#~G57LI_lAp=H*a5Q+7qQ-yMh4a z8y7TC$Xrfud=ladNL?ySLR;#W1SlU9bZk3|17Q4@`#jfzjX!wE_;+7Z-zwBKVhs|DxnMr{ zHcC)FeuPrIW~USWWHf{0UGQWF{M?c$w|_#cs!*9ykQje}cdZVF~f~aLISaV+ZWeBv6Ewu z`x85wMMau(dGCW-*xVRd+zh!-d22bRNosng$(aV=tNj^;Y@_?kz$(sT=U<#n8?!rH z?Z-GE8w)+7wGThPYY?*$0n&EMAPO*D3h|Vt6`M6wl&$u@lO}$b)Ex##-AVf>v+NUP zN0Y$uy=v3?=L3Tnq^+RRZ55hgOrs&@60K0Fu{6;z`r%Q-FKLZX2HiAHePrfw)vCz6q2~jpJw-uC@%k0I?$(7gXZ=2Zh(6x^7=ro$KJyVkFa;8#_wyD$L=_&@4^5XiCw>$v5$HH}vGbw|HN1}| z0pxTpX!VgkS7`4F`OOQ02PB?=g+ElJMO67*DxZn4lx)R?qnC1iII2w9p*tnfDJ z4>Y)g_VfwK2ZSv9TU?&U5$v@8B&OU>fs}d-bjFxeR+?1W=ett1se-ej8hFD%0mKa}j%z!AV@=7*&ZGDn#GP+zJvjxWWy3!@ z1c`|C#pzuXp0cBKGJHE)>!^iTx&8zB!`Jd+-pNQ`e`yv5KX#$BTBFrjyU|6f<}y3p zuzK-!$u?g&hQYe{b5@}B6|Q(|0M5~33Y4hh#4~B8c`Ua~0eXXg1e`au-unI}haOf|A<$^=HMCo#hprLwygrzg|Uo30*uF z{$^MwH^wI2M31AzD=hsk5^$>NmUG>B<@W796Db^LcRsO-7}0Y3Z{EoTujH5opNs0_ zesK#uOVQt&o&7zemX{>Yb&Vl{ECQ`dUrMWKY8t#Rv;|Q@;}l8!=`WtQCpC60;J!W% zEU6j{TukA9UqYQd_DD;rHcIQqc^9Xxyrm}Zg-wVz-=_O0=oaHMZ&x`|X;u6as1KB6 ztt)Av5_;8xhnR@=OYQgj%ZzW3xY9jTBh|iI)Ba<`_k$MbR$i6af*j>b&Kd6I!cqUP zy2C?0icuVD%!BIKJ?`uup z+H|Et8~Xa$ALXyF)DLns*KhfZR4#Zpg8(tQ_s;ZrEKoZT3R*&)i#YuK9K(NpoGWHrSM}m5Wq5 zU1xqF*sN#@khgzUXwfER#wWL(q}SAYe`O@8Q7q2xc#A=^t6X3P4?noyNV@OJLd6|z zALCY9-RmFQJG&fg*s4|AyT#{kpJ6KAjd$3~b0Ok=_6X*FJno$1dXw8j*qkE{nAmsJ zn{}-she^lO z(Pu=6hiw-s<9S&l7wWre*dfL8N7tf5mXj8-^PL;be`l_Swe(^*>)|0C zg@Ma!t#ymwWE-Qh$7k6_({>Yf8wC2NvTqisc4hic>%TD8o;>vsTT^`W?^}K)vM!zc z_v&T1dy*E-vv!JR+C9WV9EGwnfxw*fP2mt?+2ttfhO(-T^(8du@&o^b%AqR*wuYO+ z!FleiC3;0EnN*&;`W(v*&S&r0B@i`wLEOb%P%EcJavNE7)ci?a0AJrJ(O>Z19jMqa+J_vHLbcEZHz5a|BKFAJV5b-tqoockFaMc&eMY zH(PM73_P~t^%*2*Vph-TnKSyrS{i0ct;~P1T}7d&1Cb z%I}|TEv@y`c9|FjH;R6d)jp>5XxOSaPc}t)++*nGsY9ZyLrudMgNYG`xAit$**@=V zzVDctUhFO02o~(w`{NYbq#xjVgbyeNu^vX?P_VLBpam{Q1O^6_$chtn4KFz}!C8l9 z`0E>iH&Ri__WAi?bGw!3@Xwb8h0dODbnj!=Nf?KQLL>hITO!m%W55k+EE(WZ{`Zix zb5%iC%T(`5oJ3{3LF@b&#bFsMm=H(Sl^Dm_1tZ&f49lHSPq|V-Hu0{n5_?ZeT?;l# zLAl#%%2%@Fv4)!sd^IJ83FIA@(zn+zD@jVCP`GTi&jpOeJUpVmM|xq5UXM7dm7mYc zEH&Wv_HG56bGN*~QD#{VE}k|`G6`g6c84x1gs!@z@(TDz<7cWAV5ffFYWi}2?wBEoUbhUgd#(k zm81~&aRoa^cUy?UeMe6S8UsCf(itSW(UY9O(qB&j$$vHL8US2>|+sl z2n=Gxu3+c#zzu?g?LR;i>>)__@5ljyMEs5%AxPx!$O(c({f_QHkm%ph-LjrDvMda6 zU{6Jbc2~I|#0tm2Q7Akd1z2CuAtLs9_AiJtg#A|lM`18n`ZGKo4g=pO{%Ce!(gI$1OaEOCbf??sbB|N}^MG_IeM)L12V6iBAARb4D!@=p+ z2!=!8X}h|A{l5_$3Qi{wheiE1g8df|55v$U;}P@%@n|$0@Kfg_9`p!@A?W%DhoR}W znAV_!Lxsceh=W6g!;$pc3CH074@dmRBOH!L9rOsNy#aJ2G&ct~0*=Dr=+4lz*W*AR g;b`oEezU+}a0~*62VQHab6`*?3_BF6uCKxVKXD}@Q2+n{ delta 4948 zcmZuzc{J4R`)-mYVTep&%*Yan**E*X?}T1!Np>N7UPFkiF;bS-nk9xrD3m38$v(D> zHIY3_i>2RG=X}rm@_YXI+~8_vOm(em1(e{ehMJx%r#xyEm#gao_emeK%2iuTw9{y-L3pch@U+XnIyH z*qe}QX?!oF^v4!)8~rNNJyWSu);$obE0;PW8aXP>nIfP4sN* zOT%NB#VtJ`OwaCEfi06`GRGDLpB^^Eio0L&q|Udnr?Dwvz>gESdGao5*OnY7jL&*G z;T+WczV-Xg#MbEB#rXvH_nU8rPiez`V)p+&=mtHd9(dr0#DmXoc5CGSKG>hH*_5Au z*i$H9pJ6Rp)jIOoOVZwONhe=p`UPb;^M<~>#LxI5mQI5&Vn0;v;R1m{Qf2+!KR@h! z+L`=nG|@ahSM(^v!Cc4&EYtb0lKL&V^i8kQYrd=R)i~YsLaPHIUFOX4IiK2-YaLHV z%#=Fh|IDiN$RJjx>E@)_nLhT}Lc65u6wElUKakYm88rRUSJ3PX=}N8SiXTrEZyHQm zHeDq6s)_NMxoul}iS&{(454<~+vk2>CKCG1VP_uIevP3v>yD!<&oh;I zS5b(hTS--HnZ6EZ^`fv0{;!iI7kaXB5$4xQY49WlA|xxGG=I{a%H1#!$mRq%$dl3+ z)})*WOshUF!8XvTTx*SFvAv`ACC7DJE(-gtk`&r#cGI=Eudq?8XTw?GRt8Yu$+?TY zByv+P3%-QH;05w_5rA?T4)5Ahd>hp1iZD4X<;J_mT(lXZcHOND3BAte&-l-refee` z{u-)Bw2X=q6Qi6u%;%4}ye_wEICrg-~OyhXg zY*_%vEL=qpqP~R+8?X=|S1*U^v7f&;CAXJrVykBgB0>5)*dTdI)R7{Cn$jl<gA<%PB%@HY;ENNuCr=jE5f|r(adFH(<5Ff~ZS)p!#KqG93op%Y{?2J5 z`r3DI@h2eL(P!^ACiLaFy9F)q4p&dJs6N)Y|HiS`q=gmn4|J|4D+LH_m}h>@8sxnv z3i(2LA6dJhfOKJh#`)o{MRQoUWQxl7q6f7CUsMYnCo?fc{I}IgJVpCh6npAKyM}Uii%EXII6!-R12es3?R!XZ2;O1UW zHM!wY%_OTvNJsA_)m26FP;JIq57j`3Va~;A2IuK1yorF5bYnvd1xm0_3%- zL&M<3a|WU%pN5+hU!)z|(Kh&nDlg;vJMQy$9}jll0^5<-qSFj)&1Y+35Ki5+Vy_fs z>>;8Nyfl7&$uhN8HO4BB`D>U9_GejnRbC!1eeCZt0*!tLV?F@X=2pPB`y!e+-1$BSBq&j^~nwU>qRr+ zJg8jj1?@nSqj*97FL|*>_%@@#btycP^07C1T~u0N_d#;WVWM0Sotiw-_`sE}s=RXC z|FJ`{g1V>417g1O1eh;^(5awW7V&jT)Y8D-qDxg9=gscNU7fCF05r_b!#4xk)-+>q z-dye9z=JGfyi(D(&Ea?M49u{CTTjK}x%ScYQc;3_-9v0Bo$S09k;0_SANPE{xe|(U z!1PaBY4{7~wKM)!Wh3atuMOF_`hz`Xzp0GPoY=9;8qTDK#ZpT=Xkc6b3>ln%0`!Z4f*I* z!p$i9Re8COrle+0nsK$7>zU13PMr}vpXjq~BC!EVvzee$CnhH+Y>GORlu}(Ko20W0 zGoa3AUWa{tryc#)rWe~V>Hd^T+}j?C1p2KtsM2F6_d2`3)=cVgt}&^7;(%o~et0+( zMD?HZVv!0ecoZql>lvY(_3>i*beUF@4a(<}Nm8-*hcjWrg0bxksg6t0h);0R#pk}O6Z-S6y>mKZ1Z7Oc0c@vQYY^zd$p_iME-Kj|zbxR|Aslawtl;&Zg8i3zdDnPV+i ziLxjT!9My0eew4#!tu8Js_$Zbye)A6Kwt|Cc9wv+KUK|$8kbOEy<@GDB4n9yK8 zch@k<*mA6(*0YXY&_{vwX6=0n`X>)11IP$OV5=1 zsTNBEX%)c{F8a-UmK3eJi}W@r<-sN~E<3ItWASMoaC|U$n>y zn&o|r0O#mVgRuVu`vUvU0L1L&GZtqffM&0{7hPuswAKHT4$pFttEiG=m`LLpuVRhz zPcNTO8}`if_dMoCq|j;lH_V-fO$Ihx+BxaQ+&~803yY1{zm(rmUrZNYZ~1`v!AjR? z07LSkhD+pnC9B4BN_)12v0Ys?AfV?|L49NG`?5X}@lHFr=ZO#fS776xnMoI(i|th~ z4EA_?3G$~d`VJ^Gj@sTSSm?n*ARnfy*&09bB)FEaD%1$K6McLJo(x^D$ZW1{^ML!k zY z9=W?7!RySe{6wb}_TXy3BE{Fvyo%E9TL5USE(g$c)!Rafg)P@_4`@ZMwBKA|{I2x= zoG$sD1A#iVp<_xJc)Cuh5I!M}O`^SphBEs2Ti?C-PjV0Tzp%S=0@wKQbW*kt8T8P9 zV02vJ?N05lXV7|X^tw{%P+1tCL?I)4QQM#stwC-gs-raMUngT`VV?QAnj05g@ z#!c4Lzy|UvJnlGre95^ReO?lha^dX#9Aasp3I~rEi22&)HzYW^u01*B6|QcLKmC0_ z<^-9JD_}-Av+K6pHHRG~*|AYANtu<1QzMD3DHW>*KIOP~eT-gYcnc=ekFX zH$V+09cy$e`a`WN;{Fd85+7OGReIedue`OoW^~F=Mr<}@@yC8oXr~a?a_mNry*aV` z%gg6?G9>bsr9eP`*{bNt9d^mNn{r*i68q_D(fv;FGi&b;rLNWkz><7&wXLtNybL*X z+0?ddF=}%rC~vcZ+>i^oVfN`Es8*#jNEfG*Eqr;zAc=kvwt6=wn{6Tgam#NFzs?B0b* z+$mF@4BA>9!9f%c?ytU?%oK{KOERv#nR$0*_=0R!%_N{||BKMU!G#ZJbVnNyma*dg zWuAoKO1`P}`jnB4xaI&wpW$c|LEG&WT`%S=8Q{DxnD2eKZr!YI>*M{Ul5nE*SMc3@ z@MXn**H0a5+za@Q~lz7WCK zRhLou1VH{0i5T}f6r z5$jf5{ER&gR#)UxKHLnsed#y_L`*&N4n#g(c=b~*&^qC1oRo)yhQ9=T%2V-^DK!Gl zZ%ixs({z=+-gPI2v#bWmcE$y>UBlJ$CN7^P!)I=_0mUEZ6wfRna>Y1x3&V+TBgIWG zXFwF2c$9Ze8|BF~4NGZHtbDHa*b0tG@>okN3YERmrgqD%qC`sNs}Y>OgZ!?>>DhL! zj>#s)Vy3`FY(o>La&PxzrI}80e1?w7YnE`flQ(C*a^H|ew$McPFS~m(cFFW>oG!ZD z;Y<&2@Sb`y-4&(%_GE?Jv#;iQ+JcVm3+B&ZQKsz}<;}DU%)u*ObKn5;p3ddsFv>v5 z*lqv52$?4EOPoK) zE`UKpVYi(8p%^R_X7A+S=naLrJGt9Ku^1>!kr{`9lCC4w>6sA-G_A)5i;n zBtYR%BW9Qb6b1hkL1B(i6yi_h1VthLL@rPi>QCefMWO#hZcr5FPxOnWG6OHp2!NA1 z3TT=lkximFH<;Ybt$0mmaSNH`ir5?9c1z!9)e_^$T zXq&&PAI@Qq;IKH_sY5s{j%L7-91cUGDiG*!csPkiQJ9Fs(DeSdJ_>)d779b4X*+~N zAn=ECXgvBb4*vhtgQEyInx}9$@n2%$Xv8n`|3d;tBeB1M|G}ZLM{!u>VX<%w{D=W? z3=nExby=g@@1_l`i&+CHQNfjl}Nv~uvnZXuA^ wqZ~*)!tn6jAuwq45ycoh=CCseEb{1U#o{qoQnZTdDI6SuVip!w)mLNwKcurX(*OVf diff --git a/plot/validation/nvprof.simple_matrixmul.l2_read_hit_rate.pdf b/plot/validation/nvprof.simple_matrixmul.l2_read_hit_rate.pdf index 0fb632160a9778956a2ffe10b63196bbb56c5186..29c65873457815b9fdd7edc7f487af2d0311264b 100644 GIT binary patch delta 3715 zcmZuxc|2768%`01WD8j*+n{jH%$YeeV@vkzYo#cAvhPcDk|axGnO4iRQTAjo6>$wI z5oJjTS1J*8FOjA68@J!@e#-p5e?8|t&-;C!_j%uQ8nZaQoa3m><3OCvsZw~shORr$ zlhlaU96!FurtoIis9rOza+E&V?4IY@B=zKz@a)_YgroTm;5xl^+HOTQZCj!D4KMfF z=^0}V-x-^U>)2ysV^LaH!kjO#I#f zW=@$4#ZbRoBPaRxvo=P-4RZX*Z1}RtT-x%t$MiYaZ@zh8{@oHXB)2*ocm0I@?Tp+Y zt6Yp-86yXm+)}a4oIG^odwuUjr<%`b?T@ar{DNj}xElF~3)c&eIVl#8!pNP~>~7~l zN%wA{o^aP$rRRyKvpk+@*eZJXJ5xaB-j>UiQ~9rR?OIpW2Ovx@CO`B^Tax^j;esVf zb!q%d|0m?o>I5akZ==qWoCsGOcPHhEx=GF>cg1Fk-OdH$yoB<^6W%%o{5*WYH8)kl zcDRclFDogqxRxAsYv=JN2bjKNVsAT0H|cWkMlM&c;xR9^0xZrxd?ht)O;hSjz9!OQ zI=Fcn?JG-7dub@1ALiCHSC(n)uU%`GbzGr)V+}@f^~!~u8l>qWqv67GVCPfaA+$wp{kOXTpTX>&C_EGtz_>BJC()0ZmlGHH<;}9qv5T4{rc*@ z_Y%{30=V zH}HvxkRz`SSqLBtC&S|Wb+dcU3wIXfWC+pT8BnB`iZ;vssj&T7TY={xh`aXMk5_S- zO_7w2Jik4hy6vX;$btgd*pnh}gUP{rqsIhkm?S;2>e+8sbXJFUc(qKJWqEL^_B}T! z*nOT`fv3=hY`u5+LjiD0a@WB+gQsV^sPd0db|u$Ed`kr=NSu?XaP^WW)}-RQx}u!ZER;w%8i=8ExqjSu@&Z?bF}?v3=iufrwd|;?lmI4uMt^f7;~_Y_8v;YFvV1 zs0O=7AFR>Q?nJ0+{gd%y0%r0@J9x*#9Pvy19R74FRa|8U7hify@%gGf#z~`ERlB;4 zJnRULW`Xg&91*gPW|@X=U`T#o-fYtNUMKI7AU zTJ%@au+bMpCB^jiw0ZQsqL4I3tJd-QcEP#o2SAeC4B^59H!-cLx6_|qM`SJ?Gthls z(&RJt!!$*~k(9ODTvAkiqV}Ptr^6z%LP4^|$g-GD>f7>~pf!FZQEwz-*UI(?`^6U( z^qUn|AJHC!L`Hog3#eAeK;#EobQuHBoea4&r-?UQTzV_L zOzsAHh-;H9mZKX({C%n|$4pk9Q62;eA0Iq(fB1esdW-(T#ViA-CAoS%Lpg&ByZEX- zV+Z!o{Y0AejgvH|VS~m7$5Zui)$|)iQTr8f(||bNc#EPzlZ?pA&W=7mDoWg(9zQUb zP~*I39*z)Of#+}d7!iuJ1+o4ZbN+WN%~vq_N?I&(;v0j>TR&PpO1BnndK+%7+d}Ot&M_2 z@0^3)yrpgO$?nFzD6SlJ$bNFOe*q8gD11x%k`oeI#GqO0CwmedNSV8NlhsonYh2tS zthns^w`bpY_l$n;M9aGv}Undu>-1&pt}g@U%T<*5rKOy@g=H zH-jWca+zm%cF7m@fMeFRePUHAU-%vc+$4ej#Ges%cxr-S0d@y{AFU1J3K zeyEHy5Rp#@_l3oS_cM)*H9BoRh_2WLxn3qCDu~3Bdo1n4c`s8 zkHpSwG>fwzQ;PoLTp+=*Z{IC1PU2$kx!6rnw0ivTxMD68r0KhVhC8v{azZE0eq3m^ z`ca{D0)zB1_@R_&TW*)`WN3oYnNB@!opPjeq%OMBxg755)SOzmU`&G;v6af+f|ZSe zFNQTl6kKr$D&yRTo(e6l0N@8wM@xo}bN@upOZ--SGE+TO+*j9I!r!94nYp1~aWPb{ zIJq!W=!6`qc5jCp%Jtja=PCdB(0Ip{X6k#SZ}6ZV$}7+_EIic9Enp1=xgSG;STulM z16Y6r(Y^UH1URr%eSSs6($+kX3*;G^5N-H_%EV;m`IMBTO@3$K(?{&Zw`=eIBl0E6 z;_@4%6eY?lsl!-9nQW;>L+%?=Ricm4+LQOrIMEFP6qKPGOuWVNC=uRN%Z1%Kr@QG5 zN}ST(V_OplrEa+@;BhIW{z~62jC1k<8WibeqQT%U=1D&=Fufm1+g2=#vGy@Oa5pEX ztZa+Niq7fL?3k_AKQeAP;^)q3o0XIZZ^jISwglAj2ENd+N##EOF;8G}fnsz5o^A7B zY6xqoFJ?leUJP2aJD-Nb#`<(xSA4fd24_K+(Y)1zDuq5;Pf>5f(Sh)_3gSE3%3W>D|2Fjgj> zDkLI8Wy)kwJ7h(up0Y=&l4ue7sH_$T{mu@4QBh5A-$*aCW|(g@8c#rLdinYs4X1YF zF`O)nqJPIf=CW8LU`ebw*r7DNgCauFIAHAot?7ftfj`kvG;VEBte7tvhx>{A&^YKP z@<-$FKT!Z(Q2Pis2S^Pu(BA>7qBIRy!w&Zi3N&;J_d=@}lCc052ZA60gdh?~?J=-G z2rvMnfwhAUM5F!zARdHR7p{-NY&-%1WC{G8M<5W`atQk04$!sa`Lxa01K~q3IND29spP(TOR-p zfY~m@VX^B40{=%e0D&-@2dp9d8e`RG9f%0B=DK$N9SDF2alaD(d5gz`Y%u}=uysek zLhD^+Sr4<-AixO3)*TEI*oZL1ZYzw3)^7&@!vJfme{TdPt}XKKF(OF#*$DDW$3zHb zOC}Q71rkXFHXaEJvW#>n#KU5{dPh{?-Wy0z~#0 zj!g^*!t6_p1&GLc)gTtP_GSNG1Piej!LmC8;&B8vF?g8G1(1OK-w#0G--QDSFaf3p SnHupCNLUD^q-11mjQS5q7N*_+ delta 4154 zcmZWoc|6p6_m4LFE;}Pj2AS`7_OX?13L}@tmaS|V`{aCl;5~LzvsD`&tLD?d7aNW@AH12v!1~4Jei?9lYwky;3rzo{%oskR=+ra z^V4aD)dk{AGt@jl!L~#`#+~D)S#3>nrbU0qcj{JdVITV&e?vzDecz5vlloCMhOgYm z7;cc(3#$xg3Z2h2295k!yY7cn%L;c7+htYvvKshtE+#tQ$A&GocWG5e&>}u zJmdxCSUkr~V*c#eh6G7}p~!z*>ZC(#d}mWl(k~<#o1bXu_A6htqUQEQgvT({m^92* zo_YInravuF|Eqd+VOlr!%k(U)(O_vUBfId+gA8)*=$^Ol5As~;_aBox^Lx~-=eA*= zew3(%f86_47wpsm53Gn~#ckP>%;l04^-0aWaMJSr*;u~ODM1!Ln6WamnrZxL((X$9 ze)f-MzPb`Fj!ai)m27D zZH?)DXF{rw!WbEU)34FqU2_XQEhR^hzV$61g^L_kv?xl_xpF=|P`WE%8ZlXSKC)Mf zEg^5Vpp|$z(fH5-nZ-8%Cfs_aJbt^ZE(H6_tNmq zEgeq_#H{VmU_`BGU#J6%xj!)}1A$ZQv0e6XIr`S;K3bOC&dp3KPUJmf6Hrv(=z^FH zA2NO!z}=$npJG4>Nfjb#0s^_8}yo^7=F#NgQ71_IQ#Mw`7M4aHrK z?~!TE=?p22JMWf0Ig(y8UL}ogcw@IK`f8+UXoAg0V(9+GlM2tNnR$VdLF+nPV~%f> z*D{Ansy4|IOS_x1LT+4IOI_;r#ZuKUrU&T*-6fk^Ds}Po0V#IfDC~j*sqK5(Hp$1h z{H`KpT#>nWKih?=$YF+OI%mS}hCp2T*)y`mzds4}pD)kp4M2G-npAg1q|W8UOiTIA z_$5ZWsmlf>bAIxj$ak%axxHAPTbC(z<}YXFPAw{Vyh5s;&p{C-dE$y=fvbvqq}Q9l4mJA5}ABa%!9(Kf$=HKS7cjiQ3WOjTgi+@4R;Z4{xt->aBt(WEx(*t zVy&!GQ<`pt2RqdCgN<<=>*8@M^Y(YLLPlJzmCl7ek4LIaIs@fl&8Mr13B5Y~)~#uE zF`u84xd(CI>Pdm&pKY_OAJm*9QDYu7+Qj<|%++Q_ujxl|*&?j%KO5Mb$=KJh46VVr zV0U&qHa0%?g%U1=uA7gBtCB_`clt=?-^&xQn&z6C+76h~%ifQI$@tYhK);yaTpoQv zxGa_0x{ zRg`Tw8o=fEI+Ar5M<=1DiWytERA!`bm5xGuEZnE!PO#!{Xgm-#8dmYyt5&AkG0KSg zuHXGv`fsp7CBHj=e0rE{G;VlXQJW%})ZFORA)h02q^vIe{Q5xKWN0rFuu73)2{jiW zmxs2@`=2z6@KE1QDf#ZDNm{j0lnEVOiO;{qvOm=Rz8O%OjJzWw5}-PvB+aMWH2o>K zk_b_7C>wBm?Hk=*q7AcFH06zTYL%4j)axA_$y4}%E|MR1Ce8!O<^Ugg1^fVwwif#l{JW{6^!2`CI8mMns7P*6zbUL^ z5#;0m>ZIy+O{s>J*9Xa0b?&i+mA_Ois5B+WIyCB^E{f-U6Zq!koY`EohT4Mg(afEz zz9~lIZX(?#&c>)BtnS0d=UiUT>v@N>S3#X@ZL*s=F`^}&qAjV)Dh(GfVw%Qei7>^- z>V5_QkLFEN8RrulrOph{!gn$KwDGTe^nYZZ;VOwQy=2OM$&T4*LGPppNB?GNC2&;2 z9bHpt$MmW5QLu3CkZ;4YlOosk0cn>#khB^oD{|-W;36@jb#pu5U%-qckLfq%C`g=qTuE>w0$){_@YxufdMx);Ue1njKCc`(IW4nOG<3>y7@vkBX)-U z{v(15+EqzlwYmEQFRDc8&3KH+o5Z_^LtG>BU6`FLcXc@2#O12>w?HVfx#406AFOXnS z_k6Y~)i1z4WO!Ibmcaa+SCWa&$(XIe4dIzYjWYZTWpZ$%c3PlHZlFK#B-_I%G{IBu ztVmVFHTGJU*xHAGHhx@69uec$tsi{9rf1Ws2EC`yZ#z86m`k2@{>|`6K!bM3ZxY9| z()FqpuFJ7e`F8Wtp3G&ZUm8W}dj2le?=Yq)K6+ns*3w3}RL03><=PUbdr`~srkvYm zYv+`Sx3wGjz2}aUN|#%y#Pa6F8`q92hA5Ea7yj&(scNjt5GKWa-Z)bN**9@mEO)pJ!>(}pG11+PKMu*kkX1%>P^zONP zsT-8M9ylSpm)As5j!!tH(E6g!>Sch?%Cpp|7!FUHZr2??;02~FBV)X`W6(~pODWvw z%iu14xDyO|z?A3(M4tAuQ@yyn-XF#yvMai=zF8miarfig+{cy$YS&JSl5^n76F#m* zCdN4@!=$>FS5X6msDcxyF|)*%&!ggu-iFKCgk$T~_x;VNT&0r3jj_zQgu* zQEogtJQGK(BzgB__=e?tcA-JL÷UK-6Jl`f5&QdNH3h@h}Ic{=*}`#RZq93U@y zR~8Ts0U!qej>KTArkQVZP@05NDP=;uRc^v}7$_FPyc90cVaj(AUdp~GiLxllOSvs} zn-V6*O991+6mfChs#0+k2FiVL-lGTvoWd<5P1aA80hvAxK*J(Zn=l>0BY*V8XYYEo zO+UMwa{kyYkK*DAt_0YVKdzS#G)aUiGWH9Q@AJPFDCH@%o5|y}yU}l;rq482!Ym>(~U-jjHvi|3#`C9Mw@c294E9rkks9kQw`*~EBC~I#S1IOD- zU;pJ&v5`kGq4rx%JaU7s>0-3wcBiV$2`P!01now)C=OUS|$s#k1TT$0iwdd^x@!(z?aP+HXVGc4gtzLR3X7H>Ni zizE9#lnjLi^+c=g>Cdul$;n<`)ww@M_!KVB-@&At5k7mmmlV4S_kiI1~LLFi#gxC(23WnPV(4CrA|>*>;@Zb=?;N$I&b` zW`Q|F5WoR~z=#k8_!GH65C?|SVr~!w;wN&4Ado+i2Lyroi4ICv=_p@4$pBCuX=q3S zP!^aLZEyTtygaq-{GA|j+RAVMjsQUb3!>lvlES1(AYwo?8Uh?Vh$n>QZvdju0ImAr z7=kVj17K)uf6c>yaJs)35Q(BZ)x;ge!O@g$O&Kx{pmF~79})yHhhqrb!K(k7hd=@} zU4D(BK=k1}0FFZ*<_pk9|GNkIAHxC&IFcs*F9iVv5_OmifI!oc0gwQat}&9P-r;&k zEcTxzx)_c|_wOJ7bqGKsuyoU*(eT5n0%#2Gup0mdP6p``F-Y37 z9%_O?(G$dA=%>SAv9z^3c>g1ah11yyVA1fOYLWk$h&$NdUq9mj9GxXNB>i+aG=`2e z4)>oX|LXw)h+hXe&>y5}cvx22Wgrjx4uVL!MFK$-{V{+TIx-*}4jv{5!cj;veGr4A z+iEzDz6pX}84!g9535DH2RgNAcmDsE&hcNLKn$FAk_C7jRty4*V3Cp0F~qa{AB`!Q AApigX diff --git a/plot/validation/nvprof.simple_matrixmul.l2_reads.pdf b/plot/validation/nvprof.simple_matrixmul.l2_reads.pdf index e3ad138df56b39876bc239ed88fcacf1f4fdcb92..056a99565ff4db6d293cdd2a7a7b63242bca59c5 100644 GIT binary patch delta 4399 zcmZWpc|2787f#kDq=sZ|#;#=UYNeBo)62G~jmU0u(&JeI%63o4!52&$fan3bJ7Ckb~f5;&cozx3Vv-0Dnj z?z{N8aE*^arIl6Rw%&YN=ohZl*(3+!;uhpLDnwV-q1Ud7w5)SzJmZQq>c4?f6bmb; zX*B%7sC#6~AGXVNVT!T>RER4})-*|+2|)2GXWF|hr=gpTs$b<7H~S~qWllcTF(gyP zby2UvId9By7nI+yu5rH*qF>6x`?g6SrsKX_)0cLkBI5XFQ>|U*{M&h-;k}z&!kB=Z z0&ild8(qxsoU)#sbHyNk6e2XdIu1#DfRIm8rigE0w?@iy`YHQxp9*uFJZ{x7n@ui3~pP&e4p5I9A-r?~Q)Xf8#1x&pUT(523N5@JCh*=YNb3MUJt# z1&Z+kcO$pToUc*j`#xU4irGLubL}7I3oymrI@Beb7rCIAWQ3hT8Fky#w$90CaaGcE z<5y;r4I4PY0P5wqy13~cTRB`dc9PhgxYQiLhfo?{3Y$nYJ1$(9lH0rG)RMhMZF-p0 zZ4{Ss$qXCyaxT~)*0Ni4*-yWg(8`_g2A-3I%&1ZJhf>G}^p8dzk?&%==ZCVpb5lKy z-3z|Ylv!9FnEQgiNC}eC0WI`yE%=Hws6ZpJ=lcdfM*Jr}6d@cX{!GayNadIvsaFZq zGUi`mBWNpK7HWv}L#iDfqWvhnsKOwOt6W8z3*0IWstqwwRH1?y0#Vv$k8=E&FV~MA zHhq*ykENu(c0F4pe^9Z`#Y-`_BaY~Az~e$yG$Hqi6HgB12K%klEM1VQ0C@$_5H*(Rbzq>{`VX5eh>H8$2AvsdBG z7(*QHal1`uw&~FRm2x>vWgETL=Cqq(uOlJ)Bmg2}u@?d9A?BTQykGLnMnVV&nJI&I zvG))wA(_x8?A*C&ws%sS7DpXTjbo4wl=j?ErGY#fz11VO6IPdAhDeCGm(wO)Ik!b9 z+Pfm2WHQS`yf2YrE}ZyFm(9UgEjlic*}KUx^G&e~Q?u$5#;#DMCZu}n2zBdJ=meCbc;9&1r49V9kY(zzS$ zWv5AOvLbH}#3OP^qsbU&BRzgyzTwn#;i3$?4{nC!80&T#Z4gl}P(gm0s{T=ts{2D` ziu22qwS&~*!flqcSlW1N(n`oGc`)qFOW&uD)FnhjI5XjYA)NI}lA(UEsX(1Tl{=9& zpJCg1ljApAHC0R~3>uB31AVDbV0m+uoYNjD|B0=N@V9%W_te6rT^EK@%A1pIlZcGrrdNYINbRKWtRi@TtTB1AA+-Et&hb&m2 z&>Ql=t2k9%!rw~1S)!7u?4j6r5pUD()iFv0=5S&zMzko+ZWEV$)tsY}N&6;t(`T0b z13kL>`6FqUOW_?dXX`%ZzHjD&Uh zF84DrU+U}rRlsVQ3~#G-QT6K*qRpUHp?L(EI+xtw`qWD_XY%<;I6QtR>(*-_y_3sD zs%1*?AHy1k*cW*!fqpg#QqzWer$IqLy5VYf$JY{$E`_AJ zzw=wVDte`dwzveL-jD5v^>o|Ql)8Ca;2w2u5uQipnJP7|v}4-LFllb)^Cs+};SJ-A z`|~C@KU?dm>g}??pafr^<#X^nW)b>CG@D@08z8nEW>VO}6z|WT>0g&F;!qxvB6 z8)rE0(ZF7xqj(p2aom%B+V>=&W8vg=P2uS`6Yb0Ero(@u8gj5VQgGMfGu-2eFOIHf zv5l2#UKDr#nf5bF@1F>Asi{?<^?n1(?7RdmE~H&)nwQD9^SOIAD%iTz{4kNDn5nrN zG7ZRwmFnu(gMg!H$3`-^jB$NiqK;|ZuByoEl+2=oJdY+*WmAgl*?f8O^2Ieggf;e< zOSWHLK^91mZh7LKMS?BSS*|PEod<)lvcu1E>hWd>=<}Fc6*ICzXw_s)aut}@z_F1p=(V3+uF}*@WiY8Y zfRdH%#=o#8sJ6eh=%Gd4o{dW`6aHV-Za+CmRc`{7gwD3W=LRt%`sFLnnL~V^{!k)i zvFE_$(0d^|JvwarS1D3oPjnk*d`($C3M|t0Sj_sn8Lc^CF4V`y+?f%pij?C{nm9B6 z-mbMTOW~PSgVWWU4!xWLx)W!c=Uf#vsL~X4l+r|K<1Q1sbE|f3pNR38nmp2M-Q)JO zsai5S5&lk{v|+}v75&5K^TOTLHARE+nD4`fLtDoZq5?Ff+r&1N=>zxG>3TnG!ouXQ z(K%Kxe=CJC7M1%fJbG)yG!m>Fe0|}$=Ad)y2oYW4_`JNMC*^onLXg_s55uY7=A9^T zvUy*f*W{yqRTEKJT5Qo8?S6svd*9iz1hi~oTXSuNy{d5OR*vk_ro62+XH6ogyw71$ zxTSWZ^ub!FVi97g%2{$^f{TDye{uJFs()M{H>7hpTULL;dF|f(NK3;%dbj({GpB3W zTgqdA7t77#J}dv6PRzFt{#My?f8acYaCvhydZ@DW8+o9TQWoe=A5yP(N$(iB;b-c9 zt{{KKysSl-}|W!EbT1Rrv)Qz>tT+nc;ee zy0_v(r_`j<`TbloYBEBKE5;$weZnV#M@vRX1EJ0$P7p>nw}#~37l+)e2Rz0pCIe-L zV9HAO-(H*>_hmp2RujTJ0I%Q{v zPi>Z9BL+=`Mbu8b^67yX#Er^^${6uKU)k9Qxa50Db?z_rcrcWW zsob$MiTqQmG@Wa>#W!}Zl@F*MTfc|NaVSalv)!v~y1ODGxh=RuIf^5_O=smyS%GwS za%W~&Rr9n?N{^PfPk(n9TBsr! z!$vQMLFq$>2k58aP!j0?c@Py6ndvT0F42D@+iM0186L`Qw)Fx z39JPP01FQTfXpTTAqLnU_)ZK90KdiEFAQWEDu5+`EIbgs?St*}|0w_< ze%ncZVmJWD^w|y!LjhPugTvxjZigcfSa^8Uwn_gih)3g@zTbgiF-+6%z`&nQ007Ew zs__H@%S{OA?VsD9$pj3mKms1v$wLCnE$_%h0vPNcE7_hu62N13szw3?3=4)tvYL?j zUG4M-5=p?YU??PE=Sq+$G{&B#2?`6axF3sT86gsjWo?2buzuJe7DI2;)7=Z=01zT8 Jt7D`K`5$cFBKrUU delta 4887 zcmZWocRZE<`%Xr-ga~mYA$0aaMn>Xr3MC_zl9_d~GM>sFStn#9dmOTpLS;MlC`B0u zAsNS>)$gd^>-+k|^T&Da*LAM@xvuNJ-|wX`x~EZe#mRJlnGW%51^3&oqERSTfDJKx z9?O;Wq=oMbX-O>Buhd+}2QRjOmHV%@9o$zLw1>yjbE=-nDI$s4ZCaIJw4c_TwpqoX-PTnZ_ZR!=)e{Yp@Aqu!Ig z`HM&e{-7UUhNi%7Mu3}|h`BeqbY<+zkaFON&*D-C(40isY~xxO-r3n{{<#nYaDr3j zeyv!nXX%@Uzc@3culcgmFqF6ArSFstb!BYxD+WbJ3K1K8ZEIM#idB|8oW0=uw`;fL||R zHe4T7I5<-?flHOdW~em9&A@-<|8mJJF+UA)T{vIkZu!_=>Px{TcB=J&u1QxK$EO4V zGL~#YkFSE`B-MrJ6|ZaauUK^_os%u;zqD`-xZoFw|B~tXDCt!r-lR)R?mb{Qm3NYy zsDkgilG;ddUWzJ=XQ%g25#PMXd4EA&XlH<|GMbM@eVjJ9ku~My@2(Hs&?p&n6 z(8GL@qg;o$M~WGYgmlC_z)5ztg7$t2F|uowCnetH?aa;K4r= z6?Q@CgmW9!P4a2+x;A5PIF34#nP*Ks3h*t}C9t{K6ptw$9F%os)wlFo?DCy$Q# zlAX>7Oqiv)*)w~Gak_qQ4|?llBh-0Oy_ErL<&r1;DOi1#K@6z5uB3Q}#eK*$=skKf zDjucuAwY&)P)m3+imJRrrGgD*LZ2BNb?FNCXP=4~omS8kd`k|lE{CR{F@uf02JVrV zsdTU5)wmL^cQfSklPSkkPZm2Zq*-NBlEPbbgv4j+zQJ!@`hEY-No;$qrXw5ovl|ny zJN06O5B_4FW}ksPDn8Imhq#$3HenmC?ZNcY$*IM-`mr{FUg73R0-tBRK^^=v%b`1= ze)+s9wR)Hn6z42n2}QqWP8M*Lup_`OCUkBoW%a4mMYi+p5}(oo6>mMWMr0k$oYQt= zQn-0LdcY+As>2OQGmffH=XMn!`fuuvlf=-)Jentrbrx*}IA;B!3%9T7%hI5=?5jI@ zyY%qa4SEXlIy~DPB_xt<{1VeP@!)Hz!YdVd0XaP(rPee|@F12=EmDE19 z+)EU6rX}9!Q$b3{34LZc|CMBO6SQ(wGW0?fRPe&UF*D{*>T$U|DJpWave{$y6`^bo z*T$*^0CZuj^y0C(#^@}Qi8PZf=CwNN0?KRQ!*3ZT4w!gr zdO4e~=}5TxCG?mJ=VfVpCuK$60JAei0del6NLu9CquiA-BJOHl&N!)n4@2F7Yh#Sa zNB9zJnzsgei_7d-n@*aK`fbS#L0ugh&CzA8(Z)h>Fib5|m~E<9-OJhOaJI>5+4trc z3d@58+kPRMp*iN-#@twRIe(;-x_xQlD($Pm<|3Q;93PsYhJ)102;c|DTthVAG2GVe zXJZYH@K#O}{!~Eg$brhWKiddj8xz5+%4Ih1W+A!+=Wd~pZ(+c>gMzGN#ixRWL4+-(V($+g!yG>o8!p!+#98*B$L?Y%e$lyUn{BK#6LcaKh*p# z^{$8e=hHP?+ihHcsKaoL1`=uZvyo$=Xk^~4x+=)_a6+#Yr-S>fL}U--hcb2m8)!MMr3T>|I9ANc81-{jyB&b&+gkI(2x z5vjK_!rceUre3TuTDIsOFM2O~EXkBpjKQsuQ*~N@Lp&oKI4HT1midivp@a%k*PQ%J zIF*PIj=eY|P~(pZP=@tZhn-5zUA0tAJnwRawYThBhIFEtdi3`>F;l)l-d~@!WP)9x zL{p{Yw|CSEs@z?@1{yxnANc^$G)&W_M243i@KXt`b9KR+XtmJGJpqg2dm2JyiXowwXlABd3q?Tc=(fGwX$Q}`oJfN_B*mm1f0Y7kKBDYZEB|si;>;fSD6Nv z&x?l#d3}xhY$UtYC13rZzRvaw_o^x4+c4{U-Td~9g*@;vVZxJ0NYF4IUP+scE73(b zhlZ>uMkv6~CDnoL<>!P&((_f*LKml(J-;fP9tK5T^d94o&=IP6L77U7oPE)ux}H6` z*3G0&PBbBOv|;cAkv{#gh-&2Pi_qf@U|$j19Iy93Xh>6^4`{74&Eu4jH>4iDB{$_I z+FIW81rYSDpbHru6u#$Z>%etaWq4Wtm`iMI41-T;%-1P3uYm~*SjmJl*@U)C%Z{99 zofAsJByzn;`2+aZTjgoVcNLW?SKnS4CFq`CL*iA|C8kzK0}W)J+b{9P@i8f3lB8tS z;;qMsD#7HS*n704#3PS8t6yGmse1U_PO)|Xcs$%sC3H-gdsX?&&?Yo#G@Yu|vIWF&2iNLzY^LQU^1N5bhM+r?>(=n#f?=L(9yS6n~9Pn z5E~gbS+3iQhqhHKq6nJ&Y|FLkW=oZlY(H$|syA-w1V23PZjd!P(!bR7+8Z0W!V1{t zjeu3}hcQU$YdIhLu0hPWZ6D68dTUeO&R}DFEZiQJ)^8iH8es-y z^7DzzcUghk$(F-~?rsSe>j(mm6iV6-86XxnyROy<88m0(Y9qs-qKD5?%RZCU*MgG4 zVui8aei5z5I@ay4WvV$S{Sg3ULGdBpCo8TStGz#=q<-1QOerCq%&q9dYN@y&Q7*j` z?R=#=k9mT2qI8)WO;>83e`!0V6zqpdXfi0(O?U+8g!fRxU*V%NvSUUscx<24RZeg> z9&Bng!f)fi??r1>&(|>dtUX|A{=Sr~KseFi-g8+ATPDExTyNY@x*7z)9p7EQ!CSRx zS(x$reeG%grA`E)#9bM?TlDc?F1%%qiXY4fy#3rvlfbXjOhH)upEi+UhWO|dlrj}H=}$# z>?D3rcBuWmE%lp}+0Cr@>FR31bMWRIKNt?8dIlBmL4*~*HOHpQaFOZ1bJ_^BS17y;aJ9c zU;ewsN(-){rAB!mWtdw1tdVf>h`D%`TRSf$zv9)#Q_iB>q z&r(vX*mj?u=gVWMZtsItESlBpkvz_d9A_!|k?#Q(9)fbtxrB|eZ$W5LM+Da;!8%D< zrWVSr!04rot@z|QcDo}RKWD{dV-Xwi7oBDsQnE)s0zkE|=g+Olnu;K>`)~TE*F;Q+ zw(LR-a2Ntn7Q!yaMWjC-N2Ch#locG0p(7?7=OH!=N)X*dUlEl=d59_?AEF$Hhu95@ zB-YU0E@IZi>tZ}*b7Bf~#6dBhgK#)e6(kNI^>rd7pv+%BNqawxt3#7LhjTxMrEEJk zPu3VeQDF$bOCpsVehjY4buIo>C+a18u%91D;A`hEIFfU7DxJ$bvgonAla-c-9AjLO z3`^WAG28kpWg>~XYA)>^(f;i7-e<{=eK{@A)k4v;0tJvULKBxvW*TYi2Q*!pM>hxX zZwr#SayeO4wN_AV?TCJcV(lk+ouRpz3a1s{uo$O>#N3Bxjhi$HcT4l-)z|eQj4cK2 zuWU+I({YB&{ib8tcG8O%(CQ)E-^z?|Pa=|W>TlWoIawUXUDE3dPOTsSho$Wu#amAp z7kKP+kNPvm-w|TVqqGL=w6T;bm8@UI>4{rAOkXVr2VZGMsx2o+Wvjt{KMK$u6w5UN6Prb zWezfxwnFR@V1GUDgooTIA? z2o3>3K>8rC6$lRfTUmqPu)mcJ2oC>S*@57Qzm+`*j{IBQ1;J6f73&`rVk%aFgY}O- z;%^gi8mqv@3U&h_uzwoL&d5JwqJt3IRa9xeoT|qTK`>Ax41yq@QN>xIA#fUP_W?P8 zSpO7IBm#}zhan;Rcqr_yUv97!9}zdR%ay_W}pqoI3w5I6?&M+5jzRtN%u*lUJ{p!b@AAmEt&Jml^s{xJhV zU{U)pBmzU*;{SLwecFS>K>iE`DuS41e#q7t>FxtKQXM|7~lyNl^Bq1%HaKp!e&vlDX?SmI{x7@*9uZXMmSOFvtcE>y9UvVUwSF{~~TH5rXZ|f(ePL^%b zhPo%-lul3_G>Hlho&s%B0pTH^&$ujLmNI@>VnTvQ8Dy1R?Qa%?od(CQBdeH`T}ru= zR(=M7dBRTS)y8qEGU?{)6OjNFiT)s;V?R=d@nwQBL7AER`r#{6?&NnX{Dk>S5`G_=;^rr-i54DhewWSPT}95GHbp*9!-x+*UN;d3 z=hl)ZMLDlnLG)7*eAF4A8Zpe7d7RwWt?S=IpCrB%FUX@nri`BAF$`uKy z&GX^kGaLv1j@&yunv)*+k;BHW!*3UoyoX1mdwTDRn#qsAg> zoc7#isfP>0a^Qb`yu)ps>3;qTuNtP!`Y^E(aaZ>3em4k=va)oMJhK;2^VH@WtOvqJ zo;$SlGPo7hTu5<84AV_An0%w_FUy;<=HIxjykB#5&!6tADS%cNr2_0z;*>mhUsw}c z;fm3?#OPDLXxyP^;h1H-{0n1-zOu5)kQgz}yBIJ${6WGMUn3q>ZsqpW?WT0#@PmfO zPrtSWMv)<^#<4bxZini=9wQ&IB->+SCIXz7tlFq+oNrUli(CsSlSocGMT;4?f1RY# z?uxzR9iB`(SW{^0h&oC@u*zfpIlV~j+^TTh?0iQ=Ap zN5cf7{YtL!dT6F2X}?+MUmPpx5ylqU*iezs@p)YfhX)S)$9i)PU7ro03TP4^+H$Is zNktzh#N4sG&Qlk|J+m_Tyu%)ZN_>r_wIGg=#xLolP<`*7y=7EYNu>whsh&dJR1k9r zt2?}gGm`BrQD;h?p_jsuhsmTYwOy)Rei2W3%KJ47PY_%+x?7Tj3(J$_z7A+Rs^^wK zb{ZPKt3B>d3puejy=gE1JjgRhPHcQ`%mbSbHW`>z<%_>6=0PVVQs}8+wfq~6hPCk_ zWDU-De`wEeHe`ZR#qK3`INL%8r@qgWmLzlFtX)wv?Ro4Z#`Ml6Ek9=feXCb&qf6S6 zYLzwC7Wd{lVA$MvkN4vJj*RDt(vIp-izC1*$}#6^)Z?5Pbv`Nu{($zYAY3rh8V&z} zJ}+k9uF=vPrcz*w7VT#)x5~hKJ*aivErN3$$plVaSLT3p(TCyW#TApx5q&_u`f!tikb0;hJqGVABuq(<_{Xv?I#Rp z>;c}4rNn`Pkqfk0mxJGU#=)D^Na~Hy)cMPCYgDlw!z9Sta3x11GA1$%6I{f2_C``Y zCZ)-oej?a<&b-=`0kzZNNuTAs)M~W2UMuXIj+v0fFrq^n?hDmsdMqWDz-=dHYOdSK*>)EYVNG*vBJYVe{Q zmMM_v^_k(_edDP8*{_aDF3IzxfU41#if_l9hl2e(N-MXH(20sWyB)sbgE z9d~fMGTBaKe4i+|GVkO*_E0}NgjABElB-%AJ`THJT7z2M8^Dt*0()Zg)xAz7dT@7= zwv-T3d^~fvrl(fqi;e2Isr{}=0%=Q!g;F?TG;v*54t0cXPOOi+ohbD``B(h?5rc{> zE2)G(etc`G9CnC;ESNo9veilloI11?rWwI)hm!3_M_cq1E;(}@nAvb>c+pg5Fedh> z?5n({xz^om&g!58S%sTZ6s(>sdB)EbudfnP)bXV)eh4c=)zMm-mI4*0ZS-0*9H*jl zbSGZ(n$9Vo7Ry^IGEp249UN+ze=m{9m)p1c{?lJCn6x`;P)9GH0ZQZaW1EhmiwtKB zFawJ7&tRwTQaK?d6%~VoobS`0nQMKGA7ZV{zyBI^7hAxn^T{}WIbNXJ^7h5yGjUdX zFl&bJ&l%YraQ)5Ynm+o6u2tGy;Tqoz&|z*Z+lE>S52k*(rDK|tv7*5?;NFM^bbCc7=c-5 zzgNDdoH8nR5^bW8F89cUuSo75yc?nasriaiy>Z}P6>L#EXw94?;JF%k&?3u*T<>90 zDKCcX8-lQj!Uem0vVvQ4hBZ}P6I}AK5hQc%Ccbk1Bp3Q;XXv^_xuUAAubE5Tty`6q z!ahF?;wIBCh}dr4C^-f$C+l0%Xkxso^pMWLMuDIKZM#Iij793MPpc`WF?dFgPrJ65 zp4RtNYz3()pX@EDeU!5`DZf8@Q2Pe=oqp35n}_%I4(AV*y>5HHV4B>^Cn2kF>FUU$ zv7auDBXj@bx!+5kPtedeq;oQL+FtFe9weYM6vsL z5C95*)$3v(@mOzVpb0E591uFE&xMB|(8R3|n2s+3gZc@dL12KNkskuHWg9Cs0D-~$ zjLsr3*q>2gy{WE09|u6GJ#+{OsKIoMS%!!CoeMJY3L_!*o9LoZXbb=VH~@jLFydA-+c6vl$0FW=VF9*U92UbGX1pwU~azHJf!jbW1?09n*K^FWZz5&$H!?>r7f UqbSv8W-uHc4^vh)wKaqN7Y=`TivR!s delta 3842 zcmZWoc|4SB8$L45$t3%h8H^ZNlJ}i`MA;(QNyLfJWS1OU-$iNMzd|jvQ?zBK{kdy$XdRp&ii-t zi>Ij?cN^yX=ngjW7SmCO&y4dpeMG7JXz%uo-ofGJ=8sGD{+s(6hpz~K8}>gIq}8=P zJ-b{PJmmm@5o2tFe?vnJ}yrK`;D%^C~ROGBY zmA#Q7*Oe$`EfyH3btpqx}b%H{`V2`kapHexvtNE`xZDHh%$}ou-0GT{9KCVYoctb2iY;M1z)D;lQgK@vv>DQ3`KQR zI~*7|S<_h$Esm4y9#9uo#F2XGpZ+2(IU_8_to^aR;0wW`(qi)Ht`lk0o#d^*z@ z&u0~DGaOV>6KL4=tJ~rHPNA=AjY_!i^*$m-JSO-P=~t&F|H5b#LodDh6M5NkVsm7j zXR`wf^~l`#Ct0igw39Q>$*f1cLnT{|&2W)2Z~BMV_`aA2Z)Rtu9%q$ZFhF-Z^qIjP zZbL~7-PRtRMUA{x;Y`8o!}S)~3Z%@O9L67JBk!Jq<~<5QsYz7sGF^vHX7Cg@wVK&n z2}(zqgat!9P$rtQE&l>}KwQg&yDXwL=7Qs1NUHam;A-wci~cehzb^4q5`s&%EIfJX zvNhey%k-A)dDoV2N~Sfd)pC@6O?vU<5mWU%b8^*1%kJ!4YV@_bELDyRRS)HCg01P5 z?oy{Ld=JABokoD)8q6)>7RNoMqZh^8ru6jBJ~n?0i#Kq^WE&yTZMRyv{kBhSPLxEa zK?_?$ZHAEF4JdT;vvG6i(d4n7$;R~XIm0ntqvX1$DjC+&FSlEI?38N7jJB>^@$m^= zV%zM>+Gtzqjph+IJDU-J=S`N6AolC#;>Kg%!4Pc`hSw(6>&?cGDmn#d-%dY)mG&w9 z?8k*QjNuJ<(QbTq{|!e$h3u3R`zC$`4JZNGcoMCL@BonaOl0<+iqp(jFkfU?i99#! zU*)W3c!g`u@ixu9bFGTtOsFtsSTG7>38HbPa=EvLX>Awm;E7T@o5;R8 z)9&~f{_aK`Opd=Py2yP4HgG;5Q1i>Mh?BSN^@KbOw~KHOkLkvIca2Fkzk>b6Ug)7^ zkpO=gGTm!j+OWbj_{!}WlB=ll+vd0K6F#lCVi+&g2K@moA5x^x7oiIZkoYF19AC;liC=JBX*1DZ?T z`&Hm_O2*z}65A)Y%Dn+|pE|gGEHrcOdCjn9tQWj;w0OLmsrSy1#SFqbK12mLhurI;o2Ll6CF>P04X1*#!VDT@b4&8-vZpY@kX47E> zktsfSQzK?~*3ONWn~xqihyI*Y1n8{zCsI?cX|29)XZs!PEfaq&oG0L`_Pww2jf?aa zojv3tBs%agHKfqrBK3UEYWJXepQUtxE_+d?OnRo2Os7Tb;D^G+IeA#s2lWNTSQGe^ zWS3(erpO~_{|)KC2ObK!5&tF^l{?0wLr(@vvvPx|p9{!xA#DgByjM=|@v$ygj z(5OpKRV=gWHRnao95*#a)GHdx2Uc9B8x1>2+Q@Y+-X2ne97+;EdN_V<%A_mhq{-DP`*XFhsug1=dFk9UgT!SB?%A!NO#XQ1*X8f<=hD#Y*itlWK= zxFyHvg~h%{Di|?KD2>*5`TA40flkeZbziX5)5b&M&})ZkuE*CMo!6F(9g%pRBNvO} zm4YZ$q@IC1i}7ITfrquvB*6^=(Vj;xF82KL zR#CX!+n6WV`Zei~akKiVMD0_zGj%p7h6s!WLLgoYf;e2LwsGml@Bi~?Rxy=;n$c67 zSCVA6wc~Dj_BCob#Ju8R`MJidM-QXE9iN`jc{zP#ndVg4xc*P2o$beS!@=*S8sx?e zt&fjp#OP029)CZHZhBH;d{=0da$Pr;PhSyj$YyDI>E&F*f`(je@rSaAdynj|PspcK zgl?q9xGO+hb_YHs<0U>CxiyZ-&)VnJ*2{9;j4G&XlF5nB*KPLgjLl%E9m*;Ut+7hC zd;B=SZX~mE+$?Y!Ws7QL^pzwBwJ6Hz4VjhlGz zm%Vlm#*|Cqg#)AH-<#R9_a2NKR<&ZFJA^|cVQ$hQS88FGw%4DHy3fsQJX(&h9NPY6 zI?UFX=FW*^2TAyWydu8?8Z9fc8ZQO}isjOFJ(nKyyHZ(2NlR zw0T54ZG?H!ki^b5oL=QrEGyOO;xOcEuJ~%At39;kqaN)b? z%=JspJ~y4EtHWYEii;oc#ws@y`c?EkkqkP#r$+=z5qc$3%3okVk&bI6LcvP0CXs?(7 z9M4N%)w(=b$~HK?&|@>2?{g<4T#~!R$85l`Bdrn$aD@UGrhSyq1Jp>Q^>OCfY z)vB{&x|rJ2T`~|9&~;sPLw}NMMNanY?1_{w2J?~1sye4*ydI46dGM@n2ipF%yvgHh z6F5>ulxPza!Q)WMJ}v<$JOQOladC3?MJao_cv5J*SQ!pD2!N<6GPaq+;F6y=3PVBx zC^M9@6ADA#iQvl4C@k;;b3tLjACVghi}?|`qp;W?kp~Kk`w{KruF@vE{>BE-?yGAs z_c`?gdjS%N#{f7yjbFn6B7#H|u(ME}DERjP#DjP|D@J1B5eOKjil6Ti2t<~<1Ol0` zi^se~_|BsZX-KgXuvi+KCK@7u%vyfVhb0i0KJ3D91ZJf_V;C}?w4pMKl@9`UsiGQV0kD;7^6@_zwcOoe#CM{%0Ww zkaoV?E{x1N{2<68W(V3W5Cq9PU;5{K7?23<9ykz#WqkyL`)O!)vVhE~-rY|Sk0r8V jL?X)@%uoE^mT>>SVgiv!q?Mh}fs?T~I2wJzSO@+e(3A>+ diff --git a/plot/validation/nvprof.simple_matrixmul.l2_writes.pdf b/plot/validation/nvprof.simple_matrixmul.l2_writes.pdf index 76880ab6a3f691dd33605ea0507e7f120ae5963b..b63e1c86849744b682d9ccf51cc63a2644559b71 100644 GIT binary patch delta 4097 zcmZWoc|4SR`_7&{#*$?i+mOOMGtX>x$xg_Y?E4;BG9q(cOZLd4vO7Q9*eYujQtH?u zAxlmn9m*CWrjzoH`h0%xIp+JvJontsb>H`OUH9{SoW@X;!cbqxKz1;L-rVQw+_Kxq z6;EJ!NmVyKTbd)@E*(%CYPf(1wjEW)7?zl?_&QXi+8b)FQ zK~Y51CS_$8Z#Su>5evrd2DVjo-Mz?$`4QK3yG(-oWHxCeJ#%w6^d+?g9dlG%U@P`Ai6hGkE(GM9>3gtNr#If&(3-+DA zQ(AESxZJs$QblHgU;JN|#fea)0+H*W6xWN&+$T3tQXg_7FKgK}&S{DkOr&3_4NOHc z59^a1z*ZP}?eg7xZd9U2{Lyt*jT+I@T+3JPWL!E?WR5k-WAm?f^Vx;0O5N}WSVo`X?X=hC6M_4!-s8jtjl809^W!CM zHrS+wPmZRVEj8GWn>p7!`dA7yq-u)*%9Fd1$d9>5a(hX60p7~?>Avk9Z$WOImj317 z74PraA;CRgSKnvb%^*!?%Jxq@wI~(9dZoVqb7fQV{mSON)aIajul$%SQhvxCqkYDV zuiw(tPp~JEgj0vlrhjX5QOz>6WK&+`;B`}SsecUJ8J>=+-MV4upXe(2$vQO$g=`wv zF}8e%C7+mPN)wo`e;{D%mtLjb{rw|X;X;W)&dXiJ#F4(`*NJazRJk7^uMwK7q<9O5 zJB{4_Haia2R}(yE*zbC#(j)FolQgbAja4w!x_rJiwK>UF@7aI0jsOE!g7OR$kJ*Jz zJE%(c?e5klZ5hi;mM(2qQEMdD{?CT@(wzF~((b>=qFHKF0g_SowIhyY>h)=y`6VxVJnaetyB1x1d&MATbB%WixdWS~&BH4;pQF_qnt1y6y_2@FiAAO*{k+Mz|zb z-5~lv!PSDReL^l4*N^KcyBhSg?K&sP_OYz?9an2}B_0iBPz$x@5pLUh@Ee?{V;)mJ zOnXHZS-2&||Aq-!t#B#iMNUEwt%H@I^*jNWy=x+t$m-r=J@=PE~sFTXrEh zK+M28n%s3Y(7zta@FfEwTrcBMMQ(R3O)lfO?3u~+u4DT_99$%^MGE@a&aj~`;X*d2 zLVupM{ye)oGp(J$DO+gL-NL3`#z$gn#`rl)j%OJ+FWy{wRkWGnaI=XWEr<0r??rR# z>PBwI?SwnBX3onQU(tzH)lbT>^lo5X`^cJy@tA=q7eVKo0B4f_R?`$W?9(mVM zF5Fz^@K?+$S=$7+_?_^&>U!m@9?mJq$bQY~LzG_h!>6TB^QJ^pe9b?=wHvF;Xz(&? zLmpV&@j~wDu^SQ|J?^%noYP}66y|$sD=ceEc?M;8w8Y8j5dffg#+%yHP)bM(0VR0n z?;1sMnPw_)ipSM4SLc~fAfzb84Z zXHb?LkWcb{qDZ>HgYCIM6@vz|ZAK_j3PEgXr>eqETLMs>pMQ&A`qih|H}tM?gS>KQ z&sO1Do+|nv?Z-TmTNkoT#x&af&E{rveRG)vKCi3i-j5G+)b&bl$@4XXj!_zR1k*Hc@y5~ zbPZ4GSKCsPKD+Q;l>1(w^TYAd=9I=d}bo1MN1f;*;VWmV(%| zurZ%nTEm#LbR)z!?W?Nc=8QGSpJ~~J1=29EWwU6kfPPEr4iIP7ayWmEQ^%d4)hxtF|ug$!%mL-!u)*&8j-9LTd!QiM{R0*?TYVAxeY>rMd?SKu^lVn`X9zCJEeuqYzWaRb zURl2m<$-WaK+4M~yXq3XTTVwd%aEwR-J3aLdd~-?R$dCK9(&%fMaV(wg;XS8C0Iwr z{tY;3<**H~&;MXavBkZ35jw>4sqOWYV)R2!%gzp|FrK9R8(7Y_3*&oBG7tNE;*WCABoSkT zq$9mjNIW9p(M|)>DNJ%}om(cvV6UOgg+`S5)ES+Nr)J4Qi^OB54JKd7dQBnH6-Di7 z4BL73xs%vFQmEBJnzx%Xj_SIg*hy-R=Gob}J3TN~zh(lgy=P3iJ?k2<#Qkt>S}c4{ z<+XDx=RG+)nchGb_3-w|jpFq7NcKpTcbm}(gn`;-Ee=ej(NT<6=LO%df^wfL|G+b5 zs&}ql@f!l`Nu2d&-wlGvAM!fm-zs&wMMe%F1$oRR9h8T6d1THlQuXKeI%XMy*FdT@ z{M9V7VuI68x8_UaAwpP7QmRL>%-TGShEqq9XR7=H)eKEFn zRD37XJ0@{ZzHV8%9p;!KS<=-WMOKQ~2tvg4e6ENnDJgCR5>K2>WHjNc>@D972`KOG zGKkZa|GSM@Wq)^Ne5=-OvK1{+%(KsC;}hxzrErNeVF_SU71s(k0~(UzktLg1np|W7 zj{f;U;L&5KTRyAMr=})t>3b_Sea1>yL|yywDWy?h{mAiz;_{&j zzB;O$$y`3H_vP%vMw#*Av8q>(2A1_Q2H6Bf;c>|mD?*zqTUEPMky$dcbBZ$w5)wl~ zkmxPwg*dE9T`Zdc>V(6fauGR*72XP&Nx-0&@T<@|?dd9!3HeLHAVx`Fs0{&Yx+fj(QA~=2q0Ei{v=yEuK4u?ZwX$<~divv(} zwKxEchw62ROehQ*TGf#wqX3$Qf7f9EkggY+h`}Gmp+Ou?;lFD^fN&Ux!q9W0O-Ivw zLZQh25R1ZK2#3j_Fj(Lx@LxDQnjS~M9~O%OF?0r?Kmz)oq-pbM2TR+*Atfj*h&${8 zigq9AaySh7@HS95ERN0v9FZpB=d}OdE((v5$I-s5gY%ChNTdfs4X0sbJONMV1%c+! z;SK}`650uwnpQO)=;?D{a99pGIX!cIj{gEV-*;I6 delta 4415 zcmZWoc|4R2+fB+&mKkw0^L}5<{l{Ft^SiEd&UM}Are0@ymcaBdiwUqZ2@t8{YkXC4$A!cj z?;9>IEo*9+Y@b?CB0|7-aY^?}+|s8=|JZq&JuVvlqMZ0b;kjgQ;PoSOEFVnrR$$Ni zMrjJF_Sud*>+9WH;|Wuzeg#D$UvF_nP9%oz9B!uRfoap5FLGAs82*aQg|Mg{|p7ra8<-rclwi<^NQM>tfbu~v0y?v&T6HvEOAI~E79Q=^stnI3e!t5 z!@#$^e&En(-x+@McAXq+4VI4-gO`~lQ=%Oj1wPenAphl%V6`qc)z;ZWUHP*z-zP^h zQu)2F)&P+z$|3lyAF&;(XfGjmWrj7Ey<){e^=&nzJ9&8>m1`}RB7!!-{%@-wsw44 z7b9C;aS*_mpfkcHCUeRowzaE8FImr(E0SBdDpRJpu7Jz|sOBKRh%)`sYn$J1hjz#S(}3pBT9$~MoOeNN(<#|At=NaCvfvo0)o;%(Zs_0OARcNoA7lWyL4d<|9lK(ice@%`1L^MED+ za%$@)R5TTtC}pD0qukJV%&p6nRrUbR?9h$e+T{ir+NhI-af>-fo#hC>yH9KT;#F?B zSg8-l#7kV?7B`0}KEP(f2AhLu$imm-&L0Fk!qgNzgoQyrkLA(t-0l~UFMiXWW1<<-^5YAkZZ>BsbF@>mrkE-s~Xq9`#J(|CyF$e#VTNVE1^x4;7c@eQl%_O!jahLG0fhDu*wIKW z!;VcdVeELmc_F7E4?o)Eaf{P{Ql{kZ={-dok(Bd-Pf{Jjn=d0G{j$GI|HX3@{p0QByHqlG?mz3gEgr`-Hzte0*j zIkMr%nPUtZp;p${BaEFcLQHjq$L)7jc=b&+oiJ0>Db2|1tDz^rT&?k0`nTh~4PX(&j!hwfLn%HS+1! zGEYg6^0s^u_T!gpE$X;KAx&u23n4XSITD3u*o$W?kH4u@s6WZU2B!b*q-c6u)MD9o z%IDTr`QTSeRY0rNwR`{`JTFL&`Ft>L#MF|aubRLoWOnV+RB5ZY81c2UjV%!0nvC** zt{?Qel3Zg)&@rD{aMlCcM%ShWpBWg50xwmndKkzXo+NQN9=VIU)1y3VrLiTgLZ{w+L_?Sk->wZk5<879u~XAB$i!xbWV^$y`=6aVkuZm=V{14 zZZ}GoBb%wdG_4^ua#iq%SVt>jY7bQKP@i}&LR+E>tkXI@8m|q^83B;d+umq%h2q$Y zUxRwG$SJkvZ2^1^qROXaZ={25ubgLnWskX(IO!ZGTwY{fhzUM;ultUjxH3aylA#eT zE51?;b%>#}reu}dczQEAh$~+n8G&Ez^fym^_mKLN%I?)n`adcI#13 z-Vm4?YH@i_86^NNoXhi*4ZR)7{K|4APMUK;2oVWu(tIq*l%UvERbl3t9sa(Qz>>h| zsw9n-`}O4?4h~v==jNhk?eYz3D|1hv50fU@uBCVa(PahhkM<*kBSeT{=%~7o+Hfi1sVW^J53+t~NIe6r8dNF=uTec(QIh@c85Gbf8 zcdIK{ffpIlc?Syy&mBl_Nz{GYc&SU;;-8b@Ds$oPc_IdDMCT(aAv&PJztfF88*fF7 z&m$s z(Thk4RyyL}97_UWqh2%lxwjSrB%gRyn8F?0JT|)@U(wUv(f#E8OX}6Qir)70_c}gy z-E#R8bh&A$pN4E_?@ZX**y!gx;cW+1`3WmPF&X{wOYpO|19bKB%?R#88v>1r03pAG zIxeyr`E|s%Np98qT{zfnO^5H>DULFKuUbo)lkqc0-85$Tdy7mH*|HKHErM^FKO`0e znmrqmHR&-8UxoUXrA}oYN=O)QDIjzG z663g(G}m_Y>9>iBj}2EP{*>1CfWsQpl~0|I6YYWLI%e^{V(8Ch~+hh#84Q zkRei10Kq6(8qW5XE*Bn^-iqxG`RD#nLiV;_=Z8jXiu$2MpOTVF-Xz5*_k$kNTgAgp z9UK+_GLH2Ll<^lif6P7NkW!JPnlGYPq~GzWHi1s zU+cdfnULoHCG&oiigkx>fKOGalJ+J6cDSpouZ~!`k!x(eHf%dx=q0yefYrXfJzIU= zn39}jtlh*F!^7@3L&|L{lUYXrzN_0i$}W_{UqD_a$79%Ty9;v`PxZtb^|Du~R&H9I zHInl1owKS@jJNDWYprF?=9NIBAc_IRU=#6WAn*_vgutPA5rTy6RuDxG2onCg zB0`Xe-<1~xiTqu8Ly)N7l@A1o{$1^=RBPbStV}TSWi<^6m>h?q7Rk*ykm%>D?HuR^ zfodxuU73n18JU|Ghs914Hlg zjzOasFZ_?aVT{?mH-lmD`|bn7BC-4Wuo(ECrU3>w7Rw0j9uX`azRwH}hGJy%Pct|q zBieg0ENf8VFZh2LcJ~YYlMs%_?!6Wq!FcxeF;cuYM{ooRz2`ETFgSXDxZr3Me((9= cX#D@EZWg+pB|&p}R~ zhQf4xTy3B*84J9Hql+CBrfFek;{ini2-;9_afpqxHHe7)t%AIZGaiZ}AixYXEYI0k z;h~rxk8<941s%Kv-Ucf41E6Vv$J@9&L($+P1g2weVeQ~-2NnML*4@QQ#|95I2AY+X z2UxN3#zSFBP5=wiKkqU>?@G|Kzp;bZ-2sqqkb8Fav}-EAzKAl}(8 zjZ|Fd6sRBe-Fk23e(mMj`(c_-Q0e`l=DcA%(!I7sN3wz`FX!gYWxTw4&ZX~#ZCGow zrOmYQ1-%~3&|0SWSGSFi_{wjKZ-`eOS1;M$H!|@uH|?G&bepdA|BftbKk}mUU|Gl|6z2OJ%W0#`hRSZ{e&%+HV#7SJp_=)8O6KMo0yAjdv zGvwJWwI*AtUy3_<%DWIkst0RzZ8(%QB`oVEb@z4o<8&VTm+ayF(t)ufza2!1n%5vq> z#{9xWtAvq}hB2x5XiK{PbAMhuw}qs2K7UsePIU!waLP$Jp5|Z!rjUP`yHSWU`9TjZ zMCBpbgB&4N#e3tnlmRtO3J9(0qNi8O*{-jNQ**dQY$(dOO~lLC&b7V@M_HUN&Sk4s zYm^iWe-gmMeu7`&Nvx0!G_+w+T@uE0bFIM8G!+#^>V*+ zuQK#%>?>sIK0-#KZ)<&fNX;i*^_IWWJCm|liRbGhIHl=*CI6>ZJ&(yWXs_$Xs_K## zm&T_jBp&HLtLm0X8DuV;IKnL@rrFa*rcnhTIO>u!l@7a4LPc>(;euIhN=u{kmWLNv zXl?kl679|$$y-*EzfsH1xi-L(z3-aZlz;peCRbA4FfUs3Dvoz*5PiktA04z;jpUST z@ke?2XndzJnXc5cfxoJVI(Pd_@H8|BEoE=Z(xLn2aJq+_D!@)V*QbNZV` zK}OW|716J2Q(K$wTA6%kr{5oir$u4Ta^qemSe5t^rRE7MpGf!2yCFHdACGOLya%jsG3pHCK7~H~26C2;~uz!)Jry36~kmr7rwzce(b&9#5h?1Xin8PRf~Q2U>-T`O1&KFFrzKDkAm~P@<#Rz z>*w-Jw6+%#&(9>UG(_M{)bz&QzlwFo>UrFORNYp~E6(vfs3g)k6cQ~wVjyuZ-$XR? zYM&O|TxUr5Zhbzp`Ve!hbfMVt;gX!SD~av_;d(=ui^3dFmQD@F7R-3iL!Ro^#w;zK z3e!P{mATHwT_QEps1JEo|62OyWcz^^@$h?PCZd;f`}7Qyb5n0P;a&7`*{9sd9`UZW z>YbEZ9XOirFLj<9;-qoCN#~?dByRtCV-H@j`{4scXI|S@h>ZLv7luG>VYc*wrfC^OGgj$Ce`*MN|arrwWYTR-m1p5Od| zLEoM=r76aGD_+`%jki>yJkT=ZLp1x<30BjJhOXp`&{>*~8x#kR^N@5DG4d8F`2@y? zsEnQ2mm)9B_Wt8Mvn`$J!9JRvLP)dT@s^^mw%sQS2H;va0r)++(izztdV2Qf=*$nW zuiPE1j2e;37CBGnEhbeNHikSPTTlBc7Wp0yj&UIhnrL)Nx5=!&?PsXSmb;X^;9a|^ z@95^J`7GO{XQku$^X2E$FDNvaoL@30KCz5{VPt%BY~hYw|Bc7DKT)`u7;OtN*N;B$ zv5Xsxvl)C%H$TvYoe1|?oPNW1@QrTK-HwPy&eZCk#AH7-Up>@ZNl{YZsd#~lOkvcR z`GY1!Y~fg{fSI0PzS36N_qB|1Cg$8s>ra&Cv}%P$@y0T$!EAMa3vl=32TjX%v&1|8Z`-x|c}Q>{?S-3m>fi z?wg)?IQ5wu{&hZ9NGoEAvgO9fGv`e@`@PMooqO%cmFB*M+F3j*{}_^WbItsP582!A zmX)XdynIbhB;rM0`#%>&_Q=k@Pj`!opfp9jX<+U)eq$8d?9$qz+=o^@jq?fm>Snm| zx|ZEvo;bf}=1t1sVm_g0Z#Ckyq?Pj~-CAFq><#84?m2jGe11G>{qdZS;bzO=Qc$jj zVHfDh{4}~heHwy=2CX&%i^J|XIt0@W0*#?(f5AZS<`46Z!~M5ums4w?X@rx_XGpUj zqZoO`ou(r7+2t*_I%7f7M3U0&~-Ql$hL9r`xs0oN;-k&U;FXbMy zl*;Gv6skB5P~6UU_q_bObbr2;TRBvtG0>Sk`t-Y_w&RJQ(ErV=SX?EI z41!F?Pos&^WYRZYV(bEwMN`{nQRSTSD!`7Z#N?j<= zNL6JzDJo@q-_x1f*EIKDZ_4@m?epXxf{`kF^k)yx3XA&h?x16uD1+e?#M>Ds*`KK$ z84RGFB-#wRuC^JWVP|MjVU+wa*Z%r}waf5Xdc3M{%S$?jsDVW}?c-P7xbn(Q7zj~V zDF`|vT;{urEN)Q|F*ED!W(^5Cfq#q&6dLp2yon{MI1?i%WY#?(XQ9DsmDai3a{aAS z-^av#g8BEmF-PSo+NDxNB&Ja+1-|2y@O7X-=x+q8;;H&ezbOl)m1NzAG0B_>cS}FD zbgz5!QzoS)e`oSCryg^=`Y7ih*W&kQ=#BgY!w%iBZjR>+-7f_WiC)NaSr9N4yealL z{mt$1w2zDZRm!Im82xWGT~*7kqK@|P%6RdX@mVTM;zDr+=tuHlQ98SMY1xupkX0fBa@G_^%*L%F9 z+{K)f+2`5(to#M_j9HDfC5LCMeBE*htPo52>L%KY4RWW%)wSJ5_D{XPd&|JnaUOmj zcPP)v=7_XM-fX+Y0r$E05i!qJ-{X)XS8X}7({f3l-s&=}k>D9zhZ?m8@znrj*jXd$KEz)JjOFesN>-UJ^=fn^a`kRT7q;zu}P9eK4P69KR zWahkD90Da@->UaX4yt;8Gm$5;wJC#B#CFtI+C3JWWj^P3epvRsER1& zEH%Tija-?#LPp55mKnNsQLKV`3)FRN4w9aYvVOU154igMVs47F*R3v$l$Lf@a422z ztoHVvSa|K)ph3&!rE{Xe*@@DPTIN2#z4a}@x8ixcUH9A~s3ZrfMUp%;5(Vao((tbN zbX5jZhW<>j&lGd;cW<1k?xqMZ1PtXKve?ki}%_6*WEVvGLt5U~?1Z99SB+$0z`UvOnf5Qs^({tUyAy+d(#pkPo`o4}L%X?ZDiq zDpy#+_Ic}S?trLDfl8W)h>%XmSHq~u2LEmPFS z+&a-o2c1a+E2s#C@bXK}$f7`dC1+iiXz}3M; z|JpEyjC0%-b#otQw+=p;JYxIG5I>>1x|pW>rOn@NR2M05rRfW&zhSlg(6IkYGwPXK zgR;jF7edu>Mh=&rPw-Ib1Z)$hUNOntLw;b`ze|39F^kYzO)&=$WShkhMl7#6^m-1i zVhJ(4eL>AuAJV*^;z~<4vQ;_oS?h3OW2zZTs?Y1~;0-AQE(_{hPE)pFSB;O3RccK#Zo*%-wzwXP0{X zqyS^~+LV0cDUs^Bc*k>zF9Y$=rHZ1Xh6XJf4O3#Vi31Tp;qbv8-z>&qs^ZC3?m zxeidJRg;{yJ~6nzU5Cgp#z;)REbh{o(wWa)g9D=tEJT{LU0GFk`pRCma9g!;=1P1G ze2V^lh2+t}&c3gCS^dLI$u z4b6@o)HOWfaKpJ}@@emSgCQ?_=J3T&Xwwr-ksIRuFZn6zMEHpr$c4Gesn)2SxUz-L zHO;u`lV2c_%93hq@``!VE2XYI&CK-($1fLhneJsYSz=ySG-+a#3!B_UnZGel>sFO#xh>pn(}Iq zcS#HCgShWh_*yg8W|{^6j#%XK0p1MmjYTCLea#Gtz)&vb`#z^PK2CBEUY%hts;fJl zzF@}Dfsy*m0P9_ox*xH0Km2><%n{zr6;eF0EA1Y#2X~ zd?`p=x5v%8m?TKh(}9TgU5AEgU}iQq+tV-VZ}l7dLWWZK56DS2pU>MT`Z7tI=?#C0 z?O1Q5xQwzTKdJipw8;gHv z86hnE-#(dId_#;3f?CEyuZdQuK2S_Jn)3VS+)NR3(SCMChrl$Xr!N%Xv`?>zqK`ksGTmFNN^D zYuNEa9}Jw@vE+QEYjRuxqR22$L6m{jP|750Uc9^PP*K+`cIZKJyRP={dNG z`jyVnyF8KTL#44DSEPqti8-xw;|A?P?osy^r@BMV{eqg2q{;Ky zhi5;>$^~LRby_tG(uxXvi5Dufsn7AYBp1}(e?K~>FJM40?m$7G+?x8`D)ynA!IEbW zyXNF#o{-WrvRn#(KF{!Rex+pn%khaI(+E@VJ%qVO=!E-=#aS(;k`jpW4)_(BA7~6l zY0?!oZq_#$R1Gc{2YEkXKGf=-F0|0%jjCzm@qNRC)Tc22^vSZA$fas(mgimXtj+Mj zO7ZofGOT21V2#N1n9MxBD6Aw1`_*#FkLn6`0j3!H581MeNl z!ev$WoSX8S2#@WFT89e-H!8_S4kj8RTf(w!Wh@1hU>p69Er~Pz50B+17%MTVCr`l} z4(D=RG&VD~ywZ|R+sTyojPpZ2?<1ZCqGFt~Yp-l)st($K^t+C}WQ&W+!G{NjuitYn zk=n0Va=1vxtqOj2U_#C_4*R8hv`Ys$<4!u89K%6wT~UDp*p@lSvfW8PJAgxMijFf zWj{0xs>DXhH`agNzDLG3Iw*>t`W3x#I+xG5=a%jRzH44}I2XMKybKPw{o>?FZ%A@m z#8ee2=SC=p&MZu?^(eYnj6C#%&>vRivA)0gsl<9+sCL39XDcX0%=GOBQB}4`O|9eQHkVtK~CLBdXd!>jAGJRzl!h6L;@?mMhb!ow)iWDtfMAa0*HOI3rh1&(E zL!GEwp$xK>8P`cz`OF21^`Q88-=>Zr#4gwLQE#y6}=>q*a^bP(~JPOqO)zO82}rx%@r*1+S`vm|NYh z_nlHs94MsFyqHSPLb(XJ+*q3oH4ZuPvW+^o_m(^JJL&f9MMk`c=SeCRy=h0K=b^Au z$IGqkl*h2)`TXkJw$6e5&8&542iPUsOIWf+OqkMHlCFww8jW_m`W9$$y0Jt}PNC=V zgVAYKM9YZI+fgDzJ>lM!Yi?~>o)S-(ygp4Y#pG?pyjrF{Yd=N*Es!^qCJEE5dJ5%D zWa?Da-UWN+7+P-EPCPRgH2i^7|ATGJ9@5;yvqb!j;gHi*aR;{C#y06h_M2JtCaD)k z0h>Tw)|rCd11R?Rj&bUR*W5-44?h-0Z9Bvc4`3hXJU;og3Mq@O0ltHc%AcF8cmS4$7)|49UZm{nQ15QzZpLg zJ#87X>V;277#BR&=B5&r_k@QzitxVoWYf=o_f`p0c_1@8+e=h+j@2Krgbg`^U}38F z7?_#h%$>J0eDGoO>+Q|2TjbR#isgGqV-F7uDfBlpPYdVX0;ibYg7`yG`>B-t<4~f! ziQDx=?_NUSB#Ha3;t=McFlP?#0n9_6QRGzf#5?iKFZ|_~j!Joq>pyL3pk{!tv6$qW zwU`V_>uJ1!`lp%tE6p%Y$EGKLZ`wpYw|EvCPGoydfBmUXktXDX=hG^`%u3n{^JlYc z6=p&@&v_Cq;X58^21r2sX0r;_OK4doq^W7U9mfV_a<0U|6%kywL9^q$_@z z?wuR^)M1Q!>@K7BaFLMczX_tWlwuFCQQTXB1YP5(lJeGwG4d$c*!E;D5pPviyibjI zN)w^7DXVzW@wGyxEtKJV<}8nSzSUs^^>F-nPb=G!t6B%+y}9sAYQtNx%_eXBU374z zPWr4X!AUiWH(sd?VG^hOI7w$6Rn=R3dLmV>cdL7j9mz9gS&A7Koh+=lIVH(bxH{bR zGLY%Gbm+~?MRgMLu2J1U*|sounZ2{x(iLHRs8vX#ofcm2Z1zsi ztB8Ty_-f(3c+Ge_%N)s?sZm;!W>>Cm>kqmE?PJ$-%Lde^=Bw1TE@{=qy{&5e-cuZ~ zy|%#^l-@LRLcr{--yWLY!+%4d|KfWBT2qRan2qe-CL{-X0PoZ1B_cMmZN}v0hhRI_ zu}|`DbSG&MEt!vl70ba5;TTg#dJksdR>N8R$5P9)tsY%3pValWOx{pR<+0}8()vOy z_}<7Hvx<#$CeeV5u^<)P*@arJ9!fcGGRqn|=G6MxIUS~9Yf{@pq5 zW=5xKW!FQ|;7xS2=71}c#t8jTfMX@1 zwJ3tucx>NF^2~tjY1%m+qI2^2R{?qvi?q5PW0XmYlFe5n)t*UyjUS0Fld0q7$Ubhs ze3_!!R@_4O?Ec<7Z`wOYuo3%W&#lfbKF<;}hm(uOW?UY~k!lQjNoHVN63o96QggJ2 zj)i!=CS6GfqGw@3c6SYVqwK_%Dfar{)Y;W@TX)M98n0Itk2ZVFx#-36=(Z((Z|0}) zfPATOj4Nr^T5)nc9y6N6!8Cf4ys;uDwCKJ%pZKZneloTdS-KvxQ^#sN&N%qXkJo-W zBXMtYWnb?64ANfli(k;O{E0^ms7qXXC}?dXeg+FJia@S^JJQR3&88MQvn7+aw891}mhd zpNRO>87!F5BN#QSR;3zs?RiTSgKA>K%h#1^#pvR$IP;lQCV50{ z4{w|fV$J1Sz23|uTS@jcdXch+f;^vd`aD&C*cjA0)`IjihGRiEj`Ex+R{ca*oCaX6 zU;!d6#&*(K950Af`0111vS?PC=o*58Bj9=;|*_%F6?<>HkN%di2TsvWo;YM zII&9aQ^&kgT2eXc+x~HgX*KKoR(hGi350@;(17YOIdvWuIg-_!HHll!!eBErLO&AZ*R zUt_wMV$j8ed+#GD#vDFdNji$TRlG{NeR9V4(ir;*Rdx2G;)voSUoRX^?Kx$iFzZzs z8f~O)ce2A}(8ti~@k7UW{s8`bQb(WEb2AT#hYo3!B)^K98t{{0bYXn=O>@8`h|kRV zZlRx=MptuxoFc0}Ph~=-8@=PLs6krF4)zNnerK7To+{&&$9b=Zx9AOAAwAX>(ct8` zROY6ZMP^E+H^oC79&!S6`Hh0lhx$#cSL&}?w zG-4`0t0C)fY4V3G3FR>D8|G~t2#jADo8GlGdG&9P5@=f+Z{60z*JtTM^$;K z#yom$_$rw{-?fdNen~9lZUTE>zX3rC!P|FTBaihV&0r@B+WmjFc^(}UvalPWov zB5m=_{^Ran?YhQ_^2CR6J^f`Fn(jfVpGbAvJ{0w}$yL~K#cErec;kQFBQK^(RP*5J zqYE2Rm~Z7DBtD)o9@#^ZdpO|;%->kvc&WocfJgknCsac7kWBEDSj7!iuk9d#PG`Sx zM;avd4(S04@25}0ngo_D^Z86DXS|*?x;2P5`O2ulnG0TzVh`Z1A&)AT-I^LLPq%r( zqGN8T;amQdN6~A9g2yE$K*u>o>_zXV3st&PXYw@L+q@Zu`^i5Xp}%o#Zm9V2dhRp# z^Pve~3i>l^=|AK7as2#Qr-M#P zZ=3qnC%FuLU(cF^e&IR5MwZ0dIIVs~T>!x#L}o=Ludd}cq(Ne8e^}WLt`Kte*X6F^>06ZL23f`F zB=1%z;`i0@N6t1O5gR6L;bHTmq#!S&sAt5dXyfQ*gLkm906lGY54?=Mg*z}hVX78C z%SbdD0t2ha+IU#GJGkOq+@T22XaecVI-ZtzLP9v$1TcWrEu4U~>OU#}s4o2@lUe`= zOfmr>ECL`RBn(9$kVr6h8f-w#0`KlX$hH=Q!$EHo`2AHn3xR>GX#z>$KuR^ZWC0o4 za?U`8G?*g(Pip!O2m+IJu(h=T61>5TZeu74h>ks6Ev$fKbHGdTwQ+ZW!d#pIZt)j@ z65ih31}NJC3E;s04qian17HOWxVJVAcJ_EE=>JGTF&H2#9Oxz#%qr4>;sBthv$c)8 zhn0)F4HVqQ0mNttg;@bf-@x<%DMDdDn!G&}<^X_z_vfK7M<~n*3Uh|QfOcI00RSs4 z(5MH%1V9Z6^8%8=0r`6a8D&7`H%#Bb8gCEu_(ux;KRNaU73_Sx`bWdRDTyG{y)^>J z+tI?#0}#sJ=tj!J3QUv7V1O=x^uY%KB(P@P1d{N9;{gs{PB7D67$x+-UT`Q9gC$%$Sm5(c`L{ddxAI=^ zf5HA~Z-+7pjfSED&4U*fi-QOQ-XH>44N)l6j|+=~qOn2*EV0CwgV^Kjt60{{arLYq5fkO$xbcp%)dzt=$v6FNws1HK1& z1?vH2kVo*obD;nwgC8HkJ1789T^Nu@5Dwx3v_LutI9TatLezXic6KG)oApmuDXoE16Fq*&>7HE6tBJ=`O&>k0o`(GpCHw1tv1SGt35k>>h zeZoipN5gLyDCwOM@(T~Z{na1>L11d9)4$(=sqjO4Kb5ibZl~Ry`wkqq@1_L^2JGZ- zI6uleN(S!#B+~xwmr4QWZNN1jSVN#dm`fEuLRy&K#`#307{m?Aox+T zf+7j4Cj?lKkv}chk5vl^nn?iH78r{^OW@j%{Mi)293X$TW(Nhl4FDSgtmcI8bwE7_ zDC%cx=RpHQpyddt{71Ix-nmI4&rp{SpI z@&Ja?kGG&D68hQBPZLIH3j$aSp`UI2Fkrw0BG3bE8j!9?D0@NyGy9|TkEtR!zd&-R%TJ1aD3Y2^Xq{e${H_6Tfw#~eu^emBU!PIVHLp3;qjH=>I?jj6OgSE{Fx1{>}0L&d5J}|1hIFr_g}o zVT9vo1SrssriFtue&<2g!4dE~emp^7KYbfeU%+=v;2+@87ywq!6)eLHf0iJ>9WevN zNbqHBfD>YWf1(Yrpg(;ikZ(93NZ{Ia5bakg_us@6J&41P0T=%1RUOw$WW;StFADvZ zo_Z;z@jnQZ;7Sn)$l-h#N@5oj@5pYgT5lU-bADh(cixxyK9@_^jGq8SkDvI4T+Acc z=B5+PI?F^ghSFSWksSr+X)+c}Wx3u?BG0jZGxyiuJ3`b2QZ>S`nvwC*Y z#PDElawPIx!r`2rgAMR=T1QwTFTUfCEJ%WKrd_vq%aNW$+E|eq{(g#rSU2vzmX7M{ z#wwi(D#xToBbRa0{rpWVj0XA^nF1wC+d=Q;3rV;3Ss03n?V-5emx=#xiUZdA|2&|{ z8G``#L=J2QkepaP6tY-7CNw-lrNnrzfoq~~GMrwTU$lg0-bX6#M(u@_b26*gts0{% zjCRBQLt*c@*6k*88Sa{Wl23)j-#=8_m|EK{TXpi8^eobdzv@u!Q(fOTC$Z!~0|zQ< zoP6)6n`52!4RVId7H_e$dP6@pQY}_-?6Fe}uGBkoNfEHUKP`zO;AH(YS&D$W^TUq( zv!4HEeE$Ldwq^1UaId)vo(hJkI9P*g8({B2yFxf1>j{JogvI~osT%a}Wga-Zbg+bap#-sl2!5zN9`EWQ0)sjIdL`)MZpRA&7g1|ZE8xuR@3ma5ZK0MH zR_B3Meo_E13FIAIoMpii$2_tkNH`LOKp=#HI24CK^1+eZa5y*bZ??hP-NqIIdU+7Q z3H|v41cGS5mfJ#q!N6D){DC_EgdxyEK&1Bv421>l@xSp^+zrG2 z*)BkQ7akmr0+sx)c7bk#9>Kq1a3maX1pW;}qCo%XUoZslb^cF!2qc)m@CP0O19}Mm zt_yt5`x6h~1xUF3Hy#}LNW2>lvrB$(Bo2`JA9VrOdN&LUIywL11uldDmf=5PLc3)O zfOhc*41hmn02~PiBE&!Fp>d$M`R{gx;W%KX{Q*OR!TcXE3^*hIfMIvZ8IFMi--7?Z z!yvG`_`{%gjZ-)f3GC_{77mu&bpL^e+clmMa3T1vaRtYK-u1ugBH*}PG63fK?z#vh=;{8uE&@#T{sSfi2jag! zVM4pc20{q8d+wu!fNkAa7?cxG1j0FE%$=TKg2q6g% bz$)v3w{XW3mLddN7zzAQ22ME*dC30&^Z8#x literal 0 HcmV?d00001 diff --git a/plot/validation/nvprof.simple_matrixmul.num_blocks.pdf b/plot/validation/nvprof.simple_matrixmul.num_blocks.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5e8b13ab6a6afe12b277951d21858b9d0275d9e9 GIT binary patch literal 17060 zcmds9c_39!^iQ(%?2^jXvy>m~693Ajb)K&qQo|>tx zg&7_y{OeZM123+Fdhe%WqEDWAWVM8BZSRWo-U@e{1!J&E?Yd@`oR4SILTJTGF0;`qAz2m9%)~ z#?#ym)o{JDY|Zc9Y5FhmzS1_OufM;#ls%W}#!UsCgaoj-F()iHOz{LirpVOHix1-` z(bXKyR*L#DDKM3BmbmU{Z#|z5I?s#p>1S%8cZi!0*$mAYgN__6@+Q+PwYXd3P0F${ zchPNG<4Gg?iLP^jK9RhM<~}6`@hhhEDX*XQFHVX&#^ZcnqTGI1trVjvQat_u~CIwMxQhb^i&8#c0gCoWn3+K zEH20T${-`4SLZ!zVF|}jE9vocOwgiHGAS{F2JvXV(Wg+7#p6}`W6Y(3!9=~nm3Zm% zVUL?9A`_a3Xf6z1?M}xAEpiIJAxPk+{oLGnqUV&eprUd~iZ|ax(+VqvKi6IQInBdY zz4C>>$|2=O`B3qGQqIW_emo{TmMW9i&MTsLLZhozT=+?9%Jd|`V6D`&b5_f%a?g;A zmw=H{buFiUn7!i}@t|O$IM(JfA;Di~WWdwb)NhAXks8>Hvyo^sGq<>v93pU9F_5L& zBKrh3-o`GpnvR|H>DZw_NAzyc57a%A^Czib>@km!7jJmx?ryEKdUUayTFRc(1NGbcqC7-1#1=cl-_EVI9J{>!^~OgU&+aAEBMK zEFyIYSzTkbIRAB8Eo0TXm1*10wCfDFd!Cx1^E+?rwS~QMPzkW!fD9((g*et9OIiu2 z&2>&0_~|bde;Q5SWR}=iqXlUuo-9&Ft1-yt5_ja~U<9NZvY2&bnzMT%^9=C4N8%|I zUOl^X4}A~Y%O-AhuVpGMtGC$sM$ygqjc@Jk0)THlIkncrW$Ko;(o31S$y%?GGTE$^B@kX}_ zPq2p0qxTICE~j-|A0ECv={6b4AaC-pMtx$a>B(WdOym50mv^d5KKC2r;0HFE;(?5 zlbes96J2ygZn+Mua~-Gy$@4M{4CTtSEQ!CQnob<^prbI4>uB*ywUsnwgOw)Z!gJ-b7Y_^}z*5whYpmYsI{nxYLyh>zy*>`*4^v z(R--Qu*Nl2!}G*X;`B?*G6hmRsncgf=`|kDN=+!Gk7tj%UaOw4ZK^q&upxaWAT`!7 z;^7!am{eV!tZ0I4X?pIEdnL2jT6qJy)6#(-Wi$6 zDiK;m9enT1`KO~Qsj}dtGsb2d?q6&IoaD4Vgz}DGhi1$bUMu%L5}+lP#8r3=%#^2v zD)k&~Z@Kd}(BJpS3jInBm&1zG=up(X$5jUx-C=W!Rg|hMfp1Gco1sS2S?)6oGWH)2 z=B#4~P74p~m0wQD%O6X({LEk^OGLAp_z#{zW`qiS%l67Azt z`SOLA64q!JMC$B5h04@6Br!#;VCJSRg%tWylBFxPTif2`ToOxMxmf4%KraK9bF`j;`+Z#gwI+_@K<^N`mn0L8Arp&}qI{GKg;E}whERPr{D8@lL7 z1@?~GA?Jinz1&v5noDw1sXlyPi!-T)qo6Z;wCPW%wu41bnE%#QET)7;8bK=Ut=2$l zEb;S)TH1&3) zKt=41WKn9MaVg`{aoPJB%&a8m8#1FUbhOa^iIBn@Dt@!9*b3XlAlk}lP@&XIk*f>he0J$qm)Eb70TgNbUOxB@39+)QI*f1z@u-^exD0p19%6m0fUB4^3_+FX(`9oa@Lj zDW)P|V$%6rH6#cFcZ~`ZM)<#V6N^`JAViQ$ued@Cp;wkm%(J*$$ zipW;5N+u7BPoYrso8mm}=r|QlBc@~|LcfA(ID19!}IrY@yqmH#N=@h2? z&k{dz>M*sc4s-T%Eo{8Ntlo~(Z_^Ix;CM;j@mk=p=;dt3d4Y?9_r;#3zI`y7@_C`R z^z7+42A|@F8!ER;sUuz8(_Xz}c#+H;Kc8E~@M`65$KAxMu}RX7Yue`IXLyclq${-E zLzzk?U&S|dCaEz$UNB#i6Ary9xw6=Bbuje-6)b_4o7sx?^6SiYFOSi-LPui~CeIgh zvvQYJ(`MC{7j0g!@O8+>u|Q1a${J{|R?D6eSJiYLIym{hrdZF-ZVq0HJDhEAaYV{B zd$!f&kjsbnVNoxZ-s6xWH!L|bQnE;Tire)|_@pKz!{2gP2wENF%D#ol$SC}}I5vLF z!^l6yJ2PZ;HCudro#d0(4dA_KSf=@8`uxP`p11Czn|fCxRJEFmc|<4Spj8ZQ53()t6xO4EFei%8M!=qZ~noQ;d16t{pRd)pw-q$e;0A z4t%QSrz+90ck5iKaP2XfxRj+pa=DVoQs;|-s!WH#`pdrN0IYMZZ@oxHz^Td-j|)Fu zs(&}g2@dD12{k<>f^!Y&Q*cSjIGgkA$t&@}td0`gU4MaYa@4~{oq@Ul+Oy>QUC$m1+WI7w*CfJmrm*;D-Tg{7`6L+YsaVpozFm#yBb-hn?!JE~XeZam6yqufQ|(2f2DpNbIrG+XYX z$`7As*NLA`9I+hJ$B!#7Eu?6FZ}G7j)7QRl^U3u3OF%b)qy-rqM|%}k(9+kUZINBpL&5hk`ouJq9GWFaN4j}^uAY7BC_ba8;}$pSf5AJu zj_BHffGEWQ33<9YyIcI~h)?#NDJcHa0(MzE_teFyAKm`ZIM8?GSuwwKbWlcQzqbAn zn|lsT6Fprk)%v{b>4R6lU@o3)2wxTNea%l^DZ)=kPbS1wNVQCD&y|6;ZJ2S^CA&-{ zd0n!;!9D7Em!zuZG!xeo9KUQpAl>Uo()jH5NRs#nM?_}{BBCR1UZO>=rY~Lg=yZ;` z!M%PrXE-OM&Pc#_{$263jJmmE5LFp%b)TlZ7o&8SQlFv?LjkX4d10BzRKmP!KW>8x zUtz@3NHg!#7LELLh&PRUbwN=}S3Ql~FNo`Gt>@|0&lBAJH)hy#Dl1Q?&Kq&G2}^#X zhjlGW)`l(ChHj+K9N}I2N`fbJqTR#x;2zX2+oKWtK@#ywT}+as0V}=`LF#;u7jvhN z$PWKp8J|Xp#pMKHf(RI=4TSig`mI*X+IT-)|HkdE7iDoq?T2;Z)HziHo1|88wM?4| z<8ceObFGTRFsY+Ck=0GS*K=`6m3gU)m5l-~7ruM4xwD59kdLo*eD$?sPWK zCGr<^vmv0JYE!%Dm!84R+VfrYoo;=1z(5lJAz6vWOW6lRUngiXzU9xe9O(*I)r1B$~r?BMdytp+lC&KxhW%B*mX|y1* zVX4vEZ-(#WOivON96Kyc5}=q??uMyv{CKK7`OM}4l`A*j?9t2M*5a;SMhFT0Hzrew zt&WmLP)ob&G|-}}{KSMJDK@@krHdGg_OdJ3P})%7L%;UWB~qTg8ztlZ)Soe^oGgVE z1r>{Z`pG)`Ds4c|4M%O?&%N5k3q#6~52bBQmqbm>GyQN!?kzRFM&o&xv7?7S>e;tq z$@ub@Wx0Gsks)q^C_S@*qzRVnSQnXroc3Alz~jVLZOw1DMV>igv3352CP*RpKANkc^{N}rI$@_o_MTy+DSAx$*wTBx9te1oNuOG+Ku?M}h zlrmqtEJ*ZtnOJ3G@G|q0$K(NVYV1{^fXl%+`H+8G8p$7SRX2{=1VDS>cqjsv&j(;WAi!EBAA>hdZ8&$MOG5N zk={$zozj+({t^7tV@EHfa(Rxrt!qE#yXjtubJTgvOK*caC{C8}mME)9Oj&{A!w|*5 zx%uhkP6bDkp*nBKQA%YV^V+p9dFCtVigC}(b^j!>i|;%E+&jpUAitYIP+QYERt^rI#s5 zyG10!oIw!R@#ZYy0a_S+=pOp)0rYYEm>4mgk7=&JNr`(hd5>Rx68JD?M*q!5(Up>` ztqq?k*f=uMo1??JqPeBJ?vf1E9u?qJ&F7=+O)M(m5%)hrWEG*R{3T;a-@6H227 ze$&mH_qjF)mxkwGlMgj(5*<#v4jYjn`5N3Yr%5LF$a>zrG%l*R!>QIj>ExlirHCttTvXLS|3FilCdW-Kt^Mcmk^lys);6ki6xPMs(;vpPG14ZY2; zx@qa)*W1WanR19-qBV~>L&ShFl{w*t=-T<=wy~dnCa3H3RAl8lpFSR*Rz@@pX}udJ z(AN>_`g+s3<+|IM=Zx-OrWd2K*Q3ThQ5#xM9{uUZ8$^>J+^BpCM-ttSmQj0Ha}Q*R*pK6oRabHWw%kV6=tK@0nRO+o=0*UUK<(z~f*yS+ z_Sm*j>iIX^=i}-=-;LO`i5~33KFxf}_M;RjgDD?A5;QDlJ5j=4yjZX5qS;tMaaRVI18Zuc`YQMap=AHpV2zCL4D=B1^j#d$^xF_jEA@2ir@{cSiMn)WK6Ti_l@qC->w-q7u)630hAHXW z>AjqR;-v%e&U)h+uEKiXF4C-QkcbS)J?olBx8a`C(V-Ara#{EM&4}0bVc)sroZ^c zvB*i@n%Kgxlq!M^ue>@t)cl^5`GHBO!-FVC%M=*as6RVs!+_+gce=~7)dQ+9hCOzd zQF}lnBxXO0C=JEvL#*VFzC!$Oa+FGXs70N3&0F1cW6Be6KC4hmjp(5XQ(BWzV6%H8 zpKb}I-$AMI>rU35}uV|er-G@V+%IJ(i`t&gJ?F5h04-AA$N zu~w#;66KS_v<8h%Tpi{gwfkB}Ze$|zU%HW^cFmqDC z$k2O_p56n$Au#*+o`6x8q$Ok}eY6J2gdW0scDsv+4Q(1RI(s8nkGCC=sEK?=l0!@C zX=BDryebrR(e9`#lTfq%EdF!8sbRBg`|IbG-Axns6q9+(xz{zm6AHdR?;*T|4R;_? zgN!gE#SDYlID%+(c8dMt6G88@?nJz}>PgTgQB$Fe4 zetrbIAUlzM?NAIWipHJp!Udnwa2E0@0{vpFLym><0Cg(@#Wl^M{8dXWX;VW ztrfm|`yanO{-hdpjcbn%+5@y9(EE6u#bT5w30Z+J7`UJq2|wyCln#c>Yt#P5xrKud zGA5hNRms0kBu%I4c;x+@oGZ|F<{-IX%T7P89VluFGnf#idHc!c6XfC7R5uIdA?@Ra zmpb*QT9oWG zgHChBS?!57*huO3%9@{*t;JM7{Y>X>E^)dk$48${=uRNH!xb@qekJPMjS|fJj|uJG zIp%`{@efbU;V-|4L%Ne_r7j3_`>}K%t1gX8tBhE^9%+$)F1aF~oO&|s%d;zjX`O-* zvnr*^5jS5pMUd* zsz3agEgr8zZ-@Jj^mY^J6bX%WEl|JerucMAu|kY4CfI?`pd`UHVsmiyv_DH0-_or{ zMwt@QACU_bo#bS83&l{_iegnyw#TReUKr*p(saQ}>N>|O zLM8sAWh;yeV%JMg&=;ij1c>Ii+J0c2qmx+AN<6o`OxU^VoxMkQ?16L<`&h1`kX7pN zD^Ka9@1qZ}h!E=T!FdlLixAq!cC>7)BBeAqfHNA8eW#@~5m76+7x8UlMC8g3;qMl) z3C6zAfTjVHf!ya!R8c`K?*<)nu6&K0x3=kV@8Hoi*}Y@uo?ks^nNn1G zURh8?f9-<>B}qedi12)Y&X1=X*DrgmA1EFbOWmWjdjL(8&^~r3!15N(^goaZX?JlaF-dCBzeeX)tDik_P*sc1o7h%eRo(l)1IdmV_}R1?Ke5tIo)h}+b$ZTg;(*36L}+O7obw~=WyY&XdhLw3_ns1B zOrf(SB*VhRxl1IQY%^ZhM%Yg(tFj*xN8}#)ahWo?^OSYmtb2Y?FpPkpne zb#}4*zWldI>^zfgXX*$C4y)xQjzvuNc}p`mGEDtc?=$e{Gjgc8>#d^J-q;(Xz@p1j z5?A7U)UG(9pO&JH{c?b}A(MU2S^U{i-dmwfI(@+;$6Lax?d=u|oK>!qUZm2Q~mt`fr{wSPZvaZqVqM8~>)moO2hp2isDF=eB#H3Mx!>MvY>804FpzCBXw+x+r zEcSchSdRd;RvzCSLa+40MZsxr^;?KrT3cL1n`K9z3D;5Fholfvd7BNGhf0yvF~=3c zH18R=v>}AO3s`k-F3YL@d=f`1L)9SlV9x5WzM61Ja22Z5RXOU(8~w3F{@YG1M~^Ow zCDp{SS9VoMcQ|sy`1F2KEB$JHCyG})~te?nC4} zj#0i^4pCySy1ra4)t)?;t=`(=K|k0__VLKkd&fTv@i2QZ5Idf#Ragr9? zrTNZcc_dNT_|iCA^f#OILBFmi7Y0_2sbrnr_e)N?|jeBK7GA8Vw@fli^U5{-TL@E3C~zdcdGYmP^U358N2-L#?_&HsA{VDFyR~{pF4;A--RY>H%@IB7q5CU8R`$DfcjC@RkDkN-Hu~s~4z7i;ZM?t3|FYs9k zGq2LD_tZ$TBjb;%Y$N{1cN@y8UJQRJtd0H7te;Fl{7K*ZBIn^73j&9Q%d)w&jp>-n zdaEeKJ5;GqR0J8Z1sSbm{4_o6q%`FOir+a8BrIq?v(ZZGYEixMJd3{jhoM2xcb-G6 zqzNqb)2hL$0tk9EsTq}=s)qM~8qq~-%ClB*`2fQgl=2Ym5wY+5D@-jH-DaQ1;>)m& z{U?||)8x0FggaWR4;;O$kWrjF)FS4jMJDI3Yf_NqZx*AK__wtXe|oiF(db(t<}fJ} zSBqacc{ynXo%4JO7Iy9ycpEbla0TJwikG%FaRFWon6k-s9*My~U|QcXA#07A`OD2HEA^;|6At(ZYM1oP_ zpa2yUyo=3N9JwGI4t6*2=XcH!0s~c32g2KdU~zEr2V&1<9e~(xF!=qSp!*#X1SVr+ zX=wq3u7k1a7oaF$)$i(LVg`iK11QPM!o?8^b94Z-#a{+;cxx96Aa4nTwgdm#xC41t zKoyv)g}a3V6lP&#WsQe|7)TN#3}6Elps7$WVoD2&1Bh-8<`yolW{xfvP;iS1Fr_IJ zW(I_`1FsOs6$%3m0a!y}HUJ5DehCV*gTm|~Fn}{4+X*lNFxvuxT>(V^Zcvyz5b6%} zfd>#R2gJ3*bZyM>)jgwD1=8=fI4L ziN{uZK=WY!Jb$nQXu)#?LMZhAn&42RFm@~LlmhQ}@_$?*f8_Ui{+srXydBOc3-cFtW;|c+v{Vosi0QqbsfWBoDP#Bnm zfmZ`eTlc{63}K*|;5}Fw0|ZxsbdVVqP++SpXa=y}cDdj6wt3)yZGtTt$RFUhqXck; zpoM@20ldJ*{8k7TgVn(V7T8J{_?B8* ztbW%56#;Ly3^$3%yK~*41J}QK0fqq^`A3;w`JGM%uKxrH z|B3P?fzw3bybsR5z*2WBrwY85zj9j8t=LiEsRDfZGOZ|Kojj=4;UR$^q{Dm!yB5gL>>ZCrQMgF8cS0Hkh;|PzVNq4+O{< zCW-uCivfP;pJJ&0wHRFRUj@A(Q{6dn2ONdlI(oN7 z1>~rk*f`*KZnSOe03`D34g%XoJD>r9FSfv60ORNZKF|rw!}PavkUx&ofo9x7Hx|HY zxc#5r1N`YWY6ANQ4j2+he;vB}9kjlmRYoU*@+ko7dv0hsT_YuIS$uWZd$Gq|F@^uJ z-x;nX0pCo{x*#Ge|5!VAbLA?_0Ee=m2JWc`4NzNpw99rx!n@c-_00w#g~I*iI0g#dR+4z2kT zo&0n-V4-XTJvc+9$ndC|YrK0R^r#fSXdcg;r)12%ipyVZrI)bl<>!MLtOk1rLZ-M@ ztj4qGYmC0gCBtHC4_DMDS9Hjfvb~U+MV{v`JzUYF?e&%oOXlBqsHoiDt2WgbYk#1h zGgKybot?!4`njHJp_F5fy<>2u-kISP08hN_wG;rzwe3a}0Pyq6kNh*A|KW=NA%c!% zo7F$Wy=GAG1T;*^#vGj60PhdFlC8tQZooofYx3VdKLh~p-yC7mz=2+Hy~zVia!z)R zcsm3H?n2SEt|)>aa~iV1$4VK)YyYIIsrZO+$cdJn;8NT@2{0chQ8v_3>^Rh;{x& z18#sd>uwqf?C*bdJh6YjDzi9{@h>8A9LkR)P zj(^jDE&!&(-82k{i2hXWxI8^^vIV^BuZdYAlI1qHSn}z^D z|85!v#L(dH_gI4i>w>>%Sa2I-S2-;5?{S620PEx3<**>?27mv^z=44)yJ=`(L9v^L z{ky$59EfMZ-*0|^cECo=Zko_vauI-;|0)OUvh3Cccmw~gi$MR?=fE@p;{Jc@f=1$Y zw*!F|{%cGkFyIH_U3D?S@IPpH7ZV#h3zx0`O+w4Y%L15lpfF8GM{u&-0`ZCtmX5%> dY3l}