From da3cfe25aee16f500e431a5bfe46a3445dd6c633 Mon Sep 17 00:00:00 2001 From: Jack Lo Date: Fri, 21 Feb 2025 11:14:28 -0700 Subject: [PATCH 1/5] Refactor host test code into Makefile and xrt_test_wrapper.h --- .../basic/vector_scalar_mul/CMakeLists.txt | 8 +- .../basic/vector_scalar_mul/Makefile | 12 +- .../basic/vector_scalar_mul/test.cpp | 163 +++++--------- .../vector_scalar_mul_alt.py | 26 ++- runtime_lib/test_lib/xrt_test_wrapper.h | 211 ++++++++++++++++++ 5 files changed, 296 insertions(+), 124 deletions(-) create mode 100644 runtime_lib/test_lib/xrt_test_wrapper.h diff --git a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt index 69edfb687b..decc051d16 100644 --- a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt +++ b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt @@ -30,7 +30,9 @@ else() set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() -set(VECTORSCALARMUL_SIZE 4096 CACHE STRING "vector size") +set(IN1_SIZE 8192 CACHE STRING "in1 buffer size") +set(IN2_SIZE 4 CACHE STRING "in2 buffer size") +set(OUT_SIZE 8192 CACHE STRING "out buffer size") set(TARGET_NAME test CACHE STRING "Target to be built") SET (ProjectName ${TARGET_NAME}) @@ -51,7 +53,9 @@ add_executable(${currentTarget} ) target_compile_definitions(${currentTarget} PUBLIC - VECTORSCALARMUL_SIZE=${VECTORSCALARMUL_SIZE} + IN1_SIZE=${IN1_SIZE} + IN2_SIZE=${IN2_SIZE} + OUT_SIZE=${OUT_SIZE} DISABLE_ABI_CHECK=1 ) diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index 98bf125c3c..b643c8d56a 100644 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -16,10 +16,14 @@ VPATH := ${srcdir}/../../../aie_kernels/aie2 device = npu targetname = vector_scalar_mul -data_size = 4096 +# in1_size = 4096 +in1_size = 8192 # in bytes +in2_size = 4 # in bytes, should always be 4 (1x int32) +out_size = 8192 # in bytes, should always be equal to in1_size trace_size = 8192 CHESS ?= true +data_size = in1_size aie_py_src=${targetname}.py use_alt?=0 @@ -47,11 +51,11 @@ endif build/aie_${data_size}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} - python3 $< ${device} ${data_size} 0 > $@ + python3 $< ${device} ${in1_size} ${in2_size} ${out_size} 0 > $@ build/aie_trace_${data_size}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} - python3 $< ${device} ${data_size} ${trace_size} > $@ + python3 $< ${device} ${in1_size} ${in2_size} ${out_size} ${trace_size} > $@ #build/insts_${data_size}.txt: build/final_${data_size}.xclbin build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o @@ -79,7 +83,7 @@ endif ${targetname}_${data_size}.exe: ${srcdir}/test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname}_${data_size} -DVECTORSCALARMUL_SIZE=${data_size} + cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname}_${data_size} -DIN1_SIZE=${in1_size} -DIN2_SIZE=${in2_size} -DOUT_SIZE=${out_size} cd _build && ${powershell} cmake --build . --config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}_${data_size}.exe $@ diff --git a/programming_examples/basic/vector_scalar_mul/test.cpp b/programming_examples/basic/vector_scalar_mul/test.cpp index d4acb04292..f15a3c0ec8 100644 --- a/programming_examples/basic/vector_scalar_mul/test.cpp +++ b/programming_examples/basic/vector_scalar_mul/test.cpp @@ -9,114 +9,50 @@ //===----------------------------------------------------------------------===// #include -#include -#include -#include +#include "xrt_test_wrapper.h" -#include "test_utils.h" -#include "xrt/xrt_bo.h" +//***************************************************************************** +// Modify this section to customize buffer datatypes, initialization functions, +// and verify function. The other place to reconfigure your design is the +// Makefile. +//***************************************************************************** #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED // ------------------------------------------------------ // Configure this to match your buffer data type // ------------------------------------------------------ -// using DATATYPE = std::uint8_t; -// using DATATYPE = std::uint32_t; -using DATATYPE = std::uint16_t; +using DATATYPE_IN1 = std::uint16_t; +using DATATYPE_IN2 = std::int32_t; +using DATATYPE_OUT = std::uint16_t; #endif -const int scaleFactor = 3; - -namespace po = boost::program_options; - -int main(int argc, const char *argv[]) { - - // Program arguments parsing - po::options_description desc("Allowed options"); - po::variables_map vm; - test_utils::add_default_options(desc); - - test_utils::parse_options(argc, argv, desc, vm); - int verbosity = vm["verbosity"].as(); - int trace_size = vm["trace_sz"].as(); - - constexpr bool VERIFY = true; - constexpr int IN_VOLUME = VECTORSCALARMUL_SIZE; - constexpr int OUT_VOLUME = IN_VOLUME; - - int IN_SIZE = IN_VOLUME * sizeof(DATATYPE); - int OUT_SIZE = OUT_VOLUME * sizeof(DATATYPE) + trace_size; - - // Load instruction sequence - std::vector instr_v = - test_utils::load_instr_sequence(vm["instr"].as()); - - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // Start the XRT context and load the kernel - xrt::device device; - xrt::kernel kernel; - - test_utils::init_xrt_load_kernel(device, kernel, verbosity, - vm["xclbin"].as(), - vm["kernel"].as()); - - // set up the buffer objects - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_inA = - xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_outC = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - // Copy instruction stream to xrt buffer object - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - // Initialize buffer bo_inA - DATATYPE *bufInA = bo_inA.map(); - for (int i = 0; i < IN_VOLUME; i++) - bufInA[i] = i + 1; - - // Initialize buffer bo_inFactor - int32_t *bufInFactor = bo_inFactor.map(); - *bufInFactor = (DATATYPE)scaleFactor; - - // Zero out buffer bo_outC - DATATYPE *bufOut = bo_outC.map(); - memset(bufOut, 0, OUT_SIZE); - - // sync host to device memories - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE); +// Initialize Input buffer 1 +void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int SIZE) +{ + for(int i=0; i= 1) - std::cout << "Running Kernel.\n"; - unsigned int opcode = 3; - auto run = - kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); - run.wait(); +// Initialize Input buffer 2 +void initialize_bufIn2(DATATYPE_IN2 *bufIn2, int SIZE) +{ + bufIn2[0] = 3; // scaleFactor +} - // Sync device to host memories - bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE); +// Initialize Output buffer +void initialize_bufOut(DATATYPE_OUT *bufOut, int SIZE) +{ + memset(bufOut, 0, SIZE); +} - // Compare out to golden +// Functional correctness verifyer +int verify_vector_scalar_mul(DATATYPE_IN1 *bufIn1, DATATYPE_IN2 *bufIn2, DATATYPE_OUT *bufOut, int SIZE, int verbosity) +{ int errors = 0; - if (verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - for (uint32_t i = 0; i < IN_VOLUME; i++) { - int32_t ref = bufInA[i] * scaleFactor; + + for (int i = 0; i < SIZE; i++) { + int32_t ref = bufIn1[i] * bufIn2[0]; int32_t test = bufOut[i]; if (test != ref) { if (verbosity >= 1) @@ -127,21 +63,28 @@ int main(int argc, const char *argv[]) { std::cout << "Correct output " << test << " == " << ref << std::endl; } } + return errors; +} - if (trace_size > 0) { - test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size, - vm["trace_file"].as()); - } +//***************************************************************************** +// Should not need to modify below section +//***************************************************************************** - // Print Pass/Fail result of our test - if (!errors) { - std::cout << std::endl << "PASS!" << std::endl << std::endl; - return 0; - } else { - std::cout << std::endl - << errors << " mismatches." << std::endl - << std::endl; - std::cout << std::endl << "fail." << std::endl << std::endl; - return 1; - } +int main(int argc, const char *argv[]) { + + // constexpr int IN1_VOLUME = VECTORSCALARMUL_SIZE; // 1024; define via Makefile + // constexpr int IN2_VOLUME = 1; + // constexpr int OUT_VOLUME = IN1_VOLUME; // define via Makefile + + constexpr int IN1_VOLUME = IN1_SIZE / sizeof(DATATYPE_IN1); + constexpr int IN2_VOLUME = IN2_SIZE / sizeof(DATATYPE_IN2); + constexpr int OUT_VOLUME = OUT_SIZE / sizeof(DATATYPE_OUT); + + args myargs = parse_args(argc, argv); + + int res = xrt_test_run + (IN1_VOLUME, IN2_VOLUME, OUT_VOLUME, myargs); + return res; } diff --git a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py index b74142621e..19593950f6 100644 --- a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py +++ b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py @@ -16,11 +16,16 @@ import aie.utils.trace as trace_utils -def my_vector_scalar(dev, vector_size, trace_size): - N = vector_size - N_in_bytes = N * 2 +def my_vector_scalar(dev, in1_size, in2_size, out_size, trace_size): + # N = vector_size + # N_in_bytes = N * 2 # TODO How to force this to match data type + N_in_bytes = in1_size # TODO How to force this to match data type + N = N_in_bytes // 2 N_div_n = 4 # chop input vector into 4 sub-vectors n = N // N_div_n + + assert in2_size == 4, "2nd input buffer must be size 4 (4 bytes = 1 integer)." + assert out_size == in1_size, "Output buffer size must match input buffer size." buffer_depth = 2 @@ -97,6 +102,9 @@ def sequence(A, F, C): try: + if (len(sys.argv) < 5): + raise ValueError("[ERROR] Need at least 4 arguments (dev, in1_size, in2_size, out_size)") + device_name = str(sys.argv[1]) if device_name == "npu": dev = AIEDevice.npu1_1col @@ -104,13 +112,15 @@ def sequence(A, F, C): dev = AIEDevice.npu2 else: raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) - vector_size = int(sys.argv[2]) - if vector_size % 64 != 0 or vector_size < 512: - print("Vector size must be a multiple of 64 and greater than or equal to 512") + in1_size = int(sys.argv[2]) + if in1_size % 128 != 0 or in1_size < 1024: + print("In1 buffer size must be a multiple of 128 (so len is multiple of 64) and greater than or equal to 1024 (so len >= 512)") raise ValueError - trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) + in2_size = int(sys.argv[3]) + out_size = int(sys.argv[4]) + trace_size = 0 if (len(sys.argv) != 6) else int(sys.argv[5]) except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - my_vector_scalar(dev, vector_size, trace_size) + my_vector_scalar(dev, in1_size, in2_size, out_size, trace_size) print(ctx.module) diff --git a/runtime_lib/test_lib/xrt_test_wrapper.h b/runtime_lib/test_lib/xrt_test_wrapper.h new file mode 100644 index 0000000000..0668d7a99b --- /dev/null +++ b/runtime_lib/test_lib/xrt_test_wrapper.h @@ -0,0 +1,211 @@ + +#include +#include +#include + +#include "test_utils.h" +#include "xrt/xrt_bo.h" + +namespace po = boost::program_options; + + +struct args { + int verbosity; + int do_verify; + int n_iterations; + int n_warmup_iterations; + int trace_size; + std::string instr; + std::string xclbin; + std::string kernel; + std::string trace_file; +}; + +struct args parse_args(int argc, const char *argv[]) { + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + struct args myargs; + + test_utils::parse_options(argc, argv, desc, vm); + myargs.verbosity = vm["verbosity"].as(); + myargs.do_verify = vm["verify"].as(); + myargs.n_iterations = vm["iters"].as(); + myargs.n_warmup_iterations = vm["warmup"].as(); + myargs.trace_size = vm["trace_sz"].as(); + myargs.instr = vm["instr"].as(); + myargs.xclbin = vm["xclbin"].as(); + myargs.kernel = vm["kernel"].as(); + myargs.trace_file = vm["trace_file"].as(); + + return myargs; +} + + + +template +int xrt_test_run( + int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME, + struct args myargs) +{ + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(myargs.instr); + if (myargs.verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT context and load the kernel + xrt::device device; + xrt::kernel kernel; + + test_utils::init_xrt_load_kernel(device, kernel, myargs.verbosity, + myargs.xclbin, + myargs.kernel); + + // set up the buffer objects + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in1 = xrt::bo(device, + IN1_VOLUME*sizeof(T1), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_in2 = xrt::bo(device, + IN2_VOLUME*sizeof(T2), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, + OUT_VOLUME*sizeof(T3)+myargs.trace_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + if (myargs.verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Copy instruction stream to xrt buffer object + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize buffer objects + T1 *bufIn1 = bo_in1.map(); + T2 *bufIn2 = bo_in2.map(); + T3 *bufOut = bo_out.map(); + + init_bufIn1(bufIn1, IN1_VOLUME); + init_bufIn2(bufIn2, IN2_VOLUME); + init_bufOut(bufOut, OUT_VOLUME); // <<< what size do I pass it? + + // sync host to device memories + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (myargs.verbosity >= 1) + std::cout << "Running Kernel.\n"; + + // Run kernel + if (myargs.verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + if (iter < myargs.n_warmup_iterations) + /* Warmup iterations do not count towards average runtime. */ + continue; + + // Copy output results and verify they are correct + if (myargs.do_verify) { + if (myargs.verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + + errors += verify_results(bufIn1, bufIn2, bufOut, IN1_VOLUME, myargs.verbosity); + + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (myargs.verbosity >= 1) + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } else { + if (myargs.verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; + } + + // Write trace values if trace_size > 0 and first iteration + if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) { + test_utils::write_out_trace(((char *)bufOut) + OUT_VOLUME*sizeof(T3), myargs.trace_size, + myargs.trace_file); + } + + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / myargs.n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." << std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +} + From 0dcb86f63e6f9b3da7cb0dbd8aab3bd73ae63f64 Mon Sep 17 00:00:00 2001 From: Jack Lo Date: Fri, 21 Feb 2025 11:28:27 -0700 Subject: [PATCH 2/5] format fix and fixed non alt version --- .../basic/vector_scalar_mul/Makefile | 2 +- .../basic/vector_scalar_mul/test.cpp | 31 +- .../vector_scalar_mul/vector_scalar_mul.py | 24 +- .../vector_scalar_mul_alt.py | 14 +- runtime_lib/test_lib/xrt_test_wrapper.h | 366 +++++++++--------- 5 files changed, 220 insertions(+), 217 deletions(-) diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index b643c8d56a..b6e0037c2a 100644 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -21,7 +21,7 @@ in1_size = 8192 # in bytes in2_size = 4 # in bytes, should always be 4 (1x int32) out_size = 8192 # in bytes, should always be equal to in1_size trace_size = 8192 -CHESS ?= true +CHESS ?= false data_size = in1_size aie_py_src=${targetname}.py diff --git a/programming_examples/basic/vector_scalar_mul/test.cpp b/programming_examples/basic/vector_scalar_mul/test.cpp index f15a3c0ec8..f7a7c70bba 100644 --- a/programming_examples/basic/vector_scalar_mul/test.cpp +++ b/programming_examples/basic/vector_scalar_mul/test.cpp @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#include #include "xrt_test_wrapper.h" +#include //***************************************************************************** // Modify this section to customize buffer datatypes, initialization functions, @@ -28,27 +28,24 @@ using DATATYPE_OUT = std::uint16_t; #endif // Initialize Input buffer 1 -void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int SIZE) -{ - for(int i=0; i - (IN1_VOLUME, IN2_VOLUME, OUT_VOLUME, myargs); + initialize_bufIn1, initialize_bufIn2, + initialize_bufOut, verify_vector_scalar_mul>( + IN1_VOLUME, IN2_VOLUME, OUT_VOLUME, myargs); return res; } diff --git a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul.py b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul.py index 8e00df6380..7c9180850e 100644 --- a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul.py +++ b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul.py @@ -14,10 +14,11 @@ from aie.iron.controlflow import range_ -def my_vector_scalar(dev, vector_size, trace_size): +def my_vector_scalar(dev, in1_size, in2_size, out_size, trace_size): if trace_size != 0: raise NotImplementedError("Trace not supported yet.") - N = vector_size + N_in_bytes = in1_size + N = N_in_bytes // 2 N_div_n = 4 # chop input vector into 4 sub-vectors n = N // N_div_n vectorized = True @@ -71,6 +72,11 @@ def core_body(of_in, of_factor, of_out, scale_fn): try: + if len(sys.argv) < 5: + raise ValueError( + "[ERROR] Need at least 4 arguments (dev, in1_size, in2_size, out_size)" + ) + device_name = str(sys.argv[1]) if device_name == "npu": dev = NPU1Col1() @@ -78,12 +84,16 @@ def core_body(of_in, of_factor, of_out, scale_fn): dev = NPU2() else: raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) - vector_size = int(sys.argv[2]) - if vector_size % 64 != 0 or vector_size < 512: - print("Vector size must be a multiple of 64 and greater than or equal to 512") + in1_size = int(sys.argv[2]) + if in1_size % 128 != 0 or in1_size < 1024: + print( + "In1 buffer size must be a multiple of 128 (so len is multiple of 64) and greater than or equal to 1024 (so len >= 512)" + ) raise ValueError - trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) + in2_size = int(sys.argv[3]) + out_size = int(sys.argv[4]) + trace_size = 0 if (len(sys.argv) != 6) else int(sys.argv[5]) except ValueError: print("Argument has inappropriate value") -module = my_vector_scalar(dev, vector_size, trace_size) +module = my_vector_scalar(dev, in1_size, in2_size, out_size, trace_size) print(module) diff --git a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py index 19593950f6..ac72a3285f 100644 --- a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py +++ b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_alt.py @@ -19,11 +19,11 @@ def my_vector_scalar(dev, in1_size, in2_size, out_size, trace_size): # N = vector_size # N_in_bytes = N * 2 # TODO How to force this to match data type - N_in_bytes = in1_size # TODO How to force this to match data type + N_in_bytes = in1_size # TODO How to force this to match data type N = N_in_bytes // 2 N_div_n = 4 # chop input vector into 4 sub-vectors n = N // N_div_n - + assert in2_size == 4, "2nd input buffer must be size 4 (4 bytes = 1 integer)." assert out_size == in1_size, "Output buffer size must match input buffer size." @@ -102,8 +102,10 @@ def sequence(A, F, C): try: - if (len(sys.argv) < 5): - raise ValueError("[ERROR] Need at least 4 arguments (dev, in1_size, in2_size, out_size)") + if len(sys.argv) < 5: + raise ValueError( + "[ERROR] Need at least 4 arguments (dev, in1_size, in2_size, out_size)" + ) device_name = str(sys.argv[1]) if device_name == "npu": @@ -114,7 +116,9 @@ def sequence(A, F, C): raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) in1_size = int(sys.argv[2]) if in1_size % 128 != 0 or in1_size < 1024: - print("In1 buffer size must be a multiple of 128 (so len is multiple of 64) and greater than or equal to 1024 (so len >= 512)") + print( + "In1 buffer size must be a multiple of 128 (so len is multiple of 64) and greater than or equal to 1024 (so len >= 512)" + ) raise ValueError in2_size = int(sys.argv[3]) out_size = int(sys.argv[4]) diff --git a/runtime_lib/test_lib/xrt_test_wrapper.h b/runtime_lib/test_lib/xrt_test_wrapper.h index 0668d7a99b..bc045c330e 100644 --- a/runtime_lib/test_lib/xrt_test_wrapper.h +++ b/runtime_lib/test_lib/xrt_test_wrapper.h @@ -8,204 +8,196 @@ namespace po = boost::program_options; - struct args { - int verbosity; - int do_verify; - int n_iterations; - int n_warmup_iterations; - int trace_size; - std::string instr; - std::string xclbin; - std::string kernel; - std::string trace_file; + int verbosity; + int do_verify; + int n_iterations; + int n_warmup_iterations; + int trace_size; + std::string instr; + std::string xclbin; + std::string kernel; + std::string trace_file; }; struct args parse_args(int argc, const char *argv[]) { - // ------------------------------------------------------ - // Parse program arguments - // ------------------------------------------------------ - po::options_description desc("Allowed options"); - po::variables_map vm; - test_utils::add_default_options(desc); - - struct args myargs; - - test_utils::parse_options(argc, argv, desc, vm); - myargs.verbosity = vm["verbosity"].as(); - myargs.do_verify = vm["verify"].as(); - myargs.n_iterations = vm["iters"].as(); - myargs.n_warmup_iterations = vm["warmup"].as(); - myargs.trace_size = vm["trace_sz"].as(); - myargs.instr = vm["instr"].as(); - myargs.xclbin = vm["xclbin"].as(); - myargs.kernel = vm["kernel"].as(); - myargs.trace_file = vm["trace_file"].as(); - - return myargs; + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + struct args myargs; + + test_utils::parse_options(argc, argv, desc, vm); + myargs.verbosity = vm["verbosity"].as(); + myargs.do_verify = vm["verify"].as(); + myargs.n_iterations = vm["iters"].as(); + myargs.n_warmup_iterations = vm["warmup"].as(); + myargs.trace_size = vm["trace_sz"].as(); + myargs.instr = vm["instr"].as(); + myargs.xclbin = vm["xclbin"].as(); + myargs.kernel = vm["kernel"].as(); + myargs.trace_file = vm["trace_file"].as(); + + return myargs; } +template +int xrt_test_run(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME, + struct args myargs) { + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = test_utils::load_instr_sequence(myargs.instr); + if (myargs.verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT context and load the kernel + xrt::device device; + xrt::kernel kernel; + + test_utils::init_xrt_load_kernel(device, kernel, myargs.verbosity, + myargs.xclbin, myargs.kernel); + + // set up the buffer objects + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in1 = xrt::bo(device, IN1_VOLUME * sizeof(T1), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_in2 = xrt::bo(device, IN2_VOLUME * sizeof(T2), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_VOLUME * sizeof(T3) + myargs.trace_size, + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + if (myargs.verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Copy instruction stream to xrt buffer object + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize buffer objects + T1 *bufIn1 = bo_in1.map(); + T2 *bufIn2 = bo_in2.map(); + T3 *bufOut = bo_out.map(); + + init_bufIn1(bufIn1, IN1_VOLUME); + init_bufIn2(bufIn2, IN2_VOLUME); + init_bufOut(bufOut, OUT_VOLUME); // <<< what size do I pass it? + + // sync host to device memories + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { - -template -int xrt_test_run( - int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME, - struct args myargs) -{ - - srand(time(NULL)); - - // Load instruction sequence - std::vector instr_v = - test_utils::load_instr_sequence(myargs.instr); if (myargs.verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // Start the XRT context and load the kernel - xrt::device device; - xrt::kernel kernel; - - test_utils::init_xrt_load_kernel(device, kernel, myargs.verbosity, - myargs.xclbin, - myargs.kernel); - - // set up the buffer objects - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_in1 = xrt::bo(device, - IN1_VOLUME*sizeof(T1), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_in2 = xrt::bo(device, - IN2_VOLUME*sizeof(T2), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_out = xrt::bo(device, - OUT_VOLUME*sizeof(T3)+myargs.trace_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + std::cout << "Running Kernel.\n"; + // Run kernel if (myargs.verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - // Copy instruction stream to xrt buffer object - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - // Initialize buffer objects - T1 *bufIn1 = bo_in1.map(); - T2 *bufIn2 = bo_in2.map(); - T3 *bufOut = bo_out.map(); - - init_bufIn1(bufIn1, IN1_VOLUME); - init_bufIn2(bufIn2, IN2_VOLUME); - init_bufOut(bufOut, OUT_VOLUME); // <<< what size do I pass it? - - // sync host to device memories - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // ------------------------------------------------------ - // Initialize run configs - // ------------------------------------------------------ - unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations; - float npu_time_total = 0; - float npu_time_min = 9999999; - float npu_time_max = 0; - - int errors = 0; - - // ------------------------------------------------------ - // Main run loop - // ------------------------------------------------------ - for (unsigned iter = 0; iter < num_iter; iter++) { - - if (myargs.verbosity >= 1) - std::cout << "Running Kernel.\n"; - - // Run kernel - if (myargs.verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto start = std::chrono::high_resolution_clock::now(); - unsigned int opcode = 3; - auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out); - run.wait(); - auto stop = std::chrono::high_resolution_clock::now(); - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - if (iter < myargs.n_warmup_iterations) - /* Warmup iterations do not count towards average runtime. */ - continue; - - // Copy output results and verify they are correct - if (myargs.do_verify) { - if (myargs.verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - auto vstart = std::chrono::system_clock::now(); - - errors += verify_results(bufIn1, bufIn2, bufOut, IN1_VOLUME, myargs.verbosity); - - auto vstop = std::chrono::system_clock::now(); - float vtime = - std::chrono::duration_cast(vstop - vstart) - .count(); - if (myargs.verbosity >= 1) - std::cout << "Verify time: " << vtime << "secs." << std::endl; - } else { - if (myargs.verbosity >= 1) - std::cout << "WARNING: results not verified." << std::endl; - } - - // Write trace values if trace_size > 0 and first iteration - if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) { - test_utils::write_out_trace(((char *)bufOut) + OUT_VOLUME*sizeof(T3), myargs.trace_size, - myargs.trace_file); - } - - // Accumulate run times - float npu_time = - std::chrono::duration_cast(stop - start) - .count(); - - npu_time_total += npu_time; - npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; - npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + if (iter < myargs.n_warmup_iterations) + /* Warmup iterations do not count towards average runtime. */ + continue; + + // Copy output results and verify they are correct + if (myargs.do_verify) { + if (myargs.verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + + errors += + verify_results(bufIn1, bufIn2, bufOut, IN1_VOLUME, myargs.verbosity); + + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (myargs.verbosity >= 1) + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } else { + if (myargs.verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; } - // ------------------------------------------------------ - // Print verification and timing results - // ------------------------------------------------------ - - // TODO - Mac count to guide gflops - float macs = 0; - - std::cout << std::endl - << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us." - << std::endl; - if (macs > 0) - std::cout << "Avg NPU gflops: " - << macs / (1000 * npu_time_total / myargs.n_iterations) << std::endl; - - std::cout << std::endl - << "Min NPU time: " << npu_time_min << "us." << std::endl; - if (macs > 0) - std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) - << std::endl; - - std::cout << std::endl - << "Max NPU time: " << npu_time_max << "us." << std::endl; - if (macs > 0) - std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) - << std::endl; - - if (!errors) { - std::cout << "\nPASS!\n\n"; - return 0; - } else { - std::cout << "\nError count: " << errors << "\n\n"; - std::cout << "\nFailed.\n\n"; - return 1; + // Write trace values if trace_size > 0 and first iteration + if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) { + test_utils::write_out_trace(((char *)bufOut) + OUT_VOLUME * sizeof(T3), + myargs.trace_size, myargs.trace_file); } -} + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / myargs.n_iterations) + << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." << std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +} From 5607ad69304eab28d6253d2bc1710b649d42da17 Mon Sep 17 00:00:00 2001 From: Jack Lo Date: Fri, 21 Feb 2025 15:17:29 -0700 Subject: [PATCH 3/5] Fixed python wrappers --- .../basic/vector_scalar_mul/Makefile | 5 +- .../basic/vector_scalar_mul/test.py | 89 +++++++------------ python/utils/test.py | 21 +++++ python/utils/xrt.py | 57 +++++++++++- 4 files changed, 112 insertions(+), 60 deletions(-) diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index a251a8fbba..5c74c37b33 100644 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -103,7 +103,7 @@ run: ${targetname}_${data_size}.exe build/final_${data_size}.xclbin build/insts_ ${powershell} ./$< -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt - ${powershell} python3 ${srcdir}/test.py -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -s ${data_size} + ${powershell} python3 ${srcdir}/test.py -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -i1s ${in1_size} -i2s ${in2_size} -os ${out_size} trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt ${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} @@ -111,7 +111,8 @@ trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin buil ${srcdir}/../../utils/get_trace_summary.py --filename trace_vs.json trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt - ${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size} + #${powershell} python3 ${srcdir}/test_orig.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -i2s ${in2_size} -os ${out_size} --size 4096 + ${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -i2s ${in2_size} -os ${out_size} ${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json ${srcdir}/../../utils/get_trace_summary.py --filename trace_vs.json diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index c91b53307f..c7f324db1c 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -7,79 +7,54 @@ # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates import numpy as np import sys -import time -from aie.utils.xrt import setup_aie, write_out_trace, execute +import aie.utils.xrt as xrt_utils import aie.utils.test as test_utils def main(opts): - print("Running...\n") + in1_size = int(opts.in1_size) # in bytes + in2_size = int(opts.in2_size) # in bytes + out_size = int(opts.out_size) # in bytes - data_size = int(opts.size) - vector_dtype = np.int16 - scalar_dtype = np.int32 - scale_factor = 3 - size_out = data_size * 2 - print("output buffer size: " + str(size_out)) + print(str(in1_size) + ", " + str(in2_size) + ", " + str(out_size)) + + #----- Edit your data types ----------------------------------------------- - enable_trace = opts.trace_size > 0 + in1_dtype = np.int16 + in2_dtype = np.int32 + out_dtype = in1_dtype + + #-------------------------------------------------------------------------- - app = setup_aie( - opts.xclbin, - opts.instr, - data_size, - vector_dtype, - 1, - scalar_dtype, - data_size, - vector_dtype, - enable_trace=enable_trace, - trace_size=opts.trace_size, - ) - input_vector = np.arange(1, data_size + 1, dtype=vector_dtype) - input_factor = np.array([3], dtype=scalar_dtype) - # aie_output = execute_on_aie(app, input_vector, input_factor) + in1_volume = in1_size // np.dtype(in1_dtype).itemsize + in2_volume = in2_size // np.dtype(in2_dtype).itemsize + out_volume = out_size // np.dtype(out_dtype).itemsize - start = time.time_ns() - full_output = execute(app, input_vector, input_factor) - stop = time.time_ns() - npu_time = stop - start - print("npu_time: ", npu_time) + #----- Edit your data init, and reference data here ----------------------- - # aie_output = full_output[:size_out].view(np.int8) - # aie_output = full_output[:size_out].view(np.uint8) - aie_output = full_output[:size_out].view(np.int16) - if enable_trace: - trace_buffer = full_output[size_out:].view(np.uint32) + # check buffer sizes + assert (in2_size == 4) + assert (out_size == in1_size) - ref = np.arange(1, data_size + 1, dtype=vector_dtype) * scale_factor + scale_factor = 3 + + # Initialize data + in1_data = np.arange(1, in1_volume + 1, dtype=in1_dtype) + in2_data = np.array([scale_factor], dtype=in2_dtype) + out_data = np.zeros([out_volume], dtype=out_dtype) - if enable_trace: - # trace_buffer = full_output[3920:] - print("trace_buffer shape: ", trace_buffer.shape) - print("trace_buffer dtype: ", trace_buffer.dtype) - # write_out_trace(trace_buffer, str(opts.trace_file)) - write_out_trace(trace_buffer, "trace.txt") + # Define reference data + ref = np.arange(1, in1_volume + 1, dtype=out_dtype) * scale_factor - # Copy output results and verify they are correct - errors = 0 - if opts.verify: - if opts.verbosity >= 1: - print("Verifying results ...") - e = np.equal(ref, aie_output) - errors = np.size(e) - np.count_nonzero(e) + #-------------------------------------------------------------------------- - if not errors: - print("\nPASS!\n") - sys.exit(0) - else: - print("\nError count: ", errors) - print("\nFailed.\n") - sys.exit(1) + print("Running...\n") + res = xrt_utils.xrt_test_run(in1_dtype, in2_dtype, out_dtype, in1_data, in2_data, out_data, + in1_volume, in2_volume, out_volume, ref, opts) + sys.exit(res) if __name__ == "__main__": p = test_utils.create_default_argparser() - p.add_argument("-s", "--size", required=True, dest="size", help="Vector size") opts = p.parse_args(sys.argv[1:]) main(opts) diff --git a/python/utils/test.py b/python/utils/test.py index d99584cd7d..d21c320b54 100644 --- a/python/utils/test.py +++ b/python/utils/test.py @@ -68,6 +68,27 @@ def create_default_argparser(): default="trace.txt", help="where to store trace output", ) + p.add_argument( + "-i1s", + "--in1_size", + dest="in1_size", + default=0, + help="Input 1 buffer size in bytes", + ) + p.add_argument( + "-i2s", + "--in2_size", + dest="in2_size", + default=0, + help="Input 2 buffer size in bytes", + ) + p.add_argument( + "-os", + "--out_size", + dest="out_size", + default=0, + help="Output buffer size in bytes", + ) return p diff --git a/python/utils/xrt.py b/python/utils/xrt.py index bd4fc1e3a8..2e5ec42856 100644 --- a/python/utils/xrt.py +++ b/python/utils/xrt.py @@ -6,10 +6,10 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. import copy +import time import numpy as np import pyxrt as xrt - class AIE_Application: def __init__(self, xclbin_path, insts_path, kernel_name="PP_FD_PRE"): @@ -167,3 +167,58 @@ def execute(app, input_one=None, input_two=None): app.buffers[4].write(input_two) app.run() return app.buffers[5].read() + +def xrt_test_run(in1_dtype, in2_dtype, out_dtype, + in1_data, in2_data, out_data, + in1_volume, in2_volume, out_volume, + ref, opts): + enable_trace = opts.trace_size > 0 + + app = setup_aie( + opts.xclbin, + opts.instr, + in1_volume, + in1_dtype, + in2_volume, + in2_dtype, + out_volume, + out_dtype, + enable_trace=enable_trace, + trace_size=opts.trace_size, + ) + + out_size = out_volume * out_data.itemsize + # print("out_size: " + str(out_size)) + + start = time.time_ns() + full_output = execute(app, in1_data, in2_data) + stop = time.time_ns() + npu_time = stop - start + print("npu_time: ", npu_time) + + aie_output = full_output[:out_size].view(out_dtype) + if enable_trace: + trace_buffer = full_output[out_size:].view(np.uint32) + + + if enable_trace: + if opts.verbosity >= 1: + print("trace_buffer shape: ", trace_buffer.shape) + print("trace_buffer dtype: ", trace_buffer.dtype) + write_out_trace(trace_buffer, str(opts.trace_file)) + + # Copy output results and verify they are correct + errors = 0 + if opts.verify: + if opts.verbosity >= 1: + print("Verifying results ...") + e = np.equal(ref, aie_output) + errors = np.size(e) - np.count_nonzero(e) + + if not errors: + print("\nPASS!\n") + return(0) + else: + print("\nError count: ", errors) + print("\nFailed.\n") + return(1) From e750f3a0e18ac011d20885b7c37c57135e3fb765 Mon Sep 17 00:00:00 2001 From: Jack Lo Date: Fri, 21 Feb 2025 15:19:01 -0700 Subject: [PATCH 4/5] format fix --- .../basic/vector_scalar_mul/test.py | 35 ++++++++++++------- python/utils/xrt.py | 24 +++++++++---- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index c7f324db1c..413ecefc4a 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -12,29 +12,29 @@ def main(opts): - in1_size = int(opts.in1_size) # in bytes - in2_size = int(opts.in2_size) # in bytes - out_size = int(opts.out_size) # in bytes + in1_size = int(opts.in1_size) # in bytes + in2_size = int(opts.in2_size) # in bytes + out_size = int(opts.out_size) # in bytes print(str(in1_size) + ", " + str(in2_size) + ", " + str(out_size)) - #----- Edit your data types ----------------------------------------------- + # ----- Edit your data types ----------------------------------------------- in1_dtype = np.int16 in2_dtype = np.int32 out_dtype = in1_dtype - - #-------------------------------------------------------------------------- + + # -------------------------------------------------------------------------- in1_volume = in1_size // np.dtype(in1_dtype).itemsize in2_volume = in2_size // np.dtype(in2_dtype).itemsize out_volume = out_size // np.dtype(out_dtype).itemsize - #----- Edit your data init, and reference data here ----------------------- + # ----- Edit your data init, and reference data here ----------------------- # check buffer sizes - assert (in2_size == 4) - assert (out_size == in1_size) + assert in2_size == 4 + assert out_size == in1_size scale_factor = 3 @@ -46,11 +46,22 @@ def main(opts): # Define reference data ref = np.arange(1, in1_volume + 1, dtype=out_dtype) * scale_factor - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- print("Running...\n") - res = xrt_utils.xrt_test_run(in1_dtype, in2_dtype, out_dtype, in1_data, in2_data, out_data, - in1_volume, in2_volume, out_volume, ref, opts) + res = xrt_utils.xrt_test_run( + in1_dtype, + in2_dtype, + out_dtype, + in1_data, + in2_data, + out_data, + in1_volume, + in2_volume, + out_volume, + ref, + opts, + ) sys.exit(res) diff --git a/python/utils/xrt.py b/python/utils/xrt.py index 2e5ec42856..770202b6a2 100644 --- a/python/utils/xrt.py +++ b/python/utils/xrt.py @@ -10,6 +10,7 @@ import numpy as np import pyxrt as xrt + class AIE_Application: def __init__(self, xclbin_path, insts_path, kernel_name="PP_FD_PRE"): @@ -168,10 +169,20 @@ def execute(app, input_one=None, input_two=None): app.run() return app.buffers[5].read() -def xrt_test_run(in1_dtype, in2_dtype, out_dtype, - in1_data, in2_data, out_data, - in1_volume, in2_volume, out_volume, - ref, opts): + +def xrt_test_run( + in1_dtype, + in2_dtype, + out_dtype, + in1_data, + in2_data, + out_data, + in1_volume, + in2_volume, + out_volume, + ref, + opts, +): enable_trace = opts.trace_size > 0 app = setup_aie( @@ -200,7 +211,6 @@ def xrt_test_run(in1_dtype, in2_dtype, out_dtype, if enable_trace: trace_buffer = full_output[out_size:].view(np.uint32) - if enable_trace: if opts.verbosity >= 1: print("trace_buffer shape: ", trace_buffer.shape) @@ -217,8 +227,8 @@ def xrt_test_run(in1_dtype, in2_dtype, out_dtype, if not errors: print("\nPASS!\n") - return(0) + return 0 else: print("\nError count: ", errors) print("\nFailed.\n") - return(1) + return 1 From a8ccb449654c2c3833e52ac6af5db9d3438dbc93 Mon Sep 17 00:00:00 2001 From: Jack Lo Date: Fri, 21 Feb 2025 15:46:41 -0700 Subject: [PATCH 5/5] Fixed typos --- programming_examples/basic/vector_scalar_mul/Makefile | 8 ++++---- programming_examples/basic/vector_scalar_mul/test.py | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index 5c74c37b33..79b935324c 100644 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -107,14 +107,14 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt ${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json - ${srcdir}/../../utils/get_trace_summary.py --filename trace_vs.json + ${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vector_scalar_mul.json + ${srcdir}/../../utils/get_trace_summary.py --filename trace_vector_scalar_mul.json trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt #${powershell} python3 ${srcdir}/test_orig.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -i2s ${in2_size} -os ${out_size} --size 4096 ${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -i2s ${in2_size} -os ${out_size} - ${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json - ${srcdir}/../../utils/get_trace_summary.py --filename trace_vs.json + ${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vector_scalar_mul.json + ${srcdir}/../../utils/get_trace_summary.py --filename trace_vector_scalar_mul.json clean_trace: diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index 413ecefc4a..be9a0b1c47 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -16,9 +16,9 @@ def main(opts): in2_size = int(opts.in2_size) # in bytes out_size = int(opts.out_size) # in bytes - print(str(in1_size) + ", " + str(in2_size) + ", " + str(out_size)) - + # -------------------------------------------------------------------------- # ----- Edit your data types ----------------------------------------------- + # -------------------------------------------------------------------------- in1_dtype = np.int16 in2_dtype = np.int32 @@ -30,7 +30,9 @@ def main(opts): in2_volume = in2_size // np.dtype(in2_dtype).itemsize out_volume = out_size // np.dtype(out_dtype).itemsize - # ----- Edit your data init, and reference data here ----------------------- + # -------------------------------------------------------------------------- + # ----- Edit your data init and reference data here ------------------------ + # -------------------------------------------------------------------------- # check buffer sizes assert in2_size == 4