Trying to integrate OpenMP target offload for AMD and NVIDIA
Showing 3 changed files with 367 additions and 0 deletions.
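
None of the three files below checks whether an offload device is actually visible at run time, and OpenMP target regions silently fall back to the host when no device is found. Not part of the commit, but as a minimal host-side sanity check (using only the standard omp_get_num_devices() and omp_get_default_device() calls) one might add something like:

#include <stdio.h>
#include <omp.h>

int main(void) {
    /* How many offload devices does the runtime see? Zero means every
     * target region in the files below will run on the host instead. */
    int ndev = omp_get_num_devices();
    if (ndev == 0) {
        printf("No offload devices found; target regions run on the host.\n");
    } else {
        printf("%d offload device(s) available, default device: %d\n",
               ndev, omp_get_default_device());
    }
    return 0;
}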
@@ -0,0 +1,68 @@
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

void daxpy( double * __restrict__ a, double * __restrict__ b,
            double scalar, int num_elements ) {

    // The map clauses make the kernel usable on its own; when the arrays are
    // already present on the device (mapped by the enter data directives in
    // main), the present table means no extra transfers happen here.
    #pragma omp target teams distribute parallel for simd map(tofrom:a[0:num_elements]) map(to:b[0:num_elements])
    for (size_t j=0; j<num_elements; j++) {
        a[j] = a[j] + b[j] * scalar;
    }

    return;
}

int main( int argc, char** argv )
{
    double* a = NULL;
    double* b = NULL;
    double* c = NULL;
    double scalar = 8.0;
    int num_errors = 0;
    int num_elements = 1024;

    a = (double *) malloc( sizeof(double)*num_elements );
    b = (double *) malloc( sizeof(double)*num_elements );
    c = (double *) malloc( sizeof(double)*num_elements );

    // initialize on the host
    for (size_t j=0; j<num_elements; j++) {
        a[j] = 0.0;
        c[j] = 0.0;
        b[j] = j;
    }

    #pragma omp target enter data map(to:a[0:num_elements])
    #pragma omp target enter data map(to:b[0:num_elements])
    #pragma omp target enter data map(to:c[0:num_elements])

    daxpy( a, b, scalar, num_elements );

    daxpy( c, a, scalar, num_elements );

    #pragma omp target update from(c[0:num_elements])

    // error checking: the first call makes a[j] = j*scalar,
    // so the second leaves c[j] = j*scalar*scalar
    for (size_t j=0; j<num_elements; j++) {
        if( fabs(c[j] - (double)j*scalar*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

    #pragma omp target exit data map(release:c[0:num_elements])
    #pragma omp target exit data map(release:a[0:num_elements])
    #pragma omp target exit data map(release:b[0:num_elements])

    free(a);
    free(b);
    free(c);

    if(num_errors == 0) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}
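
A companion sketch, again not part of the commit: detecting silent host fallback from inside a target region itself. omp_is_initial_device() is standard OpenMP and returns nonzero when the calling region is executing on the host, so a check like this could be dropped into any of the examples' main() while the AMD and NVIDIA toolchains are being brought up:

#include <stdio.h>
#include <omp.h>

int main(void) {
    int on_host = 1;
    /* The flag is written on the device (if any) and mapped back. */
    #pragma omp target map(tofrom: on_host)
    {
        on_host = omp_is_initial_device();
    }
    printf("%s\n", on_host ? "Target region ran on the host (fallback)."
                           : "Target region ran on an offload device.");
    return 0;
}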
@@ -0,0 +1,162 @@
/******************************************************************************
 * OpenMP Example - Matrix Multiply - C Version
 * Demonstrates a matrix multiply using OpenMP.
 *
 * Modified from here:
 * https://computing.llnl.gov/tutorials/openMP/samples/C/omp_mm.c
 *
 * For PAPI_FP_INS, the exclusive count for the event:
 *   for (null) [OpenMP location: file:matmult.c ]
 * should be 2E+06 / Number of Threads
 ******************************************************************************/
#include <stdio.h>
#include <stdlib.h>

#ifndef MATRIX_SIZE
#define MATRIX_SIZE 4096
#endif

#define MAX_ITERATIONS 3
#define NRA MATRIX_SIZE /* number of rows in matrix A */
#define NCA MATRIX_SIZE /* number of columns in matrix A */
#define NCB MATRIX_SIZE /* number of columns in matrix B */

/* Row-major indexing; all matrices here are square (NRA == NCA == NCB),
 * so using NRA as the stride works for every matrix in this example. */
#define elem(_m,_i,_j) (_m[((_i)*NRA) + (_j)])

double* allocateMatrix(int rows, int cols) {
  double *matrix = (double*)malloc(sizeof(double) * rows * cols);
  /* Reserve a matching (uninitialized) allocation on the device. */
  #pragma omp target enter data map(alloc:matrix[0:rows*cols])
  return matrix;
}

void initialize(double *matrix, int rows, int cols) {
  int i,j;
  #pragma omp parallel private(i,j) shared(matrix)
  {
    /*** Initialize matrices ***/
    #pragma omp for nowait
    for (i=0; i<rows; i++) {
      for (j=0; j<cols; j++) {
        elem(matrix,i,j) = i+j;
      }
    }
  }
  /* The device copy was only allocated, never filled: push the host values
   * explicitly, since later map(to:) clauses see the data as already present
   * and will not copy. */
  #pragma omp target update to(matrix[0:rows*cols])
}

void freeMatrix(double* matrix, int rows, int cols) {
  #pragma omp target exit data map(delete:matrix[0:rows*cols])
  free(matrix);
}

/////////////////////////////////////////////////////////////////////
// compute multiplies a and b and returns the result in c using the
// ijk loop order. cols_a and rows_b are the same value.
/////////////////////////////////////////////////////////////////////
void compute(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
  int i,j,k;
  printf("%s\n", __func__);
  #pragma omp parallel private(i,j,k) shared(a,b,c)
  {
    /*** Do matrix multiply sharing iterations on outer loop ***/
    #pragma omp for nowait
    for (i=0; i<rows_a; i++) {
      for (j=0; j<cols_b; j++) {
        for (k=0; k<cols_a; k++) {
          elem(c,i,j) += elem(a,i,k) * elem(b,k,j);
        }
      }
    }
  } /*** End of parallel region ***/
}

///////////////////////////////////////////////////////////////////////
// compute_interchange multiplies a and b and returns the result in c
// using the ikj loop order. cols_a and rows_b are the same value.
///////////////////////////////////////////////////////////////////////
void compute_interchange(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
  int i,j,k;
  printf("%s\n", __func__);
  #pragma omp parallel private(i,j,k) shared(a,b,c)
  {
    /*** Do matrix multiply sharing iterations on outer loop ***/
    #pragma omp for nowait
    for (i=0; i<rows_a; i++) {
      for (k=0; k<cols_a; k++) {
        for (j=0; j<cols_b; j++) {
          elem(c,i,j) += elem(a,i,k) * elem(b,k,j);
        }
      }
    }
  } /*** End of parallel region ***/
}

///////////////////////////////////////////////////////////////////////
// compute_target multiplies a and b and returns the result in c using
// the ijk loop order, offloaded with OpenMP target directives.
// cols_a and rows_b are the same value.
///////////////////////////////////////////////////////////////////////
void compute_target(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
  printf("%s\n", __func__);
  int i, j, k;
  #pragma omp target data map(to: a[0:rows_a*cols_a], b[0:cols_a*cols_b]) map(tofrom: c[0:rows_a*cols_b])
  #pragma omp target
  #pragma omp teams distribute parallel for collapse(2) private(i,j,k)
  for (i=0; i<rows_a; i++) {
    for (j=0; j<cols_b; j++) {
      for (k=0; k<cols_a; k++) {
        elem(c,i,j) += elem(a,i,k) * elem(b,k,j);
      }
    }
  }
#if 0
  // This is a *very* simple offload statement, for debugging
  int z = 1;
  #pragma omp target map(tofrom: z)
  z = z + 1; // The copy of z on the device has a value of 2.
  printf("After the target region is executed, z = %d\n", z);
#endif
}

double do_work(void) {
  double *a,  /* matrix A to be multiplied */
         *b,  /* matrix B to be multiplied */
         *c;  /* result matrix C */
  a = allocateMatrix(NRA, NCA);
  b = allocateMatrix(NCA, NCB);
  c = allocateMatrix(NRA, NCB);

  initialize(a, NRA, NCA);
  initialize(b, NCA, NCB);
  initialize(c, NRA, NCB);

  // compute(a, b, c, NRA, NCA, NCB);
  // compute_interchange(a, b, c, NRA, NCA, NCB);
  compute_target(a, b, c, NRA, NCA, NCB);

  /* c was modified on the device; copy it back before reading it here.
   * The map(tofrom:) in compute_target does not trigger a transfer while
   * the array is still mapped by the enter data in allocateMatrix. */
  #pragma omp target update from(c[0:NRA*NCB])
  double result = elem(c,0,1);

  freeMatrix(a, NRA, NCA);
  freeMatrix(b, NCA, NCB);
  freeMatrix(c, NRA, NCB);

  return result;
}

int main (int argc, char *argv[])
{
  int i;
  for (i = 0 ; i < MAX_ITERATIONS ; i++) {
    printf("Iteration %d of %d:...\n", i+1, MAX_ITERATIONS);
    do_work();
  }

  printf ("Done.\n");

  return 0;
}
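
The matrix-multiply file already carries a host reference (compute_interchange) alongside the offloaded compute_target, but nothing compares their results. A hedged sketch of such a cross-check, reusing only the helpers defined above; the function name is an invention here, and it additionally assumes <math.h> is included for fabs and that a and b were allocated and initialized as in do_work:

// Hypothetical cross-check: run the offloaded ijk kernel and the host ikj
// kernel on identically initialized result matrices and count mismatches.
int verify_offload(double *a, double *b) {
    double *c_dev  = allocateMatrix(NRA, NCB);
    double *c_host = allocateMatrix(NRA, NCB);
    initialize(c_dev, NRA, NCB);
    initialize(c_host, NRA, NCB);

    compute_target(a, b, c_dev, NRA, NCA, NCB);        // device (or host fallback)
    compute_interchange(a, b, c_host, NRA, NCA, NCB);  // host reference

    // c_dev was modified on the device; bring it back before comparing.
    #pragma omp target update from(c_dev[0:NRA*NCB])

    int i, j, mismatches = 0;
    for (i = 0; i < NRA; i++)
        for (j = 0; j < NCB; j++)
            if (fabs(elem(c_dev,i,j) - elem(c_host,i,j)) >
                1e-6 * (1.0 + fabs(elem(c_host,i,j))))
                mismatches++;

    freeMatrix(c_dev, NRA, NCB);
    freeMatrix(c_host, NRA, NCB);
    printf("verify_offload: %d mismatching elements\n", mismatches);
    return mismatches;
}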
@@ -0,0 +1,137 @@
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define ARRAY_SIZE (1024*1024*512)
#define ITERATIONS 350

int run_cpu( int argc, char** argv ) {
    printf( "The total memory allocated is %7.3lf MB.\n",
            2.0*sizeof(double)*ARRAY_SIZE/1024/1024 );

    double* a = NULL;
    double* b = NULL;
    int num_errors = 0;
    double time = 0;
    double start_time = 0;
    double scalar = 8.0;
    int iterations = ITERATIONS;
    double iteration_time[ITERATIONS];

    a = (double *) malloc( sizeof(double)*ARRAY_SIZE );
    b = (double *) malloc( sizeof(double)*ARRAY_SIZE );

    // initialize on the host
    #pragma omp parallel for
    for (size_t j=0; j<ARRAY_SIZE; j++)
    {
        a[j] = 0.0;
        b[j] = j;
    }

    start_time = omp_get_wtime();

    for (size_t i=0; i<iterations; i++)
    {
        iteration_time[i] = omp_get_wtime();
        #pragma omp parallel for
        for (size_t j=0; j<ARRAY_SIZE; j++) {
            a[j] = a[j] + scalar*b[j];
        }
        iteration_time[i] = omp_get_wtime() - iteration_time[i];
    }

    time = omp_get_wtime() - start_time;

    printf("Time (s): %lf\n", time);

    // error checking: after N iterations, a[j] = N*scalar*j
    for (size_t j=0; j<ARRAY_SIZE; j++) {
        if( fabs(a[j] - (double)j*iterations*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

    free(a);
    free(b);

    if( num_errors == 0 ) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}

int run_gpu( int argc, char** argv )
{
    printf( "The total memory allocated is %7.3lf MB.\n",
            2.0*sizeof(double)*ARRAY_SIZE/1024/1024 );

    double* a = NULL;
    double* b = NULL;
    int num_errors = 0;
    double time = 0;
    double start_time = 0;
    double scalar = 8.0;
    int iterations = ITERATIONS;
    double iteration_time[ITERATIONS];

    a = (double *) malloc( sizeof(double)*ARRAY_SIZE );
    b = (double *) malloc( sizeof(double)*ARRAY_SIZE );

    // initialize on the host
    #pragma omp parallel for
    for (size_t j=0; j<ARRAY_SIZE; j++)
    {
        a[j] = 0.0;
        b[j] = j;
    }

    // keep a and b resident on the device for all iterations
    #pragma omp target enter data map(to:a[0:ARRAY_SIZE])
    #pragma omp target enter data map(to:b[0:ARRAY_SIZE])

    start_time = omp_get_wtime();

    for (size_t i=0; i<iterations; i++)
    {
        iteration_time[i] = omp_get_wtime();
        #pragma omp target teams distribute parallel for
        for (size_t j=0; j<ARRAY_SIZE; j++) {
            a[j] = a[j] + scalar*b[j];
        }
        iteration_time[i] = omp_get_wtime() - iteration_time[i];
    }

    time = omp_get_wtime() - start_time;

    // copy the accumulated result back for checking on the host
    #pragma omp target update from(a[0:ARRAY_SIZE])

    printf("Time (s): %lf\n", time);

    // error checking: after N iterations, a[j] = N*scalar*j
    for (size_t j=0; j<ARRAY_SIZE; j++) {
        if( fabs(a[j] - (double)j*iterations*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

    #pragma omp target exit data map(release:a[0:ARRAY_SIZE])
    #pragma omp target exit data map(release:b[0:ARRAY_SIZE])

    free(a);
    free(b);

    if( num_errors == 0 ) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}

int main( int argc, char** argv ) {
    run_cpu(argc, argv);
    run_gpu(argc, argv);
    return 0;
}
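
Both benchmark routines above record per-iteration timings in iteration_time but never report them. A small helper could summarize the array; the function name and the three-words-per-element bandwidth model are assumptions (the kernel a[j] = a[j] + scalar*b[j] reads a and b and writes a, i.e. 3 doubles of traffic per element):

#include <stdio.h>
#include <stddef.h>

/* Summarize per-iteration timings and estimate effective bandwidth. */
static void report_iterations(const double *iteration_time, int iterations,
                              size_t array_size) {
    double best = iteration_time[0], total = 0.0;
    for (int i = 0; i < iterations; i++) {
        total += iteration_time[i];
        if (iteration_time[i] < best) best = iteration_time[i];
    }
    double bytes = 3.0 * sizeof(double) * (double)array_size;  /* per iteration */
    printf("avg iteration: %.6f s, best: %.6f s, best bandwidth: %.2f GB/s\n",
           total / iterations, best, bytes / best / 1e9);
}

A call such as report_iterations(iteration_time, iterations, (size_t)ARRAY_SIZE); right after each timing loop would then print the statistics for the CPU and GPU runs separately.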