Skip to content

Commit

Permalink
Trying to integrate OpenMP target offload for AMD and NVIDIA
Browse files Browse the repository at this point in the history
  • Loading branch information
khuck committed Jul 15, 2021
1 parent 64c4f4c commit 4794748
Show file tree
Hide file tree
Showing 3 changed files with 367 additions and 0 deletions.
68 changes: 68 additions & 0 deletions src/openmp/ompt_target_daxpy.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

/*
 * daxpy: a[j] += b[j] * scalar for j in [0, num_elements).
 *
 * The loop is offloaded with OpenMP target offload: 'a' is mapped tofrom
 * and 'b' to.  If the arrays are already present on the device (e.g. via
 * 'target enter data' in the caller), the map clauses only adjust device
 * reference counts and no transfer occurs.
 *
 * a            in/out vector of length num_elements, updated in place
 * b            input vector of length num_elements, unmodified
 * scalar       multiplier applied to b
 * num_elements element count; non-positive values make the loop a no-op
 */
void daxpy( double * __restrict__ a, double * __restrict__ b,
double scalar, int num_elements ) {

    /* int index matches 'num_elements' and removes the signed/unsigned
     * comparison the original size_t index produced (-Wsign-compare). */
#pragma omp target teams distribute parallel for simd map(tofrom:a[0:num_elements]) map(to:b[0:num_elements])
    for (int j = 0; j < num_elements; j++) {
        a[j] = a[j] + b[j] * scalar;
    }
}

/*
 * Driver: computes a = scalar*b on the device, then c = scalar*a, so
 * c[j] should equal j * scalar^2.  Verifies the result on the host and
 * asserts on any mismatch.
 */
int main( int argc, char** argv )
{
    double scalar = 8.0;
    int num_errors = 0;
    int num_elements = 1024;

    double* a = malloc( sizeof(double)*num_elements );
    double* b = malloc( sizeof(double)*num_elements );
    double* c = malloc( sizeof(double)*num_elements );

    /* Fail fast on allocation failure instead of dereferencing NULL
     * in the initialization loop below (free(NULL) is a no-op). */
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Out of memory\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // initialize on the host
    for (size_t j=0; j<(size_t)num_elements; j++) {
        a[j] = 0.0;
        c[j] = 0.0;
        b[j] = j;
    }

    /* Map all three arrays to the device once; they stay resident across
     * both daxpy calls, so daxpy's own map clauses transfer nothing. */
#pragma omp target enter data map(to:a[0:num_elements])
#pragma omp target enter data map(to:b[0:num_elements])
#pragma omp target enter data map(to:c[0:num_elements])

    daxpy( a, b, scalar, num_elements );  /* a = scalar * b            */
    daxpy( c, a, scalar, num_elements );  /* c = scalar * a = s^2 * b  */

    /* Bring the final result back to the host for checking. */
#pragma omp target update from(c[0:num_elements])

    // error checking
    for (size_t j=0; j<(size_t)num_elements; j++) {
        if( fabs(c[j] - (double)j*scalar*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

#pragma omp target exit data map(release:c[0:num_elements])
#pragma omp target exit data map(release:a[0:num_elements])
#pragma omp target exit data map(release:b[0:num_elements])

    free(a);
    free(b);
    free(c);

    if(num_errors == 0) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}
162 changes: 162 additions & 0 deletions src/openmp/ompt_target_matmult.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/******************************************************************************
* OpenMp Example - Matrix Multiply - C Version
* Demonstrates a matrix multiply using OpenMP.
*
* Modified from here:
* https://computing.llnl.gov/tutorials/openMP/samples/C/omp_mm.c
*
* For PAPI_FP_INS, the exclusive count for the event:
* for (null) [OpenMP location: file:matmult.c ]
* should be 2E+06 / Number of Threads
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>

#ifndef MATRIX_SIZE
#define MATRIX_SIZE 4096 /* matrix edge length; override with -DMATRIX_SIZE=<n> */
#endif

#define MAX_ITERATIONS 3 /* times main() repeats the whole multiply */
#define NRA MATRIX_SIZE /* number of rows in matrix A */
#define NCA MATRIX_SIZE /* number of columns in matrix A */
#define NCB MATRIX_SIZE /* number of columns in matrix B */

/* Row-major element access.  NOTE(review): the row stride is hard-coded
 * to NRA, so this is only correct while every matrix in this file is
 * MATRIX_SIZE x MATRIX_SIZE — confirm before using non-square sizes. */
#define elem(_m,_i,_j) (_m[((_i)*NRA) + (_j)])

/*
 * Allocate a rows x cols matrix of doubles on the host and create a
 * matching (uninitialized) allocation on the OpenMP target device.
 * Returns NULL on host allocation failure.  Release with freeMatrix().
 */
double* allocateMatrix(int rows, int cols) {
    /* Bug fix: the element type is double, not double* — the two sizes
     * happen to match on common 64-bit ABIs, which hid the error.
     * size_t arithmetic avoids int overflow for large rows*cols. */
    double *matrix = malloc(sizeof *matrix * (size_t)rows * (size_t)cols);
    if (matrix == NULL) {
        return NULL; /* don't hand a NULL base pointer to the device map */
    }
#pragma omp target enter data map(alloc:matrix[0:rows*cols])
    return matrix;
}

/* Fill matrix[i][j] with i + j, sharing the row loop among host threads. */
void initialize(double *matrix, int rows, int cols) {
    int row, col;
#pragma omp parallel for private(col) shared(matrix)
    for (row = 0; row < rows; row++) {
        for (col = 0; col < cols; col++) {
            elem(matrix, row, col) = row + col;
        }
    }
}

/* Release the device allocation created by allocateMatrix(), then the host
 * buffer.  'rows'/'cols' must match the values used at allocation time so
 * the device array section has the right extent; map(delete:) removes the
 * device copy regardless of its current reference count. */
void freeMatrix(double* matrix, int rows, int cols) {
#pragma omp target exit data map(delete:matrix[0:rows*cols])
free(matrix);
}

/////////////////////////////////////////////////////////////////////
// compute multiplies a and b and returns the result in c using ijk.
// cols_a and rows_b are the same value
/////////////////////////////////////////////////////////////////////
/* Host matrix multiply in classic ijk order; rows of the output are
 * divided among the host threads.  c must be zero/pre-initialized by
 * the caller since the product is accumulated into it. */
void compute(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
    printf("%s\n", __func__);
    int row, col, inner;
#pragma omp parallel for private(col, inner) shared(a, b, c)
    for (row = 0; row < rows_a; row++) {
        for (col = 0; col < cols_b; col++) {
            for (inner = 0; inner < cols_a; inner++) {
                elem(c, row, col) += elem(a, row, inner) * elem(b, inner, col);
            }
        }
    }
}

///////////////////////////////////////////////////////////////////////
// compute_interchange multiplies a and b and returns the result in c
// using ikj loop. cols_a and rows_b are the same value
///////////////////////////////////////////////////////////////////////
/* Host matrix multiply in ikj order: the innermost loop walks both b and
 * c row-wise, the cache-friendly direction for row-major storage.
 * Accumulates into c, same contract as compute(). */
void compute_interchange(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
    printf("%s\n", __func__);
    int row, inner, col;
#pragma omp parallel for private(inner, col) shared(a, b, c)
    for (row = 0; row < rows_a; row++) {
        for (inner = 0; inner < cols_a; inner++) {
            for (col = 0; col < cols_b; col++) {
                elem(c, row, col) += elem(a, row, inner) * elem(b, inner, col);
            }
        }
    }
}

///////////////////////////////////////////////////////////////////////
// compute_target multiplies a and b and returns the result in c using
// an ijk loop offloaded to the target device. cols_a and rows_b are the
// same value
///////////////////////////////////////////////////////////////////////
/*
 * compute_target: c += a * b on the OpenMP target device (ijk order).
 * cols_a and rows_b are the same value.
 *
 * The matrices are already *allocated* on the device by allocateMatrix()
 * ('target enter data map(alloc:...)'), so they are "present" there.  The
 * original 'target data map(to:/tofrom:)' clauses therefore transferred
 * nothing — a present mapping only has its reference count incremented —
 * so the kernel read uninitialized device buffers and the host never saw
 * the product.  Explicit 'target update' transfers fix both directions.
 */
void compute_target(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
    printf("%s\n", __func__);
    int i, j, k;
    /* Push host-initialized contents of all three matrices to the device
     * (c's initial values matter too, since we accumulate into it). */
#pragma omp target update to(a[0:rows_a*cols_a], b[0:cols_a*cols_b], c[0:rows_a*cols_b])
#pragma omp target teams distribute parallel for collapse(2) private(i,j,k)
    for (i = 0; i < rows_a; i++) {
        for (j = 0; j < cols_b; j++) {
            for (k = 0; k < cols_a; k++) {
                elem(c,i,j) += elem(a,i,k) * elem(b,k,j);
            }
        }
    }
    /* Pull the product back so the host can read it. */
#pragma omp target update from(c[0:rows_a*cols_b])
}

/*
 * Allocate, initialize, and multiply an NRA x NCA matrix by an NCA x NCB
 * matrix on the device, returning element (0,1) of the product as a spot
 * check.  All matrices are freed (host and device) before returning.
 */
double do_work(void) {
    double *a = allocateMatrix(NRA, NCA);  /* matrix A to be multiplied */
    double *b = allocateMatrix(NCA, NCB);  /* matrix B to be multiplied */
    double *c = allocateMatrix(NRA, NCB);  /* result matrix C */

    initialize(a, NRA, NCA);
    initialize(b, NCA, NCB);
    initialize(c, NRA, NCB);

    /* Alternative host implementations kept for comparison/debugging. */
    // compute(a, b, c, NRA, NCA, NCB);
    // compute_interchange(a, b, c, NRA, NCA, NCB);
    compute_target(a, b, c, NRA, NCA, NCB);

    double result = elem(c, 0, 1);

    freeMatrix(a, NRA, NCA);
    freeMatrix(b, NCA, NCB);
    /* Bug fix: c is NRA x NCB, not NCA x NCB.  The values are equal here
     * (square matrices) but the mismatch breaks for non-square sizes. */
    freeMatrix(c, NRA, NCB);

    return result;
}

/* Run the device matrix multiply MAX_ITERATIONS times. */
int main (int argc, char *argv[])
{
    int i;
    for (i = 0 ; i < MAX_ITERATIONS ; i++) {
        /* Report 1-based progress: "Iteration 1 of 3", not "0 of 3". */
        printf("Iteration %d of %d:...\n", i + 1, MAX_ITERATIONS);
        do_work();
    }

    printf ("Done.\n");

    return 0;
}

137 changes: 137 additions & 0 deletions src/openmp/ompt_target_vector_add.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define ARRAY_SIZE 1024*1024*512
#define ITERATIONS 350

/* Host-only baseline: repeatedly accumulates x += scalar*y over ARRAY_SIZE
 * doubles with host threads, times the whole run, and verifies the result
 * (after 'iterations' passes, x[j] must equal j * iterations * scalar). */
int run_cpu( int argc, char** argv ) {
    printf( "The total memory allocated is %7.3lf MB.\n",
            2.0*sizeof(double)*ARRAY_SIZE/1024/1024 );

    const double scalar = 8.0;
    const int iterations = ITERATIONS;
    double iteration_time[ITERATIONS];
    int num_errors = 0;

    double *x = malloc( sizeof(double)*ARRAY_SIZE );
    double *y = malloc( sizeof(double)*ARRAY_SIZE );

    /* Host-side initialization: x = 0, y = index. */
#pragma omp parallel for
    for (size_t j = 0; j < ARRAY_SIZE; j++) {
        x[j] = 0.0;
        y[j] = j;
    }

    const double start_time = omp_get_wtime();
    for (int i = 0; i < iterations; i++) {
        const double t0 = omp_get_wtime();
#pragma omp parallel for
        for (size_t j = 0; j < ARRAY_SIZE; j++) {
            x[j] = x[j] + scalar*y[j];
        }
        iteration_time[i] = omp_get_wtime() - t0;
    }
    const double time = omp_get_wtime() - start_time;

    printf("Time (s): %lf\n", time);

    // error checking
    for (size_t j = 0; j < ARRAY_SIZE; j++) {
        if( fabs(x[j] - (double)j*iterations*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

    free(x);
    free(y);

    if( num_errors == 0 ) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}

/* Device version of run_cpu: the arrays stay resident on the device for
 * all iterations; only the final result is copied back for checking. */
int run_gpu( int argc, char** argv )
{
    printf( "The total memory allocated is %7.3lf MB.\n",
            2.0*sizeof(double)*ARRAY_SIZE/1024/1024 );

    const double scalar = 8.0;
    const int iterations = ITERATIONS;
    double iteration_time[ITERATIONS];
    int num_errors = 0;

    double *x = malloc( sizeof(double)*ARRAY_SIZE );
    double *y = malloc( sizeof(double)*ARRAY_SIZE );

    /* Host-side initialization: x = 0, y = index. */
#pragma omp parallel for
    for (size_t j = 0; j < ARRAY_SIZE; j++) {
        x[j] = 0.0;
        y[j] = j;
    }

    /* Copy both arrays to the device once, up front. */
#pragma omp target enter data map(to:x[0:ARRAY_SIZE])
#pragma omp target enter data map(to:y[0:ARRAY_SIZE])

    const double start_time = omp_get_wtime();
    for (int i = 0; i < iterations; i++) {
        const double t0 = omp_get_wtime();
#pragma omp target teams distribute parallel for
        for (size_t j = 0; j < ARRAY_SIZE; j++) {
            x[j] = x[j] + scalar*y[j];
        }
        iteration_time[i] = omp_get_wtime() - t0;
    }
    const double time = omp_get_wtime() - start_time;

    /* Bring the accumulated result back for host-side verification. */
#pragma omp target update from(x[0:ARRAY_SIZE])

    printf("Time (s): %lf\n", time);

    // error checking
    for (size_t j = 0; j < ARRAY_SIZE; j++) {
        if( fabs(x[j] - (double)j*iterations*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

#pragma omp target exit data map(release:x[0:ARRAY_SIZE])
#pragma omp target exit data map(release:y[0:ARRAY_SIZE])

    free(x);
    free(y);

    if( num_errors == 0 ) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}


/* Run the host baseline first, then the device version.  Propagates a
 * non-zero status if either run reports one (both currently return 0 or
 * abort via assert, so the observable behavior is unchanged). */
int main( int argc, char** argv ) {
    int rc = run_cpu(argc, argv);
    rc |= run_gpu(argc, argv);
    return rc;
}

0 comments on commit 4794748

Please sign in to comment.