Trying to integrate OpenMP target offload for AMD and NVIDIA
Showing 3 changed files with 367 additions and 0 deletions.
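
None of the three files below checks whether an offload device is actually visible at run time, and OpenMP target regions silently fall back to the host when no device is found. Not part of the commit, but as a minimal host-side sanity check (using only the standard omp_get_num_devices() and omp_get_default_device() calls) one might add something like:

#include <stdio.h>
#include <omp.h>

int main(void) {
    /* How many offload devices does the runtime see? Zero means every
     * target region in the files below will run on the host instead. */
    int ndev = omp_get_num_devices();
    if (ndev == 0) {
        printf("No offload devices found; target regions run on the host.\n");
    } else {
        printf("%d offload device(s) available, default device: %d\n",
               ndev, omp_get_default_device());
    }
    return 0;
}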
@@ -0,0 +1,68 @@
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

void daxpy( double * __restrict__ a, double * __restrict__ b,
            double scalar, int num_elements ) {

    // The map clauses make the kernel usable on its own; when the arrays are
    // already present on the device (mapped by the enter data directives in
    // main), the present table means no extra transfers happen here.
    #pragma omp target teams distribute parallel for simd map(tofrom:a[0:num_elements]) map(to:b[0:num_elements])
    for (size_t j=0; j<num_elements; j++) {
        a[j] = a[j] + b[j] * scalar;
    }

    return;
}

int main( int argc, char** argv )
{
    double* a = NULL;
    double* b = NULL;
    double* c = NULL;
    double scalar = 8.0;
    int num_errors = 0;
    int num_elements = 1024;

    a = (double *) malloc( sizeof(double)*num_elements );
    b = (double *) malloc( sizeof(double)*num_elements );
    c = (double *) malloc( sizeof(double)*num_elements );

    // initialize on the host
    for (size_t j=0; j<num_elements; j++) {
        a[j] = 0.0;
        c[j] = 0.0;
        b[j] = j;
    }

    #pragma omp target enter data map(to:a[0:num_elements])
    #pragma omp target enter data map(to:b[0:num_elements])
    #pragma omp target enter data map(to:c[0:num_elements])

    daxpy( a, b, scalar, num_elements );

    daxpy( c, a, scalar, num_elements );

    #pragma omp target update from(c[0:num_elements])

    // error checking: the first call makes a[j] = j*scalar,
    // so the second leaves c[j] = j*scalar*scalar
    for (size_t j=0; j<num_elements; j++) {
        if( fabs(c[j] - (double)j*scalar*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

    #pragma omp target exit data map(release:c[0:num_elements])
    #pragma omp target exit data map(release:a[0:num_elements])
    #pragma omp target exit data map(release:b[0:num_elements])

    free(a);
    free(b);
    free(c);

    if(num_errors == 0) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}
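
A companion sketch, again not part of the commit: detecting silent host fallback from inside a target region itself. omp_is_initial_device() is standard OpenMP and returns nonzero when the calling region is executing on the host, so a check like this could be dropped into any of the examples' main() while the AMD and NVIDIA toolchains are being brought up:

#include <stdio.h>
#include <omp.h>

int main(void) {
    int on_host = 1;
    /* The flag is written on the device (if any) and mapped back. */
    #pragma omp target map(tofrom: on_host)
    {
        on_host = omp_is_initial_device();
    }
    printf("%s\n", on_host ? "Target region ran on the host (fallback)."
                           : "Target region ran on an offload device.");
    return 0;
}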
@@ -0,0 +1,162 @@
/******************************************************************************
 * OpenMP Example - Matrix Multiply - C Version
 * Demonstrates a matrix multiply using OpenMP.
 *
 * Modified from here:
 * https://computing.llnl.gov/tutorials/openMP/samples/C/omp_mm.c
 *
 * For PAPI_FP_INS, the exclusive count for the event:
 *   for (null) [OpenMP location: file:matmult.c ]
 * should be 2E+06 / Number of Threads
 ******************************************************************************/
#include <stdio.h>
#include <stdlib.h>

#ifndef MATRIX_SIZE
#define MATRIX_SIZE 4096
#endif

#define MAX_ITERATIONS 3
#define NRA MATRIX_SIZE /* number of rows in matrix A */
#define NCA MATRIX_SIZE /* number of columns in matrix A */
#define NCB MATRIX_SIZE /* number of columns in matrix B */

/* Row-major indexing; all matrices here are square (NRA == NCA == NCB),
 * so using NRA as the stride works for every matrix in this example. */
#define elem(_m,_i,_j) (_m[((_i)*NRA) + (_j)])

double* allocateMatrix(int rows, int cols) {
  double *matrix = (double*)malloc(sizeof(double) * rows * cols);
  /* Reserve a matching (uninitialized) allocation on the device. */
  #pragma omp target enter data map(alloc:matrix[0:rows*cols])
  return matrix;
}

void initialize(double *matrix, int rows, int cols) {
  int i,j;
  #pragma omp parallel private(i,j) shared(matrix)
  {
    /*** Initialize matrices ***/
    #pragma omp for nowait
    for (i=0; i<rows; i++) {
      for (j=0; j<cols; j++) {
        elem(matrix,i,j) = i+j;
      }
    }
  }
  /* The device copy was only allocated, never filled: push the host values
   * explicitly, since later map(to:) clauses see the data as already present
   * and will not copy. */
  #pragma omp target update to(matrix[0:rows*cols])
}

void freeMatrix(double* matrix, int rows, int cols) {
  #pragma omp target exit data map(delete:matrix[0:rows*cols])
  free(matrix);
}

/////////////////////////////////////////////////////////////////////
// compute multiplies a and b and returns the result in c using the
// ijk loop order. cols_a and rows_b are the same value.
/////////////////////////////////////////////////////////////////////
void compute(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
  int i,j,k;
  printf("%s\n", __func__);
  #pragma omp parallel private(i,j,k) shared(a,b,c)
  {
    /*** Do matrix multiply sharing iterations on outer loop ***/
    #pragma omp for nowait
    for (i=0; i<rows_a; i++) {
      for (j=0; j<cols_b; j++) {
        for (k=0; k<cols_a; k++) {
          elem(c,i,j) += elem(a,i,k) * elem(b,k,j);
        }
      }
    }
  } /*** End of parallel region ***/
}

///////////////////////////////////////////////////////////////////////
// compute_interchange multiplies a and b and returns the result in c
// using the ikj loop order. cols_a and rows_b are the same value.
///////////////////////////////////////////////////////////////////////
void compute_interchange(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
  int i,j,k;
  printf("%s\n", __func__);
  #pragma omp parallel private(i,j,k) shared(a,b,c)
  {
    /*** Do matrix multiply sharing iterations on outer loop ***/
    #pragma omp for nowait
    for (i=0; i<rows_a; i++) {
      for (k=0; k<cols_a; k++) {
        for (j=0; j<cols_b; j++) {
          elem(c,i,j) += elem(a,i,k) * elem(b,k,j);
        }
      }
    }
  } /*** End of parallel region ***/
}

///////////////////////////////////////////////////////////////////////
// compute_target multiplies a and b and returns the result in c using
// the ijk loop order, offloaded with OpenMP target directives.
// cols_a and rows_b are the same value.
///////////////////////////////////////////////////////////////////////
void compute_target(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
  printf("%s\n", __func__);
  int i, j, k;
  #pragma omp target data map(to: a[0:rows_a*cols_a], b[0:cols_a*cols_b]) map(tofrom: c[0:rows_a*cols_b])
  #pragma omp target
  #pragma omp teams distribute parallel for collapse(2) private(i,j,k)
  for (i=0; i<rows_a; i++) {
    for (j=0; j<cols_b; j++) {
      for (k=0; k<cols_a; k++) {
        elem(c,i,j) += elem(a,i,k) * elem(b,k,j);
      }
    }
  }
#if 0
  // This is a *very* simple offload statement, for debugging
  int z = 1;
  #pragma omp target map(tofrom: z)
  z = z + 1; // The copy of z on the device has a value of 2.
  printf("After the target region is executed, z = %d\n", z);
#endif
}

double do_work(void) {
  double *a,  /* matrix A to be multiplied */
         *b,  /* matrix B to be multiplied */
         *c;  /* result matrix C */
  a = allocateMatrix(NRA, NCA);
  b = allocateMatrix(NCA, NCB);
  c = allocateMatrix(NRA, NCB);

  initialize(a, NRA, NCA);
  initialize(b, NCA, NCB);
  initialize(c, NRA, NCB);

  // compute(a, b, c, NRA, NCA, NCB);
  // compute_interchange(a, b, c, NRA, NCA, NCB);
  compute_target(a, b, c, NRA, NCA, NCB);

  /* c was modified on the device; copy it back before reading it here.
   * The map(tofrom:) in compute_target does not trigger a transfer while
   * the array is still mapped by the enter data in allocateMatrix. */
  #pragma omp target update from(c[0:NRA*NCB])
  double result = elem(c,0,1);

  freeMatrix(a, NRA, NCA);
  freeMatrix(b, NCA, NCB);
  freeMatrix(c, NRA, NCB);

  return result;
}

int main (int argc, char *argv[])
{
  int i;
  for (i = 0 ; i < MAX_ITERATIONS ; i++) {
    printf("Iteration %d of %d:...\n", i+1, MAX_ITERATIONS);
    do_work();
  }

  printf ("Done.\n");

  return 0;
}
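
The matrix-multiply file already carries a host reference (compute_interchange) alongside the offloaded compute_target, but nothing compares their results. A hedged sketch of such a cross-check, reusing only the helpers defined above; the function name is an invention here, and it additionally assumes <math.h> is included for fabs and that a and b were allocated and initialized as in do_work:

// Hypothetical cross-check: run the offloaded ijk kernel and the host ikj
// kernel on identically initialized result matrices and count mismatches.
int verify_offload(double *a, double *b) {
    double *c_dev  = allocateMatrix(NRA, NCB);
    double *c_host = allocateMatrix(NRA, NCB);
    initialize(c_dev, NRA, NCB);
    initialize(c_host, NRA, NCB);

    compute_target(a, b, c_dev, NRA, NCA, NCB);        // device (or host fallback)
    compute_interchange(a, b, c_host, NRA, NCA, NCB);  // host reference

    // c_dev was modified on the device; bring it back before comparing.
    #pragma omp target update from(c_dev[0:NRA*NCB])

    int i, j, mismatches = 0;
    for (i = 0; i < NRA; i++)
        for (j = 0; j < NCB; j++)
            if (fabs(elem(c_dev,i,j) - elem(c_host,i,j)) >
                1e-6 * (1.0 + fabs(elem(c_host,i,j))))
                mismatches++;

    freeMatrix(c_dev, NRA, NCB);
    freeMatrix(c_host, NRA, NCB);
    printf("verify_offload: %d mismatching elements\n", mismatches);
    return mismatches;
}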
@@ -0,0 +1,137 @@
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define ARRAY_SIZE (1024*1024*512)
#define ITERATIONS 350

int run_cpu( int argc, char** argv ) {
    printf( "The total memory allocated is %7.3lf MB.\n",
            2.0*sizeof(double)*ARRAY_SIZE/1024/1024 );

    double* a = NULL;
    double* b = NULL;
    int num_errors = 0;
    double time = 0;
    double start_time = 0;
    double scalar = 8.0;
    int iterations = ITERATIONS;
    double iteration_time[ITERATIONS];

    a = (double *) malloc( sizeof(double)*ARRAY_SIZE );
    b = (double *) malloc( sizeof(double)*ARRAY_SIZE );

    // initialize on the host
    #pragma omp parallel for
    for (size_t j=0; j<ARRAY_SIZE; j++)
    {
        a[j] = 0.0;
        b[j] = j;
    }

    start_time = omp_get_wtime();

    for (size_t i=0; i<iterations; i++)
    {
        iteration_time[i] = omp_get_wtime();
        #pragma omp parallel for
        for (size_t j=0; j<ARRAY_SIZE; j++) {
            a[j] = a[j] + scalar*b[j];
        }
        iteration_time[i] = omp_get_wtime() - iteration_time[i];
    }

    time = omp_get_wtime() - start_time;

    printf("Time (s): %lf\n", time);

    // error checking: after N iterations, a[j] = N*scalar*j
    for (size_t j=0; j<ARRAY_SIZE; j++) {
        if( fabs(a[j] - (double)j*iterations*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

    free(a);
    free(b);

    if( num_errors == 0 ) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}

int run_gpu( int argc, char** argv )
{
    printf( "The total memory allocated is %7.3lf MB.\n",
            2.0*sizeof(double)*ARRAY_SIZE/1024/1024 );

    double* a = NULL;
    double* b = NULL;
    int num_errors = 0;
    double time = 0;
    double start_time = 0;
    double scalar = 8.0;
    int iterations = ITERATIONS;
    double iteration_time[ITERATIONS];

    a = (double *) malloc( sizeof(double)*ARRAY_SIZE );
    b = (double *) malloc( sizeof(double)*ARRAY_SIZE );

    // initialize on the host
    #pragma omp parallel for
    for (size_t j=0; j<ARRAY_SIZE; j++)
    {
        a[j] = 0.0;
        b[j] = j;
    }

    // keep a and b resident on the device for all iterations
    #pragma omp target enter data map(to:a[0:ARRAY_SIZE])
    #pragma omp target enter data map(to:b[0:ARRAY_SIZE])

    start_time = omp_get_wtime();

    for (size_t i=0; i<iterations; i++)
    {
        iteration_time[i] = omp_get_wtime();
        #pragma omp target teams distribute parallel for
        for (size_t j=0; j<ARRAY_SIZE; j++) {
            a[j] = a[j] + scalar*b[j];
        }
        iteration_time[i] = omp_get_wtime() - iteration_time[i];
    }

    time = omp_get_wtime() - start_time;

    // copy the accumulated result back for checking on the host
    #pragma omp target update from(a[0:ARRAY_SIZE])

    printf("Time (s): %lf\n", time);

    // error checking: after N iterations, a[j] = N*scalar*j
    for (size_t j=0; j<ARRAY_SIZE; j++) {
        if( fabs(a[j] - (double)j*iterations*scalar) > 0.000001 ) {
            num_errors++;
        }
    }

    #pragma omp target exit data map(release:a[0:ARRAY_SIZE])
    #pragma omp target exit data map(release:b[0:ARRAY_SIZE])

    free(a);
    free(b);

    if( num_errors == 0 ) printf( "Success!\n" );

    assert(num_errors == 0);

    return 0;
}

int main( int argc, char** argv ) {
    run_cpu(argc, argv);
    run_gpu(argc, argv);
    return 0;
}
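
Both benchmark routines above record per-iteration timings in iteration_time but never report them. A small helper could summarize the array; the function name and the three-words-per-element bandwidth model are assumptions (the kernel a[j] = a[j] + scalar*b[j] reads a and b and writes a, i.e. 3 doubles of traffic per element):

#include <stdio.h>
#include <stddef.h>

/* Summarize per-iteration timings and estimate effective bandwidth. */
static void report_iterations(const double *iteration_time, int iterations,
                              size_t array_size) {
    double best = iteration_time[0], total = 0.0;
    for (int i = 0; i < iterations; i++) {
        total += iteration_time[i];
        if (iteration_time[i] < best) best = iteration_time[i];
    }
    double bytes = 3.0 * sizeof(double) * (double)array_size;  /* per iteration */
    printf("avg iteration: %.6f s, best: %.6f s, best bandwidth: %.2f GB/s\n",
           total / iterations, best, bytes / best / 1e9);
}

A call such as report_iterations(iteration_time, iterations, (size_t)ARRAY_SIZE); right after each timing loop would then print the statistics for the CPU and GPU runs separately.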