Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenACC multicore + gpu in same binary #1761

Merged
merged 6 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions common/lib/share/mccode-r.c
Original file line number Diff line number Diff line change
Expand Up @@ -3432,7 +3432,7 @@ long sort_absorb_last(_class_particle* particles, _class_particle* pbuffer, long
// than l, resulting in idling. We should distribute lengths more evenly.

// step 1: sort sub-arrays
#pragma acc parallel loop present(particles, pbuffer)
#pragma acc parallel loop present(particles[0:buffer_len], pbuffer[0:buffer_len])
for (unsigned long tidx=0; tidx<SAL_THREADS; tidx++) {
long lo = l*tidx;
long loclen = l;
Expand Down Expand Up @@ -3477,7 +3477,7 @@ long sort_absorb_last(_class_particle* particles, _class_particle* pbuffer, long
}

// step 2: write non-absorbed sub-arrays to psorted/output from the left
#pragma acc parallel loop present(pbuffer)
#pragma acc parallel loop present(pbuffer[0:buffer_len])
for (unsigned long tidx=0; tidx<SAL_THREADS; tidx++) {
long j, k;
#pragma acc loop seq
Expand All @@ -3501,7 +3501,7 @@ long sort_absorb_last(_class_particle* particles, _class_particle* pbuffer, long
return accumlen;

// copy non-absorbed block
#pragma acc parallel loop present(particles)
#pragma acc parallel loop present(particles[0:buffer_len])
for (long tidx = 0; tidx < accumlen; tidx++) { // tidx: thread index
unsigned long randstate[7];
_class_particle sourcebuffer;
Expand Down
69 changes: 41 additions & 28 deletions mcstas/src/cogen.c.in
Original file line number Diff line number Diff line change
Expand Up @@ -1584,14 +1584,12 @@ void def_trace_section(struct instr_def *instr)
cout("/* (Similar defines are available in each comp trace but */");
cout("/* those are not enough to handle external libs etc. ) */");
cout("#ifdef OPENACC");
cout("#ifndef MULTICORE");
cout("#define fprintf(stderr,...) printf(__VA_ARGS__)");
cout("#define sprintf(string,...) printf(__VA_ARGS__)");
cout("#define exit(...) noprintf()");
cout("#define strcmp(a,b) str_comp(a,b)");
cout("#define strlen(a) str_len(a)");
cout("#endif");
cout("#endif");

/* define SCATTERED, ABSORB and RESTORE macros for all TRACE */
coutf("#define %s (_particle->%s)", "SCATTERED", "_scattered");
Expand Down Expand Up @@ -1636,15 +1634,13 @@ void undef_trace_section(struct instr_def *instr)
coutf("#undef %s", statepars_all[i]);

cout("#ifdef OPENACC");
cout("#ifndef MULTICORE");
cout("#undef strlen");
cout("#undef strcmp");
cout("#undef exit");
cout("#undef printf");
cout("#undef sprintf");
cout("#undef fprintf");
cout("#endif");
cout("#endif");
cout("#undef SCATTERED");
cout("#undef RESTORE" );
#if MCCODE_PROJECT == 1 /* neutron */
Expand Down Expand Up @@ -1969,11 +1965,14 @@ int cogen_raytrace(struct instr_def *instr)
coutf(" unsigned long long loops;");
coutf(" loops = ceil((double)ncount/gpu_innerloop);");
coutf(" /* if on GPU, printf has been globally nullified, re-enable here */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #undef printf");
coutf(" #endif");
coutf(" #endif");
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
coutf("");
coutf(" #ifdef OPENACC");
coutf(" if (ncount>gpu_innerloop) {");
Expand All @@ -1993,11 +1992,14 @@ int cogen_raytrace(struct instr_def *instr)
coutf(" #endif");
coutf("");
coutf(" /* if on GPU, re-nullify printf */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #define printf(...) noprintf()");
coutf(" #endif");
coutf(" #endif");
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
coutf("");
coutf(" #pragma acc parallel loop num_gangs(numgangs) vector_length(vecsize)");
coutf(" for (unsigned long pidx=0 ; pidx < gpu_innerloop ; pidx++) {");
Expand All @@ -2016,11 +2018,14 @@ int cogen_raytrace(struct instr_def *instr)
coutf(" seed = seed+gpu_innerloop;");
coutf(" } /* CPU for */");
coutf(" /* if on GPU, printf has been globally nullified, re-enable here */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #undef printf");
coutf(" #endif");
coutf(" #endif");
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
coutf(" MPI_MASTER(");
coutf(" printf(\"*** TRACE end *** \\n\");");
coutf(" );");
Expand Down Expand Up @@ -2072,13 +2077,15 @@ int cogen_rt_funnel(struct instr_def *instr)
coutf(" unsigned long long loops;");
coutf("");
coutf(" /* if on GPU, printf has been globally nullified, re-enable here */");
coutf(" #ifdef OPENACC");
coutf(" #ifndef MULTICORE");
coutf(" #undef printf");
coutf(" #endif");
coutf(" #endif");
coutf("");
/* Check if instrument uses JUMPS */
cout(" #ifdef OPENACC");
cout(" #undef strlen");
cout(" #undef strcmp");
cout(" #undef exit");
cout(" #undef printf");
cout(" #undef sprintf");
cout(" #undef fprintf");
cout(" #endif");
/* Check if instrument uses JUMPS */
liter = list_iterate(instr->complist);
while((comp = list_next(liter)) != NULL) {
if (list_len(comp->jump) > 0) { // JUMP ITERATE counters
Expand Down Expand Up @@ -2123,7 +2130,7 @@ int cogen_rt_funnel(struct instr_def *instr)

// init batch
coutf(" // init particles");
coutf(" #pragma acc parallel loop present(particles)");
coutf(" #pragma acc parallel loop present(particles[0:livebatchsize])");
coutf(" for (unsigned long pidx=0 ; pidx < livebatchsize ; pidx++) {");
coutf(" // generate particle state, set loop index and seed");
coutf(" particles[pidx] = mcgenstate();");
Expand Down Expand Up @@ -2174,7 +2181,11 @@ int cogen_rt_funnel(struct instr_def *instr)
if (first || (comp->cpuonly != cpuonly_last) || comp->split) {
coutf("");
if (comp->cpuonly == 0) {
coutf(" #pragma acc parallel loop present(particles)");
coutf(" #pragma acc parallel loop present(particles[0:livebatchsize])");
} else {
coutf(" #ifdef MULTICORE");
coutf(" #pragma acc parallel loop device_type(host)");
coutf(" #endif");
}
coutf(" for (unsigned long pidx=0 ; pidx < livebatchsize ; pidx++) {");
coutf(" _class_particle* _particle = &particles[pidx];");
Expand All @@ -2186,11 +2197,13 @@ int cogen_rt_funnel(struct instr_def *instr)

// coordinate transformations (wrt to PREVIOUS)
if (comp->skip_transform == 0) {
coutf("#ifndef MULTICORE");
coutf(" if (_%s_var._rotation_is_identity)", comp->name);
coutf(" coords_get("
"coords_add(coords_set(x,y,z), _%s_var._position_relative),"
"&x, &y, &z);", comp->name);
cout( " else");
coutf("#endif");
coutf(" mccoordschange(_%s_var._position_relative, _%s_var._rotation_relative, _particle);", comp->name, comp->name);
cout( " _particle_save = *_particle;");
}
Expand Down