Skip to content

Commit

Permalink
Merge pull request pmodels#6580 from hzhou/2307_pmi_runtime
Browse files Browse the repository at this point in the history
mpir_pmi: allow runtime pmi selection

Approved-by: Ken Raffenetti
  • Loading branch information
hzhou authored Jul 12, 2023
2 parents d1c91bb + 971c06a commit 229b9a9
Show file tree
Hide file tree
Showing 11 changed files with 1,515 additions and 792 deletions.
34 changes: 33 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -1621,7 +1621,6 @@ AM_CONDITIONAL([PRIMARY_PM_GFORKER],[test "X$first_pm_name" = "Xgforker"])
AM_CONDITIONAL([PRIMARY_PM_REMSHELL],[test "X$first_pm_name" = "Xremshell"])

# ---- $with_pmilib ----

pmisrcdir=""
AC_SUBST([pmisrcdir])
pmilib=""
Expand Down Expand Up @@ -1700,6 +1699,39 @@ case "$with_pmilib" in
;;
esac


# ---- define ENABLE_PMI[12X] ----
# Select which PMI protocols are compiled in.  Every enable_* flag starts as
# "no"; each flag that ends up "yes" is exported further down as the matching
# ENABLE_* preprocessor define.
enable_pmi1="no"
enable_pmi2="no"
enable_pmix="no"
if test "$with_pmi" = "pmi2" ; then
    # with_pmi names pmi2: enable PMI2 only
    enable_pmi2="yes"
elif test "$with_pmi" = "pmix" ; then
    # with_pmi names pmix: enable PMIx only
    enable_pmix="yes"
elif test "$with_pmilib" = "mpich" || test "$with_pmilib" = "install" ; then
    # mpich's libpmi supports both PMI1 and PMI2
    # (separate test commands joined with || replace the non-portable -o operand)
    enable_pmi1="yes"
    enable_pmi2="yes"
else
    # No selection matched above: probe for the init function of each protocol
    # and enable every protocol whose symbol is found.
    AC_CHECK_FUNC([PMI_Init], [enable_pmi1="yes"])
    AC_CHECK_FUNC([PMI2_Init], [enable_pmi2="yes"])
    AC_CHECK_FUNC([PMIx_Init], [enable_pmix="yes"])
    # Abort when none of the three probes succeeded.
    # (separate test commands joined with && replace the non-portable -a operand)
    if test "$enable_pmi1" != "yes" && test "$enable_pmi2" != "yes" && test "$enable_pmix" != "yes" ; then
        AC_MSG_ERROR([Neither PMI, nor PMI2, nor PMIx is enabled.])
    fi
fi

# Export one define per enabled protocol.
if test "$enable_pmi1" = "yes" ; then
    AC_DEFINE([ENABLE_PMI1], [1], [Define to enable PMI1 protocol])
fi
if test "$enable_pmi2" = "yes" ; then
    AC_DEFINE([ENABLE_PMI2], [1], [Define to enable PMI2 protocol])
fi
if test "$enable_pmix" = "yes" ; then
    AC_DEFINE([ENABLE_PMIX], [1], [Define to enable PMIX protocol])
fi

# ---------------------------------------------------------------------------
# Check for whether the compiler defines a symbol that contains the
# function name. The MPICH code uses this for debugging purposes.
Expand Down
29 changes: 12 additions & 17 deletions src/include/mpir_pmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,23 @@

#include "mpichconf.h"

#if !defined USE_PMI1_API && !defined USE_PMI2_API && !defined USE_PMIX_API
#define USE_PMI1_API
#endif

#ifdef ENABLE_PMI1
#if defined(USE_PMI1_SLURM)
#include <slurm/pmi.h>

#elif defined(USE_PMI2_SLURM)
#include <slurm/pmi2.h>

#elif defined(USE_PMI2_CRAY)
#include <pmi2.h>

#elif defined(USE_PMI1_API)
#else
#include <pmi.h>
#endif
#endif

#elif defined(USE_PMI2_API)
#ifdef ENABLE_PMI2
#if defined(USE_PMI2_SLURM)
#include <slurm/pmi2.h>
#else
#include <pmi2.h>
#define PMI_keyval_t PMI2_keyval_t
#endif
#endif

#elif defined(USE_PMIX_API)
#ifdef ENABLE_PMIX
#include <pmix.h>
#endif

Expand All @@ -55,7 +51,7 @@ int MPIR_pmi_set_threaded(int is_threaded);
int MPIR_pmi_max_key_size(void);
int MPIR_pmi_max_val_size(void);
const char *MPIR_pmi_job_id(void);
char *MPIR_pmi_get_hwloc_xmlfile(void);
char *MPIR_pmi_get_jobattr(const char *key); /* key must use "PMI_" prefix */

/* PMI wrapper utilities */

Expand Down Expand Up @@ -97,7 +93,6 @@ int MPIR_pmi_unpublish(const char name[]);

/* Other misc functions */
int MPIR_pmi_get_universe_size(int *universe_size);
char *MPIR_pmi_get_failed_procs(void);

struct MPIR_Info; /* forward declare (mpir_info.h) */
int MPIR_pmi_spawn_multiple(int count, char *commands[], char **argvs[],
Expand Down
2 changes: 1 addition & 1 deletion src/mpi/comm/ulfm_impl.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ int MPIR_Comm_get_failed_impl(MPIR_Comm * comm_ptr, MPIR_Group ** failed_group_p
int mpi_errno = MPI_SUCCESS;
MPIR_FUNC_ENTER;

char *failed_procs_string = MPIR_pmi_get_failed_procs();
char *failed_procs_string = MPIR_pmi_get_jobattr("PMI_dead_processes");

if (!failed_procs_string) {
*failed_group_ptr = MPIR_Group_empty;
Expand Down
2 changes: 1 addition & 1 deletion src/mpid/ch3/src/ch3u_handle_connection.c
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ int MPIDI_CH3U_Check_for_failed_procs(void)

MPIR_FUNC_ENTER;

MPIDI_failed_procs_string = MPIR_pmi_get_failed_procs();
MPIDI_failed_procs_string = MPIR_pmi_get_jobattr("PMI_dead_processes");

if (*MPIDI_failed_procs_string == '\0') {
/* there are no failed processes */
Expand Down
2 changes: 1 addition & 1 deletion src/mpid/ch4/src/ch4_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ int MPIDI_check_for_failed_procs(void)
* with the rank, then we need to create the failed group from
* something bigger than comm_world. */

char *failed_procs_string = MPIR_pmi_get_failed_procs();
char *failed_procs_string = MPIR_pmi_get_jobattr("PMI_dead_processes");

if (failed_procs_string) {
MPL_free(failed_procs_string);
Expand Down
2 changes: 2 additions & 0 deletions src/pmi/include/pmi2.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
#ifndef PMI2_H_INCLUDED
#define PMI2_H_INCLUDED

#ifndef PMI_VERSION
#define PMI_VERSION 2
#define PMI_SUBVERSION 0
#endif

#define PMI2_MAX_KEYLEN 64
#define PMI2_MAX_VALLEN 1024
Expand Down
2 changes: 1 addition & 1 deletion src/util/mpir_hwtopo.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ int MPII_hwtopo_init(void)
#ifdef HAVE_HWLOC
bindset = hwloc_bitmap_alloc();
hwloc_topology_init(&hwloc_topology);
char *xmlfile = MPIR_pmi_get_hwloc_xmlfile();
char *xmlfile = MPIR_pmi_get_jobattr("PMI_hwloc_xmlfile");
if (xmlfile != NULL) {
int rc;
rc = hwloc_topology_set_xml(hwloc_topology, xmlfile);
Expand Down
Loading

0 comments on commit 229b9a9

Please sign in to comment.