Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-17111 cart: Fix csm_alive_count #15945

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 44 additions & 18 deletions src/cart/crt_swim.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -179,6 +180,12 @@ crt_swim_membs_iterate(struct crt_swim_membs *csm, d_hash_traverse_cb_t cb, void
return d_hash_table_traverse(csm->csm_table, cb, arg);
}

/*
 * Report whether a SWIM member status is one of the two "still counted"
 * states: returns true for SWIM_MEMBER_ALIVE and SWIM_MEMBER_SUSPECT,
 * false for every other status value.
 */
static inline bool
crt_swim_status_alive_or_suspect(enum swim_member_status status)
{
	switch (status) {
	case SWIM_MEMBER_ALIVE:
	case SWIM_MEMBER_SUSPECT:
		return true;
	default:
		return false;
	}
}

/* Move cst into the csm. */
static int
crt_swim_membs_add(struct crt_swim_membs *csm, struct crt_swim_target *cst)
Expand Down Expand Up @@ -214,6 +221,9 @@ crt_swim_membs_add(struct crt_swim_membs *csm, struct crt_swim_target *cst)
if (csm->csm_target == CRT_SWIM_TARGET_INVALID)
csm->csm_target = 0;

if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status))
csm->csm_alive_or_suspect_count++;

return 0;
}

Expand Down Expand Up @@ -256,6 +266,9 @@ crt_swim_membs_del(struct crt_swim_membs *csm, d_rank_t rank)
deleted = d_hash_rec_delete_at(csm->csm_table, &cst->cst_link);
D_ASSERT(deleted);

if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status))
csm->csm_alive_or_suspect_count--;

return cst;
}

Expand Down Expand Up @@ -952,12 +965,12 @@ static int crt_swim_set_member_state(struct swim_context *ctx,
crt_swim_csm_lock(csm);
cst = crt_swim_membs_find(csm, id);
if (cst != NULL && state->sms_incarnation >= cst->cst_state.sms_incarnation) {
if (cst->cst_state.sms_status != SWIM_MEMBER_ALIVE &&
state->sms_status == SWIM_MEMBER_ALIVE)
csm->csm_alive_count++;
else if (cst->cst_state.sms_status == SWIM_MEMBER_ALIVE &&
state->sms_status != SWIM_MEMBER_ALIVE)
csm->csm_alive_count--;
if (!crt_swim_status_alive_or_suspect(cst->cst_state.sms_status) &&
crt_swim_status_alive_or_suspect(state->sms_status))
csm->csm_alive_or_suspect_count++;
else if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status) &&
!crt_swim_status_alive_or_suspect(state->sms_status))
csm->csm_alive_or_suspect_count--;
state_prev = cst->cst_state;
cst->cst_state = *state;
rc = 0;
Expand Down Expand Up @@ -1057,7 +1070,7 @@ static int64_t crt_swim_progress_cb(crt_context_t crt_ctx, int64_t timeout_us, v
* The max_delay should be less than the suspicion timeout to guarantee
* that already-suspected members will not expire.
*/
if (csm->csm_alive_count > 2) {
if (csm->csm_alive_or_suspect_count > 2) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you figured out why it's okay for us not to update/extend the suspicion timeout when alive_or_suspect_count is less than or equal to 2? I tend to think it's applicable no matter how many ranks there are.

uint64_t hlc1 = csm->csm_last_unpack_hlc;
uint64_t hlc2 = d_hlc_get();
uint64_t delay = d_hlc2msec(hlc2 - hlc1);
Expand Down Expand Up @@ -1150,7 +1163,7 @@ int crt_swim_init(int crt_ctx_idx)

csm->csm_crt_ctx_idx = crt_ctx_idx;
csm->csm_last_unpack_hlc = hlc;
csm->csm_alive_count = 0;
csm->csm_alive_or_suspect_count = 0;
csm->csm_nglitches = 0;
csm->csm_nmessages = 0;
/*
Expand Down Expand Up @@ -1350,31 +1363,42 @@ void crt_swim_disable_all(void)
old_ctx_idx, NULL);
}

/* Argument bundle passed to crt_swim_suspend_cb via crt_swim_membs_iterate. */
struct crt_swim_suspend_arg {
/* Membership table whose csm_alive_or_suspect_count the callback decrements. */
struct crt_swim_membs *csm;
/* ID compared against each target's cst_id; the matching target is left unchanged. */
swim_id_t self_id;
};

static int
crt_swim_suspend_cb(d_list_t *link, void *arg)
crt_swim_suspend_cb(d_list_t *link, void *varg)
{
struct crt_swim_target *cst = crt_swim_target_obj(link);
swim_id_t *self_id = arg;
struct crt_swim_target *cst = crt_swim_target_obj(link);
struct crt_swim_suspend_arg *arg = varg;

if (cst->cst_id != *self_id)
if (cst->cst_id != arg->self_id) {
if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status))
arg->csm->csm_alive_or_suspect_count--;
cst->cst_state.sms_status = SWIM_MEMBER_INACTIVE;
}
return 0;
}

void crt_swim_suspend_all(void)
{
struct crt_grp_priv *grp_priv = crt_gdata.cg_grp->gg_primary_grp;
struct crt_swim_membs *csm = &grp_priv->gp_membs_swim;
swim_id_t self_id;
int rc;
struct crt_grp_priv *grp_priv = crt_gdata.cg_grp->gg_primary_grp;
struct crt_swim_membs *csm = &grp_priv->gp_membs_swim;
struct crt_swim_suspend_arg arg;
int rc;

if (!crt_gdata.cg_swim_inited)
return;

csm->csm_ctx->sc_glitch = 1;
self_id = swim_self_get(csm->csm_ctx);

arg.csm = csm;
arg.self_id = swim_self_get(csm->csm_ctx);

crt_swim_csm_lock(csm);
rc = crt_swim_membs_iterate(csm, crt_swim_suspend_cb, &self_id);
rc = crt_swim_membs_iterate(csm, crt_swim_suspend_cb, &arg);
D_ASSERTF(rc == 0, "suspend SWIM members: "DF_RC"\n", DP_RC(rc));
crt_swim_csm_unlock(csm);
}
Expand Down Expand Up @@ -1625,6 +1649,8 @@ crt_swim_rank_check(struct crt_grp_priv *grp_priv, d_rank_t rank, uint64_t incar
if (cst->cst_state.sms_incarnation < incarnation) {
state_prev = cst->cst_state;
cst->cst_state.sms_incarnation = incarnation;
if (!crt_swim_status_alive_or_suspect(cst->cst_state.sms_status))
csm->csm_alive_or_suspect_count++;
cst->cst_state.sms_status = SWIM_MEMBER_ALIVE;
state = cst->cst_state;
updated = true;
Expand Down
3 changes: 2 additions & 1 deletion src/cart/crt_swim.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2019-2022 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -47,7 +48,7 @@ struct crt_swim_membs {
struct swim_context *csm_ctx;
uint64_t csm_incarnation;
uint64_t csm_last_unpack_hlc;
uint64_t csm_alive_count;
uint64_t csm_alive_or_suspect_count;
int csm_crt_ctx_idx;
int csm_nglitches;
int csm_nmessages;
Expand Down