Skip to content

Commit

Permalink
ch4/ofi: Add NIC information to error output
Browse files Browse the repository at this point in the history
On systems with multiple NICs, it could be helpful to know which NIC an
error was detected on in case there are hardware issues that need
investigating. Add NIC information to the error-checking macros. For now, we
report the default NIC used by each process. TODO: extend to support
multi-NIC usage and take the device number as input for more
fine-grained reporting.
  • Loading branch information
raffenet committed Nov 25, 2024
1 parent 7be8fb0 commit e71f330
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 35 deletions.
6 changes: 3 additions & 3 deletions maint/extracterrmsgs
Original file line number Diff line number Diff line change
Expand Up @@ -680,12 +680,12 @@ sub ProcessFile
# add longnames since we omit errnames.txt for these
$longnames{"**ofid_$name"} = "OFI call $name failed";
$longnamesDefined{"**ofid_$name"} = "$filename:$linecount";
$longnames{"**ofid_$name %s"} = "OFI call $name failed (%s)";
$longnamesDefined{"**ofid_$name %s"} = "$filename:$linecount";
$longnames{"**ofid_$name %s %s"} = "OFI call $name failed (dev=%s: %s)";
$longnamesDefined{"**ofid_$name %s %s"} = "$filename:$linecount";
}

$generic_msgs{"**ofid_$name"}++;
$specific_msgs{"**ofid_$name %s"}++;
$specific_msgs{"**ofid_$name %s %s"}++;

next;
}
Expand Down
10 changes: 6 additions & 4 deletions src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
do {
ret = fi_cntr_wait(rcv_cntr, 1, 1);
MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT,
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
mpi_errno, MPI_ERR_RMA_RANGE,
"**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret));
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
MPID_Progress_test(NULL);
} while (ret == -FI_ETIMEDOUT);
} else {
Expand All @@ -89,9 +90,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
do {
ret = fi_cntr_wait(snd_cntr, num_children, 1);
MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT,
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
mpi_errno, MPI_ERR_RMA_RANGE,
"**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret));
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
MPID_Progress_test(NULL);
} while (ret == -FI_ETIMEDOUT);
} else {
Expand Down
10 changes: 6 additions & 4 deletions src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
do {
ret = fi_cntr_wait(rcv_cntr, 1, 1);
MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT,
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
mpi_errno, MPI_ERR_RMA_RANGE,
"**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret));
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
MPID_Progress_test(NULL);
} while (ret == -FI_ETIMEDOUT);
} else {
Expand All @@ -88,9 +89,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
do {
ret = fi_cntr_wait(snd_cntr, num_children, 1);
MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT,
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
mpi_errno, MPI_ERR_RMA_RANGE,
"**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret));
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
MPID_Progress_test(NULL);
} while (ret == -FI_ETIMEDOUT);
} else {
Expand Down
12 changes: 6 additions & 6 deletions src/mpid/ch4/netmod/ofi/errnames.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
# Most of the libfabric call error names are generated from MPIDI_OFI_CALL macros.
# Some of them are used explicitly via MPIR_ERR_CHKANDJUMP4; those need to be listed here.
**ofid_cancel:OFI cancel failed
**ofid_cancel %s:OFI cancel failed (%s)
**ofid_cancel %s %s:OFI cancel failed (dev=%s: %s)
**ofid_cntr_open:OFI Counter open failed
**ofid_cntr_open %s:OFI OFI Counter open failed (%s)
**ofid_cntr_open %s %s:OFI Counter open failed (dev=%s: %s)
**ofid_cntr_wait:OFI Counter wait failed
**ofid_cntr_wait %s:OFI OFI Counter wait failed (%s)
**ofid_cntr_wait %s %s:OFI Counter wait failed (dev=%s: %s)
**ofid_enable_trigger:OFI triggered ops enable failed
**ofid_enable_trigger %s:OFI triggered ops enable failed (%s)
**ofid_enable_trigger %s %s:OFI triggered ops enable failed (dev=%s: %s)
**ofid_issue_trigger:OFI triggered ops issue failed
**ofid_issue_trigger %s:OFI triggered ops issue failed (%s)
**ofid_issue_trigger %s %s:OFI triggered ops issue failed (dev=%s: %s)
**ofid_poll:OFI poll failed
**ofid_poll %s:OFI poll failed (%s)
**ofid_poll %s %s:OFI poll failed (dev=%s: %s)
16 changes: 10 additions & 6 deletions src/mpid/ch4/netmod/ofi/ofi_events.c
Original file line number Diff line number Diff line change
Expand Up @@ -737,8 +737,10 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret)
break;

default:
MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
"**ofid_poll %s", fi_strerror(e.err));
MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
"**ofid_poll %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME,
fi_strerror(e.err));
}

break;
Expand Down Expand Up @@ -781,15 +783,17 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret)
break;

default:
MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
"**ofid_poll %s", fi_strerror(e.err));
MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
"**ofid_poll %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(e.err));
}

break;

default:
MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
"**ofid_poll %s", fi_strerror(errno));
MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
"**ofid_poll %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(errno));
}

fn_exit:
Expand Down
19 changes: 13 additions & 6 deletions src/mpid/ch4/netmod/ofi/ofi_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ ATTRIBUTE((unused));

#define MPIDI_OFI_WIN(win) ((win)->dev.netmod.ofi)

/* Expands to the domain_attr->name field of the provider entry selected by
 * index `nic` in MPIDI_OFI_global.prov_use. The error macros pass this value
 * as the "dev=%s" argument of the "**ofid_* %s %s" messages. */
#define MPIDI_OFI_NIC_NAME(nic) (MPIDI_OFI_global.prov_use[nic]->domain_attr->name)
/* Name of the NIC at index 0. Error reporting currently always uses this one;
 * per-device reporting is a TODO (would take the device number as input). */
#define MPIDI_OFI_DEFAULT_NIC_NAME (MPIDI_OFI_NIC_NAME(0))

int MPIDI_OFI_progress_uninlined(int vci);
int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);

Expand All @@ -55,15 +58,16 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
/* Invoke MPIDI_OFI_PROGRESS(vci) repeatedly for as long as `cond` evaluates
 * true; `cond` is re-evaluated before every iteration. Expands to a bare
 * `while` statement (no do/while(0) wrapper). */
#define MPIDI_OFI_PROGRESS_WHILE(cond, vci) \
while (cond) MPIDI_OFI_PROGRESS(vci)

#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP1
#define MPIDI_OFI_CALL(FUNC,STR) \
#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP2
#define MPIDI_OFI_CALL(FUNC,STR) \
do { \
ssize_t _ret = FUNC; \
MPIDI_OFI_ERR(_ret<0, \
mpi_errno, \
MPI_ERR_OTHER, \
"**ofid_"#STR, \
"**ofid_"#STR" %s", \
"**ofid_"#STR" %s %s", \
MPIDI_OFI_DEFAULT_NIC_NAME, \
fi_strerror(-_ret)); \
} while (0)

Expand All @@ -78,7 +82,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
mpi_errno, \
MPI_ERR_OTHER, \
"**ofid_"#STR, \
"**ofid_"#STR" %s", \
"**ofid_"#STR" %s %s", \
MPIDI_OFI_DEFAULT_NIC_NAME, \
fi_strerror(-_ret)); \
if (_retry > 0) { \
_retry--; \
Expand Down Expand Up @@ -123,7 +128,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
mpi_errno, \
MPI_ERR_OTHER, \
"**ofid_"#STR, \
"**ofid_"#STR" %s", \
"**ofid_"#STR" %s %s", \
MPIDI_OFI_DEFAULT_NIC_NAME, \
fi_strerror(-_ret)); \
mpi_errno = MPIDI_OFI_progress_do_queue(vci_); \
if (mpi_errno != MPI_SUCCESS) \
Expand Down Expand Up @@ -167,7 +173,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
mpi_errno, \
MPI_ERR_OTHER, \
"**ofid_"#STR, \
"**ofid_"#STR" %s", \
"**ofid_"#STR" %s %s", \
MPIDI_OFI_DEFAULT_NIC_NAME, \
fi_strerror(-_ret)); \
} while (0)

Expand Down
10 changes: 6 additions & 4 deletions src/mpid/ch4/netmod/ofi/ofi_spawn.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s
int rc;
rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) &req.context);
if (rc && rc != -FI_ENOENT) {
MPIR_ERR_CHKANDJUMP1(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
"**ofid_cancel %s", fi_strerror(-rc));
MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
"**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME,
fi_strerror(-rc));

}
while (!req.done) {
Expand Down Expand Up @@ -111,8 +112,9 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout)
int rc;
rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].rx, (void *) &req.context);
if (rc && rc != -FI_ENOENT) {
MPIR_ERR_CHKANDJUMP1(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
"**ofid_cancel %s", fi_strerror(-rc));
MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
"**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME,
fi_strerror(-rc));

}
while (!req.done) {
Expand Down
5 changes: 3 additions & 2 deletions src/mpid/ch4/netmod/ofi/ofi_win.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_win_do_progress(MPIR_Win * win, int vci)

if (itercount == 1000 && MPIDI_OFI_COUNTER_WAIT_OBJECTS) {
ret = fi_cntr_wait(MPIDI_OFI_WIN(win).cmpl_cntr, tcount, 0);
MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT,
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
mpi_errno, MPI_ERR_RMA_RANGE,
"**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret));
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
itercount = 0;
DEBUG_PROGRESS_CHECK;
}
Expand Down

0 comments on commit e71f330

Please sign in to comment.