Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 76 additions & 3 deletions nodescraper/plugins/inband/fabrics/fabrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
MstDevice,
MstStatus,
OfedInfo,
SlingshotData,
)


Expand All @@ -51,6 +52,11 @@ class FabricsCollector(InBandDataCollector[FabricsDataModel, None]):
CMD_OFED_INFO = "ofed_info -s"
CMD_MST_START = "mst start"
CMD_MST_STATUS = "mst status -v"
CMD_CASSINI_PCI = "lspci | grep -i cassini"
CMD_NET_LINK = "ip link show"
CMD_LIBFABRIC_INFO = "fi_info -p cxi"
CMD_CXI_STAT = "cxi_stat"
CMD_CXI_MODULES = "lsmod | grep cxi"

def _parse_ibstat(self, output: str) -> List[IbstatDevice]:
"""Parse 'ibstat' output into IbstatDevice objects.
Expand Down Expand Up @@ -406,6 +412,7 @@ def collect_data(
ibdev_netdev_mappings = []
ofed_info = None
mst_status = None
slingshot_data = None

# Collect ibstat information
res_ibstat = self._run_sut_cmd(self.CMD_IBSTAT)
Expand Down Expand Up @@ -522,24 +529,90 @@ def collect_data(
priority=EventPriority.INFO,
)

# Slingshot fallback path:
# if no InfiniBand data was collected, probe for Cassini/CXI fabric.
ib_data_collected = bool(ibstat_devices or ibv_devices or ibdev_netdev_mappings)
if not ib_data_collected:
res_cassini = self._run_sut_cmd(self.CMD_CASSINI_PCI)
cassini_detected = res_cassini.exit_code == 0 and bool(res_cassini.stdout.strip())

if cassini_detected:
self._log_event(
category=EventCategory.NETWORK,
description="Detected Slingshot/Cassini fabrics hardware",
priority=EventPriority.INFO,
)

res_net_link = self._run_sut_cmd(self.CMD_NET_LINK)
res_libfabric = self._run_sut_cmd(self.CMD_LIBFABRIC_INFO)
res_cxi_stat = self._run_sut_cmd(self.CMD_CXI_STAT)
res_cxi_modules = self._run_sut_cmd(self.CMD_CXI_MODULES)

slingshot_data = SlingshotData(
cassini_pci=res_cassini.stdout,
net_link=res_net_link.stdout if res_net_link.exit_code == 0 else None,
libfabric_info=res_libfabric.stdout if res_libfabric.exit_code == 0 else None,
cxi_stat=res_cxi_stat.stdout if res_cxi_stat.exit_code == 0 else None,
cxi_modules=res_cxi_modules.stdout if res_cxi_modules.exit_code == 0 else None,
)

failed_cmds = []
for cmd_name, cmd_res in (
("ip link show", res_net_link),
("fi_info -p cxi", res_libfabric),
("cxi_stat", res_cxi_stat),
("lsmod | grep cxi", res_cxi_modules),
):
if cmd_res.exit_code != 0:
failed_cmds.append(cmd_name)

if failed_cmds:
self._log_event(
category=EventCategory.NETWORK,
description="Some Slingshot commands failed",
data={"failed_commands": failed_cmds},
priority=EventPriority.WARNING,
)
else:
self._log_event(
category=EventCategory.NETWORK,
description="No Slingshot/Cassini hardware detected on this system",
data={
"command": res_cassini.command,
"exit_code": res_cassini.exit_code,
},
priority=EventPriority.INFO,
)

# Build the data model only if we collected any data
if ibstat_devices or ibv_devices or ibdev_netdev_mappings or ofed_info or mst_status:
if (
ibstat_devices
or ibv_devices
or ibdev_netdev_mappings
or ofed_info
or mst_status
or slingshot_data
):
fabrics_data = FabricsDataModel(
ibstat_devices=ibstat_devices,
ibv_devices=ibv_devices,
ibdev_netdev_mappings=ibdev_netdev_mappings,
ofed_info=ofed_info,
mst_status=mst_status,
slingshot_data=slingshot_data,
)
self.result.message = (
f"Collected fabrics data: {len(ibstat_devices)} ibstat devices, "
f"{len(ibv_devices)} ibv devices, {len(ibdev_netdev_mappings)} mappings, "
f"OFED: {ofed_info.version if ofed_info else 'N/A'}, "
f"MST devices: {len(mst_status.devices) if mst_status else 0}"
f"MST devices: {len(mst_status.devices) if mst_status else 0}, "
f"Slingshot: {'detected' if slingshot_data else 'not detected'}"
)
self.result.status = ExecutionStatus.OK
return self.result, fabrics_data
else:
self.result.message = "No InfiniBand/RDMA fabrics hardware detected on this system"
self.result.message = (
"No InfiniBand/RDMA or Slingshot fabrics hardware detected on this system"
)
self.result.status = ExecutionStatus.NOT_RAN
return self.result, None
11 changes: 11 additions & 0 deletions nodescraper/plugins/inband/fabrics/fabricsdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,16 @@ class MstStatus(BaseModel):
raw_output: str = "" # Raw command output


class SlingshotData(BaseModel):
    """Slingshot/Cassini fabrics command outputs"""

    # Each field holds the raw text of one probe command. Only the Cassini
    # PCI probe defaults to an empty string; the rest default to None.

    # Output of lspci Cassini probe
    cassini_pci: str = ""

    # Output of ip link show
    net_link: Optional[str] = None

    # Output of fi_info -p cxi
    libfabric_info: Optional[str] = None

    # Output of cxi_stat
    cxi_stat: Optional[str] = None

    # Output of lsmod | grep cxi
    cxi_modules: Optional[str] = None


class FabricsDataModel(DataModel):
"""Complete InfiniBand/RDMA fabrics configuration data"""

Expand All @@ -106,3 +116,4 @@ class FabricsDataModel(DataModel):
) # ibdev2netdev output
ofed_info: Optional[OfedInfo] = None # OFED version info
mst_status: Optional[MstStatus] = None # MST status
slingshot_data: Optional[SlingshotData] = None # Slingshot/Cassini command outputs
54 changes: 54 additions & 0 deletions test/unit/plugin/test_fabrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@
#
###############################################################################

from unittest.mock import MagicMock

import pytest

from nodescraper.enums.executionstatus import ExecutionStatus
from nodescraper.enums.systeminteraction import SystemInteractionLevel
from nodescraper.plugins.inband.fabrics.fabrics_collector import FabricsCollector
from nodescraper.plugins.inband.fabrics.fabricsdata import (
Expand Down Expand Up @@ -312,3 +315,54 @@ def test_fabrics_data_model_empty(collector):
assert len(data.ibdev_netdev_mappings) == 0
assert data.ofed_info is None
assert data.mst_status is None


def test_collect_data_detects_slingshot_when_no_ib(collector):
    """When IB is absent but Cassini is present, collect Slingshot command outputs.

    Every InfiniBand/OFED/MST command is stubbed to exit non-zero with empty
    stdout, while the Cassini PCI probe and all four Slingshot follow-up
    commands succeed. The collector is then expected to return OK with every
    SlingshotData field populated from the stubbed stdout.
    """
    failed = (1, "")

    # Command string -> (exit_code, stdout) returned by the stubbed runner.
    canned_results = {
        "ibstat": failed,
        "ibv_devinfo": failed,
        "ls -l /sys/class/infiniband/*/device/net": failed,
        "ofed_info -s": failed,
        "mst start": failed,
        "mst status -v": failed,
        "lspci | grep -i cassini": (0, "03:00.0 Processing accelerators: Vendor Cassini"),
        "ip link show": (0, "1: lo: <LOOPBACK>"),
        "fi_info -p cxi": (0, "provider: cxi"),
        "cxi_stat": (0, "cxi stats output"),
        "lsmod | grep cxi": (0, "cxi_core 123 0"),
    }

    def run_sut_cmd_side_effect(cmd, *args, **kwargs):
        # An unexpected command raises KeyError on purpose, so any command the
        # test did not anticipate fails loudly instead of being silently stubbed.
        exit_code, stdout = canned_results[cmd]
        return MagicMock(exit_code=exit_code, stdout=stdout, command=cmd)

    collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect)

    result, data = collector.collect_data()

    assert result.status == ExecutionStatus.OK
    assert data is not None
    assert data.slingshot_data is not None

    slingshot = data.slingshot_data
    assert "Cassini" in slingshot.cassini_pci
    # All four follow-up commands succeeded, so none of the fields may be None.
    assert slingshot.net_link == "1: lo: <LOOPBACK>"
    assert slingshot.libfabric_info == "provider: cxi"
    assert slingshot.cxi_stat == "cxi stats output"
    assert slingshot.cxi_modules == "cxi_core 123 0"


def test_collect_data_not_ran_when_no_ib_and_no_slingshot(collector):
    """Return NOT_RAN when neither IB nor Slingshot hardware is present."""

    # Whatever command is requested, the stub reports a non-zero exit code and
    # empty stdout, so neither the InfiniBand nor the Cassini probe finds anything.
    collector._run_sut_cmd = MagicMock(
        side_effect=lambda cmd, *args, **kwargs: MagicMock(
            exit_code=1, stdout="", command=cmd
        )
    )

    outcome = collector.collect_data()
    result, data = outcome

    assert result.status == ExecutionStatus.NOT_RAN
    assert data is None
Loading