diff --git a/nodescraper/plugins/inband/fabrics/fabrics_collector.py b/nodescraper/plugins/inband/fabrics/fabrics_collector.py index 48eef06..59c884d 100644 --- a/nodescraper/plugins/inband/fabrics/fabrics_collector.py +++ b/nodescraper/plugins/inband/fabrics/fabrics_collector.py @@ -38,6 +38,7 @@ MstDevice, MstStatus, OfedInfo, + SlingshotData, ) @@ -51,6 +52,11 @@ class FabricsCollector(InBandDataCollector[FabricsDataModel, None]): CMD_OFED_INFO = "ofed_info -s" CMD_MST_START = "mst start" CMD_MST_STATUS = "mst status -v" + CMD_CASSINI_PCI = "lspci | grep -i cassini" + CMD_NET_LINK = "ip link show" + CMD_LIBFABRIC_INFO = "fi_info -p cxi" + CMD_CXI_STAT = "cxi_stat" + CMD_CXI_MODULES = "lsmod | grep cxi" def _parse_ibstat(self, output: str) -> List[IbstatDevice]: """Parse 'ibstat' output into IbstatDevice objects. @@ -406,6 +412,7 @@ def collect_data( ibdev_netdev_mappings = [] ofed_info = None mst_status = None + slingshot_data = None # Collect ibstat information res_ibstat = self._run_sut_cmd(self.CMD_IBSTAT) @@ -522,24 +529,90 @@ def collect_data( priority=EventPriority.INFO, ) + # Slingshot fallback path: + # if no InfiniBand data was collected, probe for Cassini/CXI fabric. + ib_data_collected = bool(ibstat_devices or ibv_devices or ibdev_netdev_mappings) + if not ib_data_collected: + res_cassini = self._run_sut_cmd(self.CMD_CASSINI_PCI) + cassini_detected = res_cassini.exit_code == 0 and bool(res_cassini.stdout.strip()) + + if cassini_detected: + self._log_event( + category=EventCategory.NETWORK, + description="Detected Slingshot/Cassini fabrics hardware", + priority=EventPriority.INFO, + ) + + res_net_link = self._run_sut_cmd(self.CMD_NET_LINK) + res_libfabric = self._run_sut_cmd(self.CMD_LIBFABRIC_INFO) + res_cxi_stat = self._run_sut_cmd(self.CMD_CXI_STAT) + res_cxi_modules = self._run_sut_cmd(self.CMD_CXI_MODULES) + + slingshot_data = SlingshotData( + cassini_pci=res_cassini.stdout, + net_link=res_net_link.stdout if res_net_link.exit_code == 0 else None, + libfabric_info=res_libfabric.stdout if res_libfabric.exit_code == 0 else None, + cxi_stat=res_cxi_stat.stdout if res_cxi_stat.exit_code == 0 else None, + cxi_modules=res_cxi_modules.stdout if res_cxi_modules.exit_code == 0 else None, + ) + + failed_cmds = [] + for cmd_name, cmd_res in ( + ("ip link show", res_net_link), + ("fi_info -p cxi", res_libfabric), + ("cxi_stat", res_cxi_stat), + ("lsmod | grep cxi", res_cxi_modules), + ): + if cmd_res.exit_code != 0: + failed_cmds.append(cmd_name) + + if failed_cmds: + self._log_event( + category=EventCategory.NETWORK, + description="Some Slingshot commands failed", + data={"failed_commands": failed_cmds}, + priority=EventPriority.WARNING, + ) + else: + self._log_event( + category=EventCategory.NETWORK, + description="No Slingshot/Cassini hardware detected on this system", + data={ + "command": res_cassini.command, + "exit_code": res_cassini.exit_code, + }, + priority=EventPriority.INFO, + ) + # Build the data model only if we collected any data - if ibstat_devices or ibv_devices or ibdev_netdev_mappings or ofed_info or mst_status: + if ( + ibstat_devices + or ibv_devices + or ibdev_netdev_mappings + or ofed_info + or mst_status + or slingshot_data + ): fabrics_data = FabricsDataModel( ibstat_devices=ibstat_devices, ibv_devices=ibv_devices, ibdev_netdev_mappings=ibdev_netdev_mappings, ofed_info=ofed_info, mst_status=mst_status, + slingshot_data=slingshot_data, ) self.result.message = ( f"Collected fabrics data: {len(ibstat_devices)} ibstat devices, " f"{len(ibv_devices)} ibv devices, {len(ibdev_netdev_mappings)} mappings, " f"OFED: {ofed_info.version if ofed_info else 'N/A'}, " - f"MST devices: {len(mst_status.devices) if mst_status else 0}" + f"MST devices: {len(mst_status.devices) if mst_status else 0}, " + f"Slingshot: {'detected' if slingshot_data else 'not detected'}" ) self.result.status = ExecutionStatus.OK return self.result, fabrics_data else: - self.result.message = "No InfiniBand/RDMA fabrics hardware detected on this system" + self.result.message = ( + "No InfiniBand/RDMA or Slingshot fabrics hardware detected on this system" + ) self.result.status = ExecutionStatus.NOT_RAN return self.result, None diff --git a/nodescraper/plugins/inband/fabrics/fabricsdata.py b/nodescraper/plugins/inband/fabrics/fabricsdata.py index 6f53798..c74b0dd 100644 --- a/nodescraper/plugins/inband/fabrics/fabricsdata.py +++ b/nodescraper/plugins/inband/fabrics/fabricsdata.py @@ -96,6 +96,16 @@ class MstStatus(BaseModel): raw_output: str = "" # Raw command output +class SlingshotData(BaseModel): + """Slingshot/Cassini fabrics command outputs""" + + cassini_pci: str = "" # Output of lspci Cassini probe + net_link: Optional[str] = None # Output of ip link show + libfabric_info: Optional[str] = None # Output of fi_info -p cxi + cxi_stat: Optional[str] = None # Output of cxi_stat + cxi_modules: Optional[str] = None # Output of lsmod | grep cxi + + class FabricsDataModel(DataModel): """Complete InfiniBand/RDMA fabrics configuration data""" @@ -106,3 +116,4 @@ class FabricsDataModel(DataModel): ) # ibdev2netdev output ofed_info: Optional[OfedInfo] = None # OFED version info mst_status: Optional[MstStatus] = None # MST status + slingshot_data: Optional[SlingshotData] = None # Slingshot/Cassini command outputs diff --git a/test/unit/plugin/test_fabrics_collector.py b/test/unit/plugin/test_fabrics_collector.py index a24f73b..add8bb5 100644 --- a/test/unit/plugin/test_fabrics_collector.py +++ b/test/unit/plugin/test_fabrics_collector.py @@ -24,8 +24,11 @@ # ############################################################################### +from unittest.mock import MagicMock + import pytest +from nodescraper.enums.executionstatus import ExecutionStatus from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.plugins.inband.fabrics.fabrics_collector import FabricsCollector from nodescraper.plugins.inband.fabrics.fabricsdata import ( @@ -312,3 +315,54 @@ def test_fabrics_data_model_empty(collector): assert len(data.ibdev_netdev_mappings) == 0 assert data.ofed_info is None assert data.mst_status is None + + +def test_collect_data_detects_slingshot_when_no_ib(collector): + """When IB is absent but Cassini is present, collect Slingshot command outputs.""" + + def run_sut_cmd_side_effect(cmd, *args, **kwargs): + responses = { + "ibstat": MagicMock(exit_code=1, stdout="", command=cmd), + "ibv_devinfo": MagicMock(exit_code=1, stdout="", command=cmd), + "ls -l /sys/class/infiniband/*/device/net": MagicMock( + exit_code=1, stdout="", command=cmd + ), + "ofed_info -s": MagicMock(exit_code=1, stdout="", command=cmd), + "mst start": MagicMock(exit_code=1, stdout="", command=cmd), + "mst status -v": MagicMock(exit_code=1, stdout="", command=cmd), + "lspci | grep -i cassini": MagicMock( + exit_code=0, + stdout="03:00.0 Processing accelerators: Vendor Cassini", + command=cmd, + ), + "ip link show": MagicMock(exit_code=0, stdout="1: lo: ", command=cmd), + "fi_info -p cxi": MagicMock(exit_code=0, stdout="provider: cxi", command=cmd), + "cxi_stat": MagicMock(exit_code=0, stdout="cxi stats output", command=cmd), + "lsmod | grep cxi": MagicMock(exit_code=0, stdout="cxi_core 123 0", command=cmd), + } + return responses[cmd] + + collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.slingshot_data is not None + assert "Cassini" in data.slingshot_data.cassini_pci + assert data.slingshot_data.libfabric_info == "provider: cxi" + assert data.slingshot_data.cxi_stat == "cxi stats output" + + +def test_collect_data_not_ran_when_no_ib_and_no_slingshot(collector): + """Return NOT_RAN when neither IB nor Slingshot hardware is present.""" + + def run_sut_cmd_side_effect(cmd, *args, **kwargs): + return MagicMock(exit_code=1, stdout="", command=cmd) + + collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.NOT_RAN + assert data is None