Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,8 +661,9 @@ def check_expected_xgmi_link_speed(
if expected_xgmi_speed is None or len(expected_xgmi_speed) == 0:
self._log_event(
category=EventCategory.IO,
description="Expected XGMI speed not configured, skipping XGMI link speed check",
priority=EventPriority.WARNING,
description=("Expected XGMI link speed not set; skipping XGMI link speed analysis"),
priority=EventPriority.INFO,
console_log=True,
)
return

Expand Down
5 changes: 5 additions & 0 deletions nodescraper/plugins/inband/amdsmi/amdsmi_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
ValueUnit,
XgmiLinks,
XgmiMetrics,
build_amd_smi_analysis_ref,
)
from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs
from nodescraper.utils import get_exception_traceback
Expand Down Expand Up @@ -475,6 +476,9 @@ def _get_amdsmi_data(
return None

try:
analysis_ref = build_amd_smi_analysis_ref(
statics, processes, partition, firmware, xgmi_metric
)
return AmdSmiDataModel(
version=version,
gpu_list=gpu_list,
Expand All @@ -489,6 +493,7 @@ def _get_amdsmi_data(
xgmi_link=xgmi_link or [],
cper_data=cper_data,
cper_afids=cper_afids,
analysis_ref=analysis_ref,
)
except ValidationError as err:
self.logger.warning("Validation err: %s", err)
Expand Down
125 changes: 125 additions & 0 deletions nodescraper/plugins/inband/amdsmi/amdsmidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,25 @@ class Topo(BaseModel):
links: list[TopoLink]


class AmdSmiAnalysisRef(BaseModel):
    """Reference summary of one amd-smi collection.

    Instances are produced by ``build_amd_smi_analysis_ref`` and later read by
    ``AmdSmiAnalyzerArgs.build_from_model`` to pre-populate analyzer arguments.
    Every field defaults to ``None``, meaning the value could not be derived
    from the collected data.
    """

    # Reject unknown fields instead of silently ignoring them.
    # NOTE(review): a collaborator commented on this line that they don't
    # think it is needed.
    model_config = ConfigDict(extra="forbid")

    # Largest per-GPU process-list length seen in the collection.
    gpu_processes_max: Optional[int] = None
    # First parsable max power limit, scanning GPUs in ascending gpu order
    # (unit presumably watts, per the name -- confirm against amd-smi output).
    max_power_w: Optional[int] = None
    # First non-empty driver version, scanning GPUs in ascending gpu order.
    amdgpu_drv_version: Optional[str] = None
    # Partition type of the lowest gpu_id memory / compute partition entry.
    mem_part_mode: Optional[str] = None
    compute_part_mode: Optional[str] = None
    # fw_version of the firmware entry whose fw_id equals _PLDM_FW_ID.
    pldm_version: Optional[str] = None
    # ASIC identity copied from the lowest-numbered GPU's static info.
    ep_vendor_id: Optional[str] = None
    ep_subvendor_id: Optional[str] = None
    ep_device_id: Optional[str] = None
    ep_subsystem_id: Optional[str] = None
    ep_market_name: Optional[str] = None
    # Distinct XGMI link bit rates, sorted ascending.
    xgmi_rates: Optional[list[float]] = None


class AmdSmiDataModel(DataModel):
"""Data model for amd-smi data.

Expand Down Expand Up @@ -957,6 +976,8 @@ class AmdSmiDataModel(DataModel):
cper_data: Optional[list[FileModel]] = Field(default_factory=list)
cper_afids: dict[str, int] = Field(default_factory=dict)

analysis_ref: Optional[AmdSmiAnalysisRef] = None

def get_list(self, gpu: int) -> Optional[AmdSmiListItem]:
"""Get the gpu list item for the given gpu id."""
if self.gpu_list is None:
Expand Down Expand Up @@ -1001,3 +1022,107 @@ def get_bad_pages(self, gpu: int) -> Optional[BadPages]:
if item.gpu == gpu:
return item
return None


# Firmware id whose fw_version build_amd_smi_analysis_ref reports as the PLDM version.
_PLDM_FW_ID = "PLDM_BUNDLE"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@graepaul, is this something we can move into an analyzer_arg? Will this value ever change?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would use a {fw_id: fw_version} map as an analyzer arg to give users more flexibility.

Users would then provide a dict whose keys are fw_ids and whose values are the corresponding fw_versions.



def build_amd_smi_analysis_ref(
    static: Optional[list[AmdSmiStatic]],
    process: Optional[list[Processes]],
    partition: Optional[Partition],
    firmware: Optional[list[Fw]],
    xgmi_metric: Optional[list[XgmiMetrics]],
) -> AmdSmiAnalysisRef:
    """Build analysis summary from collected structures (called by AmdSmiCollector).

    Each summary value is derived independently and is left as ``None`` when
    its input is missing or empty, or when no usable value is found.

    Args:
        static: per-GPU static info; source of max power, driver version and
            ASIC identity.
        process: per-GPU process lists; source of the max process count.
        partition: memory/compute partition info; source of the partition modes.
        firmware: per-GPU firmware lists; source of the PLDM version.
        xgmi_metric: per-GPU XGMI metrics; source of the link bit rates.

    Returns:
        AmdSmiAnalysisRef: summary of the collected data.
    """
    # NOTE(review): collaborators suggested this would make more sense as a
    # method (or as independent properties) on AmdSmiDataModel. It is kept as
    # a module-level function so the existing collector call site keeps working.

    # Sort once; every "first GPU" lookup below walks GPUs in ascending order.
    static_by_gpu = sorted(static or [], key=lambda s: s.gpu)

    # Largest process count over GPUs. A GPU is skipped when its list is empty
    # or when its first entry carries a plain string instead of process info.
    counts = [
        len(proc.process_list)
        for proc in process or []
        if proc.process_list and not isinstance(proc.process_list[0].process_info, str)
    ]
    gpu_processes_max: Optional[int] = max(counts, default=None)

    # First max-power limit that parses as a number.
    max_power_w: Optional[int] = None
    for gpu in static_by_gpu:
        lim = gpu.limit
        if lim is None or lim.max_power is None or lim.max_power.value is None:
            continue
        try:
            max_power_w = int(float(lim.max_power.value))
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int() of an infinite float; treat like any other
            # unparsable value and try the next GPU.
            continue
        break

    # First non-empty driver version.
    amdgpu_drv_version: Optional[str] = next(
        (gpu.driver.version for gpu in static_by_gpu if gpu.driver and gpu.driver.version),
        None,
    )

    # Partition modes come from the entry with the lowest gpu_id.
    mem_part_mode: Optional[str] = None
    compute_part_mode: Optional[str] = None
    if partition:
        if partition.memory_partition:
            mem_part_mode = min(
                partition.memory_partition, key=lambda p: p.gpu_id
            ).partition_type
        if partition.compute_partition:
            compute_part_mode = min(
                partition.compute_partition, key=lambda p: p.gpu_id
            ).partition_type

    # PLDM version: first matching firmware entry per GPU; move on to the next
    # GPU when that entry has no version.
    pldm_version: Optional[str] = None
    for fw in sorted(firmware or [], key=lambda f: f.gpu):
        if isinstance(fw.fw_list, str):
            # fw_list is a plain string here, not a list of entries; skip it.
            continue
        for item in fw.fw_list:
            if item.fw_id == _PLDM_FW_ID:
                pldm_version = item.fw_version
                break
        if pldm_version is not None:
            break

    # ASIC identity of the lowest-numbered GPU.
    ep_vendor_id = ep_subvendor_id = ep_device_id = ep_subsystem_id = ep_market_name = None
    if static_by_gpu:
        asic = static_by_gpu[0].asic
        ep_vendor_id = asic.vendor_id
        ep_subvendor_id = asic.subvendor_id
        ep_device_id = asic.device_id
        ep_subsystem_id = asic.subsystem_id
        ep_market_name = asic.market_name

    # Distinct XGMI bit rates, ascending; unparsable values are ignored.
    rates: set[float] = set()
    for xm in xgmi_metric or []:
        br = xm.link_metrics.bit_rate
        if br is None or br.value is None:
            continue
        try:
            rates.add(float(br.value))
        except (TypeError, ValueError, OverflowError):
            continue
    xgmi_rates: Optional[list[float]] = sorted(rates) if rates else None

    return AmdSmiAnalysisRef(
        gpu_processes_max=gpu_processes_max,
        max_power_w=max_power_w,
        amdgpu_drv_version=amdgpu_drv_version,
        mem_part_mode=mem_part_mode,
        compute_part_mode=compute_part_mode,
        pldm_version=pldm_version,
        ep_vendor_id=ep_vendor_id,
        ep_subvendor_id=ep_subvendor_id,
        ep_device_id=ep_device_id,
        ep_subsystem_id=ep_subsystem_id,
        ep_market_name=ep_market_name,
        xgmi_rates=xgmi_rates,
    )
29 changes: 29 additions & 0 deletions nodescraper/plugins/inband/amdsmi/analyzer_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from pydantic import Field

from nodescraper.models import AnalyzerArgs
from nodescraper.plugins.inband.amdsmi.amdsmidata import AmdSmiDataModel


class AmdSmiAnalyzerArgs(AnalyzerArgs):
Expand Down Expand Up @@ -80,3 +81,31 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs):
analysis_range_end: Optional[datetime] = Field(
default=None, description="End of time range for time-windowed analysis."
)

@classmethod
def build_from_model(cls, datamodel: AmdSmiDataModel) -> "AmdSmiAnalyzerArgs":
    """Create analyzer args from the reference snapshot stored on the data model.

    Args:
        datamodel (AmdSmiDataModel): data model for plugin

    Returns:
        AmdSmiAnalyzerArgs: args populated from ``datamodel.analysis_ref``, or
        a default-constructed instance when no snapshot is present
    """
    snapshot = datamodel.analysis_ref
    if snapshot is None:
        # Nothing was recorded by the collector; fall back to defaults.
        return cls()

    # Analyzer argument name -> value taken from the snapshot.
    populated = {
        "expected_gpu_processes": snapshot.gpu_processes_max,
        "expected_max_power": snapshot.max_power_w,
        "expected_driver_version": snapshot.amdgpu_drv_version,
        "expected_memory_partition_mode": snapshot.mem_part_mode,
        "expected_compute_partition_mode": snapshot.compute_part_mode,
        "expected_pldm_version": snapshot.pldm_version,
        "vendorid_ep": snapshot.ep_vendor_id,
        "vendorid_ep_vf": snapshot.ep_subvendor_id,
        "devid_ep": snapshot.ep_device_id,
        "devid_ep_vf": snapshot.ep_subsystem_id,
        "sku_name": snapshot.ep_market_name,
        "expected_xgmi_speed": snapshot.xgmi_rates,
    }
    return cls(**populated)
Loading