diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6cbd74c2..fe9fc1c5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @landrews-amd @alexandraBara +* @landrews-amd @alexandraBara @jaspals3123 diff --git a/.github/workflows/code_quality_checks.yml b/.github/workflows/code_quality_checks.yml index 27211437..e95e8769 100644 --- a/.github/workflows/code_quality_checks.yml +++ b/.github/workflows/code_quality_checks.yml @@ -4,20 +4,21 @@ name: Code Quality Check permissions: contents: read -on: [pull_request] +on: + - pull_request + - workflow_dispatch jobs: pre-commit: runs-on: [ self-hosted ] - container: python:3.10 + container: python:3.9 steps: - uses: actions/checkout@v3 - - name: setup environment - run: | - ./dev-setup.sh - - name: run pre-commit hooks + - name: setup environment and run pre-commit hooks + shell: bash run: | + source ./dev-setup.sh pre-commit run --all-files --show-diff-on-failure --color=always - name: Print message on failure if: failure() diff --git a/.github/workflows/functional-test.yml b/.github/workflows/functional-test.yml new file mode 100644 index 00000000..8fd1fcf4 --- /dev/null +++ b/.github/workflows/functional-test.yml @@ -0,0 +1,30 @@ +name: Python Functional Tests + +on: + workflow_dispatch: + pull_request: + push: + branches: [ "main" ] + +permissions: + contents: read + +jobs: + run_tests: + runs-on: [ self-hosted ] + container: python:3.9 + + steps: + - uses: actions/checkout@v3 + + - name: Install xmllint + run: | + apt-get update + apt-get install -y libxml2-utils bc + + - name: Install package and run functional tests + id: run_functional_tests + shell: bash + run: | + source ./dev-setup.sh + pytest test/functional -s --disable-warnings -v diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 9396c24a..7a4b17c7 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -12,24 +12,21 @@ permissions: jobs: run_tests: runs-on: [ 
self-hosted ] - container: python:3.10 + container: python:3.9 steps: - uses: actions/checkout@v3 - - name: Install package - run: | - ./dev-setup.sh - - name: Install xmllint run: | apt-get update apt-get install -y libxml2-utils bc - - - name: Run unit tests with coverage + - name: Install package and run unit tests with coverage id: extract_coverage + shell: bash run: | + source ./dev-setup.sh pytest test/unit -s --cov=nodescraper --cov-report=xml --cov-report=term --cov-fail-under=70 --maxfail=1 --disable-warnings -v - name: Print coverage diff --git a/.github/workflows/update-plugin-docs.yml b/.github/workflows/update-plugin-docs.yml new file mode 100644 index 00000000..2778efb1 --- /dev/null +++ b/.github/workflows/update-plugin-docs.yml @@ -0,0 +1,62 @@ +# Workflow to run plugin documentation generation then create a PR with the updated changes + +name: Plugin Documentation Generator + +permissions: + contents: write + pull-requests: write + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * *' + +jobs: + generate_docs: + runs-on: [ self-hosted ] + # To disable this workflow, set DISABLE_AUTO_DOCS to 'true' in repository variables + if: vars.DISABLE_AUTO_DOCS != 'true' + env: + HOME: /tmp/github-actions-home + + steps: + - name: Setup HOME directory + run: | + mkdir -p /tmp/github-actions-home + export HOME=/tmp/github-actions-home + + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install dependencies + run: ./dev-setup.sh + + - name: Run plugin documentation generator + run: | + source venv/bin/activate + python docs/generate_plugin_doc_bundle.py \ + --package nodescraper.plugins.inband \ + --output docs/PLUGIN_DOC.md + + - name: Format documentation with pre-commit + run: | + source venv/bin/activate + pre-commit run --files docs/PLUGIN_DOC.md || true + + - name: Create Pull Request + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ secrets.GITHUB_TOKEN }} + 
commit-message: "docs: Update plugin documentation [automated]" committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" branch: automated-plugin-docs-update delete-branch: true title: "docs: Update plugin documentation [automated]" body: | Automated plugin documentation update generated by workflow. This PR was automatically created by the Plugin Documentation Generator workflow. labels: documentation,automated diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f95d955e..de74bbae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,8 +52,41 @@ When creating a PR, use the following process. > By creating a PR, you agree to allow your contribution to be licensed under the > terms of the LICENSE.txt file. -### Documentation +### Pre-commit hooks -Submit Node Scraper documentation changes to our -[documentation](https://github.com/amd/node-scraper/blob/development/README.md). You must update -documentation related to any new feature or API contribution. +This repository uses [pre-commit](https://pre-commit.com/) to automatically format code. When you commit changes to plugin files, the hooks will: + +1. Run code formatters (ruff, black) +2. Run type checking (mypy) + +#### Setup + +Install pre-commit hooks after cloning the repository: + +```bash +# Activate your virtual environment +source venv/bin/activate + +# Install pre-commit hooks +pre-commit install +``` + +#### Usage + +The hooks run automatically when you commit. + +```bash +# First commit attempt - hooks run and may modify files +git commit -m "Add new plugin feature" + +# If hooks modified files, stage them and commit again +git add . 
+git commit -m "Add new plugin feature" +``` + +You can also run hooks manually: + +```bash +# Run all hooks on all files +pre-commit run --all-files +``` diff --git a/README.md b/README.md index f0b15270..c020fdd1 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,27 @@ a python virtual environment and also configures the pre-commit hooks for the pr source dev-setup.sh ``` +Alternatively, follow these manual steps: + +### 1. Virtual Environment (Optional) +```sh +python3 -m venv venv +source venv/bin/activate +``` +On Debian/Ubuntu, you may need: `sudo apt install python3-venv` + +### 2. Install from Source (Required) +```sh +python3 -m pip install --editable .[dev] --upgrade +``` +This installs Node Scraper in editable mode with development dependencies. To verify: `node-scraper --help` + +### 3. Git Hooks (Optional) +```sh +pre-commit install +``` +Sets up pre-commit hooks for code quality checks. On Debian/Ubuntu, you may need: `sudo apt install pre-commit` + ## CLI Usage The Node Scraper CLI can be used to run Node Scraper plugins on a target system. The following CLI options are available: diff --git a/dev-setup.sh b/dev-setup.sh index 7cafc606..71b78c3f 100755 --- a/dev-setup.sh +++ b/dev-setup.sh @@ -1,6 +1,7 @@ +#!/usr/bin/env bash + # Create venv if not already present if [ ! 
-d "venv" ]; then - python3 -m pip install venv python3 -m venv venv fi diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index af8e860c..ffe6a7b8 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -2,29 +2,71 @@ # Plugin Table -| Plugin | DataModel | Collector | Analyzer | AnalyzerArgs | Cmd(s) | +| Plugin | Collection | Analysis | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | -| nodescraper.plugins.inband.bios.bios_plugin.BiosPlugin | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | BiosAnalyzerArgs | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | -| nodescraper.plugins.inband.cmdline.cmdline_plugin.CmdlinePlugin | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | CmdlineAnalyzerArgs | cat /proc/cmdline | -| nodescraper.plugins.inband.dimm.dimm_plugin.DimmPlugin | [DimmDataModel](#DimmDataModel-Model) | [DimmCollector](#Collector-Class-DimmCollector) | - | - | sh -c 'dmidecode -t 17 | tr -s " " | grep -v "Volatile\|None\|Module" | grep Size' 2>/dev/null
wmic memorychip get Capacity | -| nodescraper.plugins.inband.dkms.dkms_plugin.DkmsPlugin | [DkmsDataModel](#DkmsDataModel-Model) | [DkmsCollector](#Collector-Class-DkmsCollector) | [DkmsAnalyzer](#Data-Analyzer-Class-DkmsAnalyzer) | DkmsAnalyzerArgs | dkms status
dkms --version | -| nodescraper.plugins.inband.dmesg.dmesg_plugin.DmesgPlugin | [DmesgData](#DmesgData-Model) | [DmesgCollector](#Collector-Class-DmesgCollector) | [DmesgAnalyzer](#Data-Analyzer-Class-DmesgAnalyzer) | - | dmesg --time-format iso -x
ls -1 /var/log/dmesg* 2>/dev/null | grep -E '^/var/log/dmesg(\.[0-9]+(\.gz)?)?$' || true | -| nodescraper.plugins.inband.journal.journal_plugin.JournalPlugin | [JournalData](#JournalData-Model) | [JournalCollector](#Collector-Class-JournalCollector) | - | - | journalctl --no-pager --system --output=short-iso | -| nodescraper.plugins.inband.kernel.kernel_plugin.KernelPlugin | [KernelDataModel](#KernelDataModel-Model) | [KernelCollector](#Collector-Class-KernelCollector) | [KernelAnalyzer](#Data-Analyzer-Class-KernelAnalyzer) | KernelAnalyzerArgs | sh -c 'uname -r'
wmic os get Version /Value | -| nodescraper.plugins.inband.kernel_module.kernel_module_plugin.KernelModulePlugin | [KernelModuleDataModel](#KernelModuleDataModel-Model) | [KernelModuleCollector](#Collector-Class-KernelModuleCollector) | [KernelModuleAnalyzer](#Data-Analyzer-Class-KernelModuleAnalyzer) | KernelModuleAnalyzerArgs | cat /proc/modules
wmic os get Version /Value | -| nodescraper.plugins.inband.memory.memory_plugin.MemoryPlugin | [MemoryDataModel](#MemoryDataModel-Model) | [MemoryCollector](#Collector-Class-MemoryCollector) | [MemoryAnalyzer](#Data-Analyzer-Class-MemoryAnalyzer) | - | free -b
wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value | -| nodescraper.plugins.inband.nvme.nvme_plugin.NvmePlugin | [NvmeDataModel](#NvmeDataModel-Model) | [NvmeCollector](#Collector-Class-NvmeCollector) | - | - | nvme smart-log {dev}
nvme error-log {dev} --log-entries=256
nvme id-ctrl {dev}
nvme id-ns {dev}{ns}
nvme fw-log {dev}
nvme self-test-log {dev}
nvme get-log {dev} --log-id=6 --log-len=512
nvme telemetry-log {dev} --output-file={dev}_{f_name} | -| nodescraper.plugins.inband.os.os_plugin.OsPlugin | [OsDataModel](#OsDataModel-Model) | [OsCollector](#Collector-Class-OsCollector) | [OsAnalyzer](#Data-Analyzer-Class-OsAnalyzer) | OsAnalyzerArgs | sh -c '( lsb_release -ds || (cat /etc/*release | grep PRETTY_NAME) || uname -om ) 2>/dev/null | head -n1'
cat /etc/*release | grep VERSION_ID
wmic os get Version /value
wmic os get Caption /Value | -| nodescraper.plugins.inband.package.package_plugin.PackagePlugin | [PackageDataModel](#PackageDataModel-Model) | [PackageCollector](#Collector-Class-PackageCollector) | [PackageAnalyzer](#Data-Analyzer-Class-PackageAnalyzer) | PackageAnalyzerArgs | dnf list --installed
dpkg-query -W
pacman -Q
cat /etc/*release
wmic product get name,version | -| nodescraper.plugins.inband.process.process_plugin.ProcessPlugin | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | ProcessAnalyzerArgs | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | -| nodescraper.plugins.inband.rocm.rocm_plugin.RocmPlugin | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | RocmAnalyzerArgs | /opt/rocm/.info/version-rocm
/opt/rocm/.info/version | -| nodescraper.plugins.inband.storage.storage_plugin.StoragePlugin | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | - | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | -| nodescraper.plugins.inband.sysctl.sysctl_plugin.SysctlPlugin | [SysctlDataModel](#SysctlDataModel-Model) | [SysctlCollector](#Collector-Class-SysctlCollector) | [SysctlAnalyzer](#Data-Analyzer-Class-SysctlAnalyzer) | SysctlAnalyzerArgs | sysctl -n | -| nodescraper.plugins.inband.syslog.syslog_plugin.SyslogPlugin | [SyslogData](#SyslogData-Model) | [SyslogCollector](#Collector-Class-SyslogCollector) | - | - | ls -1 /var/log/syslog* 2>/dev/null | grep -E '^/var/log/syslog(\.[0-9]+(\.gz)?)?$' || true | -| nodescraper.plugins.inband.uptime.uptime_plugin.UptimePlugin | [UptimeDataModel](#UptimeDataModel-Model) | [UptimeCollector](#Collector-Class-UptimeCollector) | - | - | uptime | +| AmdSmiPlugin | firmware --json
list --json
partition --json
process --json
ras --cper --folder={folder}
static -g all --json
static -g {gpu_id} --json
version --json | **Analyzer Args:**
- `check_static_data`: bool
- `expected_gpu_processes`: Optional[int]
- `expected_max_power`: Optional[int]
- `expected_driver_version`: Optional[str]
- `expected_memory_partition_mode`: Optional[str]
- `expected_compute_partition_mode`: Optional[str]
- `expected_pldm_version`: Optional[str]
- `l0_to_recovery_count_error_threshold`: Optional[int]
- `l0_to_recovery_count_warning_threshold`: Optional[int]
- `vendorid_ep`: Optional[str]
- `vendorid_ep_vf`: Optional[str]
- `devid_ep`: Optional[str]
- `devid_ep_vf`: Optional[str]
- `sku_name`: Optional[str]
- `expected_xgmi_speed`: Optional[list[float]]
- `analysis_range_start`: Optional[datetime.datetime]
- `analysis_range_end`: Optional[datetime.datetime] | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | +| BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str]
- `regex_match`: bool | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | +| CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, list]
- `banned_cmdline`: Union[str, list] | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | +| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor \| Measure-Object).Count"
lspci -d {vendorid_ep}: \| grep -i 'VGA\\|Display\\|3D' \| wc -l
powershell -Command "(wmic path win32_VideoController get name \| findstr AMD \| Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: \| grep -i 'Virtual Function' \| wc -l
powershell -Command "(Get-VMHostPartitionableGpu \| Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]]
- `gpu_count`: Optional[list[int]]
- `vf_count`: Optional[list[int]] | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) | +| DimmPlugin | sh -c 'dmidecode -t 17 \| tr -s " " \| grep -v "Volatile\\|None\\|Module" \| grep Size' 2>/dev/null
dmidecode
wmic memorychip get Capacity | - | [DimmDataModel](#DimmDataModel-Model) | [DimmCollector](#Collector-Class-DimmCollector) | - | +| DkmsPlugin | dkms status
dkms --version | **Analyzer Args:**
- `dkms_status`: Union[str, list]
- `dkms_version`: Union[str, list]
- `regex_match`: bool | [DkmsDataModel](#DkmsDataModel-Model) | [DkmsCollector](#Collector-Class-DkmsCollector) | [DkmsAnalyzer](#Data-Analyzer-Class-DkmsAnalyzer) | +| DmesgPlugin | dmesg --time-format iso -x
ls -1 /var/log/dmesg* 2>/dev/null \| grep -E '^/var/log/dmesg(\.[0-9]+(\.gz)?)?$' \|\| true | **Built-in Regexes:**
- Out of memory error: `(?:oom_kill_process.*)\|(?:Out of memory.*)`
- I/O Page Fault: `IO_PAGE_FAULT`
- Kernel Panic: `\bkernel panic\b.*`
- SQ Interrupt: `sq_intr`
- SRAM ECC: `sram_ecc.*`
- Failed to load driver. IP hardware init error.: `\[amdgpu\]\] \*ERROR\* hw_init of IP block.*`
- Failed to load driver. IP software init error.: `\[amdgpu\]\] \*ERROR\* sw_init of IP block.*`
- Real Time throttling activated: `sched: RT throttling activated.*`
- RCU preempt detected stalls: `rcu_preempt detected stalls.*`
- RCU preempt self-detected stall: `rcu_preempt self-detected stall.*`
- QCM fence timeout: `qcm fence wait loop timeout.*`
- General protection fault: `(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protectio...`
- Segmentation fault: `(?:segfault.*in .*\[)\|(?:[Ss]egmentation [Ff]au...`
- Failed to disallow cf state: `amdgpu: Failed to disallow cf state.*`
- Failed to terminate tmr: `\*ERROR\* Failed to terminate tmr.*`
- Suspend of IP block failed: `\*ERROR\* suspend of IP block <\w+> failed.*`
- amdgpu Page Fault: `(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S...`
- Page Fault: `page fault for address.*`
- Fatal error during GPU init: `(?:amdgpu)(.*Fatal error during GPU init)\|(Fata...`
- PCIe AER Error: `(?:pcieport )(.*AER: aer_status.*)\|(aer_status.*)`
- Failed to read journal file: `Failed to read journal file.*`
- Journal file corrupted or uncleanly shut down: `journal corrupted or uncleanly shut down.*`
- ACPI BIOS Error: `ACPI BIOS Error`
- ACPI Error: `ACPI Error`
- Filesystem corrupted!: `EXT4-fs error \(device .*\):`
- Error in buffered IO, check filesystem integrity: `(Buffer I\/O error on dev)(?:ice)? (\w+)`
- PCIe card no longer present: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- PCIe Link Down: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- Mismatched clock configuration between PCIe device and host: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(curren...`
- RAS Correctable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Uncorrectable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Deferred Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Corrected PCIe Error: `((?:\[Hardware Error\]:\s+)?event severity: cor...`
- GPU Reset: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- GPU reset failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- MCE Error: `\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}`
- Mode 2 Reset Failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)? (...`
- RAS Corrected Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- SGX Error: `x86/cpu: SGX disabled by BIOS`
- GPU Throttled: `amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU ...`
- LNet: ko2iblnd has no matching interfaces: `(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No ma...`
- LNet: Error starting up LNI: `(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\...`
- Lustre: network initialisation failed: `LustreError:.*ptlrpc_init_portals\(\).*network ...` | [DmesgData](#DmesgData-Model) | [DmesgCollector](#Collector-Class-DmesgCollector) | [DmesgAnalyzer](#Data-Analyzer-Class-DmesgAnalyzer) | +| JournalPlugin | journalctl --no-pager --system --output=short-iso | - | [JournalData](#JournalData-Model) | [JournalCollector](#Collector-Class-JournalCollector) | - | +| KernelPlugin | sh -c 'uname -a'
wmic os get Version /Value | **Analyzer Args:**
- `exp_kernel`: Union[str, list]
- `regex_match`: bool | [KernelDataModel](#KernelDataModel-Model) | [KernelCollector](#Collector-Class-KernelCollector) | [KernelAnalyzer](#Data-Analyzer-Class-KernelAnalyzer) | +| KernelModulePlugin | cat /proc/modules
modinfo amdgpu
wmic os get Version /Value | **Analyzer Args:**
- `kernel_modules`: dict[str, dict]
- `regex_filter`: list[str] | [KernelModuleDataModel](#KernelModuleDataModel-Model) | [KernelModuleCollector](#Collector-Class-KernelModuleCollector) | [KernelModuleAnalyzer](#Data-Analyzer-Class-KernelModuleAnalyzer) | +| MemoryPlugin | free -b
lsmem
numactl -H
wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value | **Analyzer Args:**
- `ratio`: float
- `memory_threshold`: str | [MemoryDataModel](#MemoryDataModel-Model) | [MemoryCollector](#Collector-Class-MemoryCollector) | [MemoryAnalyzer](#Data-Analyzer-Class-MemoryAnalyzer) | +| NetworkPlugin | ip addr show
sudo ethtool {interface}
ip neighbor show
ip route show
ip rule show | - | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | - | +| NvmePlugin | nvme smart-log {dev}
nvme error-log {dev} --log-entries=256
nvme id-ctrl {dev}
nvme id-ns {dev}{ns}
nvme fw-log {dev}
nvme self-test-log {dev}
nvme get-log {dev} --log-id=6 --log-len=512
nvme telemetry-log {dev} --output-file={dev}_{f_name} | - | [NvmeDataModel](#NvmeDataModel-Model) | [NvmeCollector](#Collector-Class-NvmeCollector) | - | +| OsPlugin | sh -c '( lsb_release -ds \|\| (cat /etc/*release \| grep PRETTY_NAME) \|\| uname -om ) 2>/dev/null \| head -n1'
cat /etc/*release \| grep VERSION_ID
wmic os get Version /value
wmic os get Caption /Value | **Analyzer Args:**
- `exp_os`: Union[str, list]
- `exact_match`: bool | [OsDataModel](#OsDataModel-Model) | [OsCollector](#Collector-Class-OsCollector) | [OsAnalyzer](#Data-Analyzer-Class-OsAnalyzer) | +| PackagePlugin | dnf list --installed
dpkg-query -W
pacman -Q
cat /etc/*release
wmic product get name,version | **Analyzer Args:**
- `exp_package_ver`: Dict[str, Optional[str]]
- `regex_match`: bool
- `rocm_regex`: Optional[str]
- `enable_rocm_regex`: bool | [PackageDataModel](#PackageDataModel-Model) | [PackageCollector](#Collector-Class-PackageCollector) | [PackageAnalyzer](#Data-Analyzer-Class-PackageAnalyzer) | +| PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int
- `exp_width`: int
- `exp_sriov_count`: int
- `exp_gpu_count_override`: Optional[int]
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType]
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType]
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | +| ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int
- `max_cpu_usage`: float | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | +| RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env \| grep -Ei 'rocm\|hsa\|hip\|mpi\|openmp\|ucx\|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d /opt/rocm*
ls -v -d /opt/rocm-[3-7]* \| tail -1
ldconfig -p \| grep -i -E 'rocm'
/opt/rocm/.info/version-rocm
/opt/rocm/.info/version | **Analyzer Args:**
- `exp_rocm`: Union[str, list]
- `exp_rocm_latest`: str | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | +| StoragePlugin | sh -c 'df -lH -B1 \| grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | +| SysctlPlugin | sysctl -n | **Analyzer Args:**
- `exp_vm_swappiness`: Optional[int]
- `exp_vm_numa_balancing`: Optional[int]
- `exp_vm_oom_kill_allocating_task`: Optional[int]
- `exp_vm_compaction_proactiveness`: Optional[int]
- `exp_vm_compact_unevictable_allowed`: Optional[int]
- `exp_vm_extfrag_threshold`: Optional[int]
- `exp_vm_zone_reclaim_mode`: Optional[int]
- `exp_vm_dirty_background_ratio`: Optional[int]
- `exp_vm_dirty_ratio`: Optional[int]
- `exp_vm_dirty_writeback_centisecs`: Optional[int]
- `exp_kernel_numa_balancing`: Optional[int] | [SysctlDataModel](#SysctlDataModel-Model) | [SysctlCollector](#Collector-Class-SysctlCollector) | [SysctlAnalyzer](#Data-Analyzer-Class-SysctlAnalyzer) | +| SyslogPlugin | ls -1 /var/log/syslog* 2>/dev/null \| grep -E '^/var/log/syslog(\.[0-9]+(\.gz)?)?$' \|\| true | - | [SyslogData](#SyslogData-Model) | [SyslogCollector](#Collector-Class-SyslogCollector) | - | +| UptimePlugin | uptime | - | [UptimeDataModel](#UptimeDataModel-Model) | [UptimeCollector](#Collector-Class-UptimeCollector) | - | # Collectors +## Collector Class AmdSmiCollector + +### Description + +Class for collection of inband tool amd-smi data. + +**Bases**: ['InBandDataCollector'] + +**Link to code**: [amdsmi_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py) + +### Class Variables + +- **AMD_SMI_EXE**: `amd-smi` +- **SUPPORTED_OS_FAMILY**: `{}` +- **CMD_VERSION**: `version --json` +- **CMD_LIST**: `list --json` +- **CMD_PROCESS**: `process --json` +- **CMD_PARTITION**: `partition --json` +- **CMD_FIRMWARE**: `firmware --json` +- **CMD_STATIC**: `static -g all --json` +- **CMD_STATIC_GPU**: `static -g {gpu_id} --json` +- **CMD_RAS**: `ras --cper --folder={folder}` + +### Provides Data + +AmdSmiDataModel + +### Commands + +- firmware --json +- list --json +- partition --json +- process --json +- ras --cper --folder={folder} +- static -g all --json +- static -g {gpu_id} --json +- version --json + ## Collector Class BiosCollector ### Description @@ -72,6 +114,40 @@ CmdlineDataModel - cat /proc/cmdline +## Collector Class DeviceEnumerationCollector + +### Description + +Collect CPU and GPU count + +**Bases**: ['InBandDataCollector'] + +**Link to code**: [device_enumeration_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py) + +### Class Variables + +- **CMD_GPU_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep 
-i 'VGA\|Display\|3D' | wc -l` +- **CMD_VF_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l` +- **CMD_LSCPU_LINUX**: `lscpu` +- **CMD_LSHW_LINUX**: `lshw` +- **CMD_CPU_COUNT_WINDOWS**: `powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"` +- **CMD_GPU_COUNT_WINDOWS**: `powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"` +- **CMD_VF_COUNT_WINDOWS**: `powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count"` + +### Provides Data + +DeviceEnumerationDataModel + +### Commands + +- powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count" +- lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l +- powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count" +- lscpu +- lshw +- lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l +- powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" + ## Collector Class DimmCollector ### Description @@ -86,6 +162,7 @@ Collect data on installed DIMMs - **CMD_WINDOWS**: `wmic memorychip get Capacity` - **CMD**: `sh -c 'dmidecode -t 17 | tr -s " " | grep -v "Volatile\|None\|Module" | grep Size' 2>/dev/null` +- **CMD_DMIDECODE_FULL**: `dmidecode` ### Provides Data @@ -94,6 +171,7 @@ DimmDataModel ### Commands - sh -c 'dmidecode -t 17 | tr -s " " | grep -v "Volatile\|None\|Module" | grep Size' 2>/dev/null +- dmidecode - wmic memorychip get Capacity ## Collector Class DkmsCollector @@ -182,7 +260,7 @@ Read kernel version ### Class Variables - **CMD_WINDOWS**: `wmic os get Version /Value` -- **CMD**: `sh -c 'uname -r'` +- **CMD**: `sh -c 'uname -a'` ### Provides Data @@ -190,7 +268,7 @@ KernelDataModel ### Commands -- sh -c 'uname -r' +- sh -c 'uname -a' - wmic os get Version /Value ## Collector Class KernelModuleCollector @@ -207,6 +285,7 @@ Read kernel modules and associated parameters - **CMD_WINDOWS**: `wmic os 
get Version /Value` - **CMD**: `cat /proc/modules` +- **CMD_MODINFO_AMDGPU**: `modinfo amdgpu` ### Provides Data @@ -215,6 +294,7 @@ KernelModuleDataModel ### Commands - cat /proc/modules +- modinfo amdgpu - wmic os get Version /Value ## Collector Class MemoryCollector @@ -231,6 +311,8 @@ Collect memory usage details - **CMD_WINDOWS**: `wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value` - **CMD**: `free -b` +- **CMD_LSMEM**: `lsmem` +- **CMD_NUMACTL**: `numactl -H` ### Provides Data @@ -239,8 +321,40 @@ MemoryDataModel ### Commands - free -b +- lsmem +- numactl -H - wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value +## Collector Class NetworkCollector + +### Description + +Collect network configuration details using ip command + +**Bases**: ['InBandDataCollector'] + +**Link to code**: [network_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/network/network_collector.py) + +### Class Variables + +- **CMD_ADDR**: `ip addr show` +- **CMD_ROUTE**: `ip route show` +- **CMD_RULE**: `ip rule show` +- **CMD_NEIGHBOR**: `ip neighbor show` +- **CMD_ETHTOOL_TEMPLATE**: `sudo ethtool {interface}` + +### Provides Data + +NetworkDataModel + +### Commands + +- ip addr show +- sudo ethtool {interface} +- ip neighbor show +- ip route show +- ip rule show + ## Collector Class NvmeCollector ### Description @@ -254,7 +368,16 @@ Collect NVMe details from the system. 
### Class Variables - **CMD_LINUX**: `{'smart_log': 'nvme smart-log {dev}', 'error_log': 'nvme error-log {dev} --log-entries=256', 'id_ctrl': 'nvme id-ctrl {dev}', 'id_ns': 'nvme id-ns {dev}{ns}', 'fw_log': 'nvme fw-log {dev}', 'self_test_log': 'nvme self-test-log {dev}', 'get_log': 'nvme get-log {dev} --log-id=6 --log-len=512', 'telemetry_log': 'nvme telemetry-log {dev} --output-file={dev}_{f_name}'}` -- **CMD_TEMPLATES**: `['nvme smart-log {dev}', 'nvme error-log {dev} --log-entries=256', 'nvme id-ctrl {dev}', 'nvme id-ns {dev}{ns}', 'nvme fw-log {dev}', 'nvme self-test-log {dev}', 'nvme get-log {dev} --log-id=6 --log-len=512', 'nvme telemetry-log {dev} --output-file={dev}_{f_name}']` +- **CMD_TEMPLATES**: `[ + nvme smart-log {dev}, + nvme error-log {dev} --log-entries=256, + nvme id-ctrl {dev}, + nvme id-ns {dev}{ns}, + nvme fw-log {dev}, + nvme self-test-log {dev}, + nvme get-log {dev} --log-id=6 --log-len=512, + nvme telemetry-log {dev} --output-file={dev}_{f_name} +]` - **TELEMETRY_FILENAME**: `telemetry_log.bin` ### Provides Data @@ -331,6 +454,60 @@ PackageDataModel - cat /etc/*release - wmic product get name,version +## Collector Class PcieCollector + +### Description + +class for collection of PCIe data only supports Linux OS type. + + This class collects the PCIE config space using the lspci hex dump and then parses the hex dump to get the + PCIe configuration space for the GPUs in the system. If the system interaction level is set to STANDARD or higher, + then the entire pcie configuration space is collected for the GPUs in the system. If the system interaction level + is set to SURFACE then, only the first 64 bytes of the pcie configuration space is collected for the GPUs in the system. 
+ + This class will collect important PCIe data from the system running the commands + - `lspci -vvv` : Verbose collection of PCIe data + - `lspci -vvvt`: Verbose tree view of PCIe data + - `lspci -PP`: Path view of PCIe data for the GPUs + - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo: + - `lspci -xxxx`: Hex view of PCIe data for the GPUs + - otherwise the following commands will be run without sudo: + - `lspci -x`: Hex view of PCIe data for the GPUs + - `lspci -d :` : Count the number of GPUs in the system with this command + - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo: + - The sudo lspci -xxxx command is used to collect the PCIe configuration space for the GPUs in the system + - otherwise the following commands will be run without sudo: + - The lspci -x command is used to collect the PCIe configuration space for the GPUs in the system + +**Bases**: ['InBandDataCollector'] + +**Link to code**: [pcie_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/pcie/pcie_collector.py) + +### Class Variables + +- **SUPPORTED_OS_FAMILY**: `{}` +- **CMD_LSPCI_VERBOSE**: `lspci -vvv` +- **CMD_LSPCI_VERBOSE_TREE**: `lspci -vvvt` +- **CMD_LSPCI_PATH**: `lspci -PP` +- **CMD_LSPCI_HEX_SUDO**: `lspci -xxxx` +- **CMD_LSPCI_HEX**: `lspci -x` +- **CMD_LSPCI_AMD_DEVICES**: `lspci -d {vendor_id}: -nn` +- **CMD_LSPCI_PATH_DEVICE**: `lspci -PP -d {vendor_id}:{dev_id}` + +### Provides Data + +PcieDataModel + +### Commands + +- lspci -d {vendor_id}: -nn +- lspci -x +- lspci -xxxx +- lspci -PP +- lspci -PP -d {vendor_id}:{dev_id} +- lspci -vvv +- lspci -vvvt + ## Collector Class ProcessCollector ### Description @@ -372,6 +549,14 @@ Collect ROCm version data - **SUPPORTED_OS_FAMILY**: `{}` - **CMD_VERSION_PATHS**: `['/opt/rocm/.info/version-rocm', '/opt/rocm/.info/version']` +- **CMD_ROCMINFO**: `{rocm_path}/bin/rocminfo` +- 
**CMD_ROCM_LATEST**: `ls -v -d /opt/rocm-[3-7]* | tail -1` +- **CMD_ROCM_DIRS**: `ls -v -d /opt/rocm*` +- **CMD_LD_CONF**: `grep -i -E 'rocm' /etc/ld.so.conf.d/*` +- **CMD_ROCM_LIBS**: `ldconfig -p | grep -i -E 'rocm'` +- **CMD_ENV_VARS**: `env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'` +- **CMD_CLINFO**: `{rocm_path}/opencl/bin/*/clinfo` +- **CMD_KFD_PROC**: `ls /sys/class/kfd/kfd/proc/` ### Provides Data @@ -379,6 +564,14 @@ RocmDataModel ### Commands +- {rocm_path}/opencl/bin/*/clinfo +- env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen' +- ls /sys/class/kfd/kfd/proc/ +- grep -i -E 'rocm' /etc/ld.so.conf.d/* +- {rocm_path}/bin/rocminfo +- ls -v -d /opt/rocm* +- ls -v -d /opt/rocm-[3-7]* | tail -1 +- ldconfig -p | grep -i -E 'rocm' - /opt/rocm/.info/version-rocm - /opt/rocm/.info/version @@ -476,6 +669,38 @@ UptimeDataModel # Data Models +## AmdSmiDataModel Model + +### Description + +Data model for amd-smi data. + + Optionals are used to allow for the data to be missing, + This makes the data class more flexible for the analyzer + which consumes only the required data. If any more data is + required for the analyzer then they should not be set to + default. 
+ +**Link to code**: [amdsmidata.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/amdsmi/amdsmidata.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **version**: `Optional[nodescraper.plugins.inband.amdsmi.amdsmidata.AmdSmiVersion]` +- **gpu_list**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.AmdSmiListItem]]` +- **partition**: `Optional[nodescraper.plugins.inband.amdsmi.amdsmidata.Partition]` +- **process**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.Processes]]` +- **topology**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.Topo]]` +- **firmware**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.Fw]]` +- **bad_pages**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.BadPages]]` +- **static**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.AmdSmiStatic]]` +- **metric**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.AmdSmiMetric]]` +- **xgmi_metric**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.XgmiMetrics]]` +- **xgmi_link**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.XgmiLinks]]` +- **cper_data**: `Optional[list[nodescraper.models.datamodel.FileModel]]` +- **amdsmitst_data**: `nodescraper.plugins.inband.amdsmi.amdsmidata.AmdSmiTstData` + ## BiosDataModel Model **Link to code**: [biosdata.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/bios/biosdata.py) @@ -484,7 +709,7 @@ UptimeDataModel ### Model annotations and fields -- **bios_version**: `` +- **bios_version**: `str` ## CmdlineDataModel Model @@ -494,7 +719,21 @@ UptimeDataModel ### Model annotations and fields -- **cmdline**: `` +- **cmdline**: `str` + +## DeviceEnumerationDataModel Model + +**Link to code**: [deviceenumdata.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/device_enumeration/deviceenumdata.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields 
+ +- **cpu_count**: `Optional[int]` +- **gpu_count**: `Optional[int]` +- **vf_count**: `Optional[int]` +- **lscpu_output**: `Optional[str]` +- **lshw_output**: `Optional[str]` ## DimmDataModel Model @@ -504,7 +743,7 @@ UptimeDataModel ### Model annotations and fields -- **dimms**: `` +- **dimms**: `str` ## DkmsDataModel Model @@ -514,8 +753,8 @@ UptimeDataModel ### Model annotations and fields -- **status**: `typing.Optional[str]` -- **version**: `typing.Optional[str]` +- **status**: `Optional[str]` +- **version**: `Optional[str]` ## DmesgData Model @@ -529,7 +768,7 @@ Data model for in band dmesg log ### Model annotations and fields -- **dmesg_content**: `` +- **dmesg_content**: `str` ## JournalData Model @@ -543,7 +782,7 @@ Data model for journal logs ### Model annotations and fields -- **journal_log**: `` +- **journal_log**: `str` ## KernelDataModel Model @@ -553,7 +792,8 @@ Data model for journal logs ### Model annotations and fields -- **kernel_version**: `` +- **kernel_info**: `str` +- **kernel_version**: `str` ## KernelModuleDataModel Model @@ -563,18 +803,43 @@ Data model for journal logs ### Model annotations and fields -- **kernel_modules**: `` +- **kernel_modules**: `dict` +- **amdgpu_modinfo**: `Optional[nodescraper.plugins.inband.kernel_module.kernel_module_data.ModuleInfo]` ## MemoryDataModel Model +### Description + +Memory data model + **Link to code**: [memorydata.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/memory/memorydata.py) **Bases**: ['DataModel'] ### Model annotations and fields -- **mem_free**: `` -- **mem_total**: `` +- **mem_free**: `str` +- **mem_total**: `str` +- **lsmem_data**: `Optional[nodescraper.plugins.inband.memory.memorydata.LsmemData]` +- **numa_topology**: `Optional[nodescraper.plugins.inband.memory.memorydata.NumaTopology]` + +## NetworkDataModel Model + +### Description + +Complete network configuration data + +**Link to code**: 
[networkdata.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/network/networkdata.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **interfaces**: `List[nodescraper.plugins.inband.network.networkdata.NetworkInterface]` +- **routes**: `List[nodescraper.plugins.inband.network.networkdata.Route]` +- **rules**: `List[nodescraper.plugins.inband.network.networkdata.RoutingRule]` +- **neighbors**: `List[nodescraper.plugins.inband.network.networkdata.Neighbor]` +- **ethtool_info**: `Dict[str, nodescraper.plugins.inband.network.networkdata.EthtoolInfo]` ## NvmeDataModel Model @@ -594,8 +859,8 @@ Data model for journal logs ### Model annotations and fields -- **os_name**: `` -- **os_version**: `` +- **os_name**: `str` +- **os_version**: `str` ## PackageDataModel Model @@ -610,6 +875,35 @@ Pacakge data contains the package data for the system ### Model annotations and fields - **version_info**: `dict[str, str]` +- **rocm_regex**: `str` +- **enable_rocm_regex**: `bool` + +## PcieDataModel Model + +### Description + +class for collection of PCIe data. + + Optionals are used to allow for the data to be missing, + This makes the data class more flexible for the analyzer + which consumes only the required data. If any more data is + required for the analyzer then they should not be set to + default. 
+ + - pcie_cfg_space: A dictionary of PCIe cfg space for the GPUs obtained with setpci command + - lspci_verbose: Verbose collection of PCIe data + - lspci_verbose_tree: Tree view of PCIe data + - lspci_path: Path view of PCIe data for the GPUs + - lspci_hex: Hex view of PCIe data for the GPUs + +**Link to code**: [pcie_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/pcie/pcie_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **pcie_cfg_space**: `Dict[Annotated[str, AfterValidator(func=validate_bdf)], nodescraper.plugins.inband.pcie.pcie_data.PcieCfgSpace]` +- **vf_pcie_cfg_space**: `Optional[Dict[Annotated[str, AfterValidator(func=validate_bdf)], nodescraper.plugins.inband.pcie.pcie_data.PcieCfgSpace]]` ## ProcessDataModel Model @@ -619,9 +913,9 @@ Pacakge data contains the package data for the system ### Model annotations and fields -- **kfd_process**: `typing.Optional[int]` -- **cpu_usage**: `typing.Optional[float]` -- **processes**: `typing.Optional[list[tuple[str, str]]]` +- **kfd_process**: `Optional[int]` +- **cpu_usage**: `Optional[float]` +- **processes**: `Optional[list[tuple[str, str]]]` ## RocmDataModel Model @@ -631,7 +925,15 @@ Pacakge data contains the package data for the system ### Model annotations and fields -- **rocm_version**: `` +- **rocm_version**: `str` +- **rocminfo**: `List[str]` +- **rocm_latest_versioned_path**: `str` +- **rocm_all_paths**: `List[str]` +- **ld_conf_rocm**: `List[str]` +- **rocm_libs**: `List[str]` +- **env_vars**: `List[str]` +- **clinfo**: `List[str]` +- **kfd_proc**: `List[str]` ## StorageDataModel Model @@ -651,17 +953,17 @@ Pacakge data contains the package data for the system ### Model annotations and fields -- **vm_swappiness**: `typing.Optional[int]` -- **vm_numa_balancing**: `typing.Optional[int]` -- **vm_oom_kill_allocating_task**: `typing.Optional[int]` -- **vm_compaction_proactiveness**: `typing.Optional[int]` -- **vm_compact_unevictable_allowed**: 
`typing.Optional[int]` -- **vm_extfrag_threshold**: `typing.Optional[int]` -- **vm_zone_reclaim_mode**: `typing.Optional[int]` -- **vm_dirty_background_ratio**: `typing.Optional[int]` -- **vm_dirty_ratio**: `typing.Optional[int]` -- **vm_dirty_writeback_centisecs**: `typing.Optional[int]` -- **kernel_numa_balancing**: `typing.Optional[int]` +- **vm_swappiness**: `Optional[int]` +- **vm_numa_balancing**: `Optional[int]` +- **vm_oom_kill_allocating_task**: `Optional[int]` +- **vm_compaction_proactiveness**: `Optional[int]` +- **vm_compact_unevictable_allowed**: `Optional[int]` +- **vm_extfrag_threshold**: `Optional[int]` +- **vm_zone_reclaim_mode**: `Optional[int]` +- **vm_dirty_background_ratio**: `Optional[int]` +- **vm_dirty_ratio**: `Optional[int]` +- **vm_dirty_writeback_centisecs**: `Optional[int]` +- **kernel_numa_balancing**: `Optional[int]` ## SyslogData Model @@ -685,11 +987,21 @@ Data model for in band syslog logs ### Model annotations and fields -- **current_time**: `` -- **uptime**: `` +- **current_time**: `str` +- **uptime**: `str` # Data Analyzers +## Data Analyzer Class AmdSmiAnalyzer + +### Description + +Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics + +**Bases**: ['CperAnalysisTaskMixin', 'DataAnalyzer'] + +**Link to code**: [amdsmi_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py) + ## Data Analyzer Class BiosAnalyzer ### Description @@ -710,6 +1022,17 @@ Check cmdline matches expected kernel cmdline **Link to code**: [cmdline_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/cmdline/cmdline_analyzer.py) +## Data Analyzer Class DeviceEnumerationAnalyzer + +### Description + +Check Device Enumeration matches expected cpu and gpu count + supported by all OSs, SKUs, and platforms. 
+ +**Bases**: ['DataAnalyzer'] + +**Link to code**: [device_enumeration_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py) + ## Data Analyzer Class DkmsAnalyzer ### Description @@ -732,7 +1055,104 @@ Check dmesg for errors ### Class Variables -- **ERROR_REGEX**: `[ErrorRegex(regex=re.compile('(?:oom_kill_process.*)|(?:Out of memory.*)'), message='Out of memory error', event_category=, event_priority=), ErrorRegex(regex=re.compile('IO_PAGE_FAULT'), message='I/O Page Fault', event_category=, event_priority=), ErrorRegex(regex=re.compile('\\bkernel panic\\b.*', re.IGNORECASE), message='Kernel Panic', event_category=, event_priority=), ErrorRegex(regex=re.compile('sq_intr'), message='SQ Interrupt', event_category=, event_priority=), ErrorRegex(regex=re.compile('sram_ecc.*'), message='SRAM ECC', event_category=, event_priority=), ErrorRegex(regex=re.compile('\\[amdgpu\\]\\] \\*ERROR\\* hw_init of IP block.*'), message='Failed to load driver. IP hardware init error.', event_category=, event_priority=), ErrorRegex(regex=re.compile('\\[amdgpu\\]\\] \\*ERROR\\* sw_init of IP block.*'), message='Failed to load driver. 
IP software init error.', event_category=, event_priority=), ErrorRegex(regex=re.compile('sched: RT throttling activated.*'), message='Real Time throttling activated', event_category=, event_priority=), ErrorRegex(regex=re.compile('rcu_preempt detected stalls.*'), message='RCU preempt detected stalls', event_category=, event_priority=), ErrorRegex(regex=re.compile('rcu_preempt self-detected stall.*'), message='RCU preempt self-detected stall', event_category=, event_priority=), ErrorRegex(regex=re.compile('qcm fence wait loop timeout.*'), message='QCM fence timeout', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:[\\w-]+(?:\\[[0-9.]+\\])?\\s+)?general protection fault[^\\n]*'), message='General protection fault', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:segfault.*in .*\\[)|(?:[Ss]egmentation [Ff]ault.*)|(?:[Ss]egfault.*)'), message='Segmentation fault', event_category=, event_priority=), ErrorRegex(regex=re.compile('amdgpu: Failed to disallow cf state.*'), message='Failed to disallow cf state', event_category=, event_priority=), ErrorRegex(regex=re.compile('\\*ERROR\\* Failed to terminate tmr.*'), message='Failed to terminate tmr', event_category=, event_priority=), ErrorRegex(regex=re.compile('\\*ERROR\\* suspend of IP block <\\w+> failed.*'), message='Suspend of IP block failed', event_category=, event_priority=), ErrorRegex(regex=re.compile('(amdgpu \\w{4}:\\w{2}:\\w{2}\\.\\w:\\s+amdgpu:\\s+\\[\\S+\\]\\s*(?:retry|no-retry)? 
page fault[^\\n]*)(?:\\n[^\\n]*(amdgpu \\w{4}:\\w{2}:\\w{2}\\.\\w:\\s+amdgpu:[^\\n]*))?(?:\\n[^\\n]*(amdgpu \\w{4}:, re.MULTILINE), message='amdgpu Page Fault', event_category=, event_priority=), ErrorRegex(regex=re.compile('page fault for address.*'), message='Page Fault', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:amdgpu)(.*Fatal error during GPU init)|(Fatal error during GPU init)'), message='Fatal error during GPU init', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:pcieport )(.*AER: aer_status.*)|(aer_status.*)'), message='PCIe AER Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('Failed to read journal file.*'), message='Failed to read journal file', event_category=, event_priority=), ErrorRegex(regex=re.compile('journal corrupted or uncleanly shut down.*'), message='Journal file corrupted or uncleanly shut down', event_category=, event_priority=), ErrorRegex(regex=re.compile('ACPI BIOS Error'), message='ACPI BIOS Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('ACPI Error'), message='ACPI Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('EXT4-fs error \\(device .*\\):'), message='Filesystem corrupted!', event_category=, event_priority=), ErrorRegex(regex=re.compile('(Buffer I\\/O error on dev)(?:ice)? 
(\\w+)'), message='Error in buffered IO, check filesystem integrity', event_category=, event_priority=), ErrorRegex(regex=re.compile('pcieport (\\w+:\\w+:\\w+\\.\\w+):\\s+(\\w+):\\s+(Slot\\(\\d+\\)):\\s+(Card not present)'), message='PCIe card no longer present', event_category=, event_priority=), ErrorRegex(regex=re.compile('pcieport (\\w+:\\w+:\\w+\\.\\w+):\\s+(\\w+):\\s+(Slot\\(\\d+\\)):\\s+(Link Down)'), message='PCIe Link Down', event_category=, event_priority=), ErrorRegex(regex=re.compile('pcieport (\\w+:\\w+:\\w+\\.\\w+):\\s+(\\w+):\\s+(current common clock configuration is inconsistent, reconfiguring)'), message='Mismatched clock configuration between PCIe device and host', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.* correctable hardware errors detected in total in \\w+ block.*)'), message='RAS Correctable Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.* uncorrectable hardware errors detected in \\w+ block.*)'), message='RAS Uncorrectable Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.* deferred hardware errors detected in \\w+ block.*)'), message='RAS Deferred Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('((?:\\[Hardware Error\\]:\\s+)?event severity: corrected.*)\\n.*(\\[Hardware Error\\]:\\s+Error \\d+, type: corrected.*)\\n.*(\\[Hardware Error\\]:\\s+section_type: PCIe error.*)'), message='RAS Corrected PCIe Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.*GPU reset begin.*)'), message='GPU Reset', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.*GPU reset(?:\\(\\d+\\))? 
failed.*)'), message='GPU reset failed', event_category=, event_priority=), ErrorRegex(regex=re.compile('(Accelerator Check Architecture[^\\n]*)(?:\\n[^\\n]*){0,10}?(amdgpu[ 0-9a-fA-F:.]+:? [^\\n]*entry\\[\\d+\\]\\.STATUS=0x[0-9a-fA-F]+)(?:\\n[^\\n]*){0,5}?(amdgpu[ 0-9a-fA-F:.]+:? [^\\n]*entry\\[\\d+\\], re.MULTILINE), message='ACA Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('(Accelerator Check Architecture[^\\n]*)(?:\\n[^\\n]*){0,10}?(amdgpu[ 0-9a-fA-F:.]+:? [^\\n]*CONTROL=0x[0-9a-fA-F]+)(?:\\n[^\\n]*){0,5}?(amdgpu[ 0-9a-fA-F:.]+:? [^\\n]*STATUS=0x[0-9a-fA-F]+)(?:\\n[^\\, re.MULTILINE), message='ACA Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('\\[Hardware Error\\]:.+MC\\d+_STATUS.*(?:\\n.*){0,5}'), message='MCE Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)? (.*Mode2 reset failed.*)'), message='Mode 2 Reset Failed', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.*\\[Hardware Error\\]: Corrected error.*)'), message='RAS Corrected Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('x86/cpu: SGX disabled by BIOS'), message='SGX Error', event_category=, event_priority=), ErrorRegex(regex=re.compile('amdgpu \\w{4}:\\w{2}:\\w{2}.\\w: amdgpu: WARN: GPU is throttled.*'), message='GPU Throttled', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\[[^\\]]+\\]\\s*)?LNetError:.*ko2iblnd:\\s*No matching interfaces', re.IGNORECASE), message='LNet: ko2iblnd has no matching interfaces', event_category=, event_priority=), ErrorRegex(regex=re.compile('(?:\\[[^\\]]+\\]\\s*)?LNetError:\\s*.*Error\\s*-?\\d+\\s+starting up LNI\\s+\\w+', re.IGNORECASE), message='LNet: Error starting up LNI', event_category=, event_priority=), ErrorRegex(regex=re.compile('LustreError:.*ptlrpc_init_portals\\(\\).*network initiali[sz]ation failed', re.IGNORECASE), 
message='Lustre: network initialisation failed', event_category=, event_priority=)]` +- **ERROR_REGEX**: `[ + regex=re.compile('(?:oom_kill_process.*)|(?:Out of memory.*)') message='Out of memory error' event_category= event_priority=, + regex=re.compile('IO_PAGE_FAULT') message='I/O Page Fault' event_category= event_priority=, + regex=re.compile('\\bkernel panic\\b.*', re.IGNORECASE) message='Kernel Panic' event_category= event_priority=, + regex=re.compile('sq_intr') message='SQ Interrupt' event_category= event_priority=, + regex=re.compile('sram_ecc.*') message='SRAM ECC' event_category= event_priority=, + regex=re.compile('\\[amdgpu\\]\\] \\*ERROR\\* hw_init of IP block.*') message='Failed to load driver. IP hardware init error.' event_category= event_priority=, + regex=re.compile('\\[amdgpu\\]\\] \\*ERROR\\* sw_init of IP block.*') message='Failed to load driver. IP software init error.' event_category= event_priority=, + regex=re.compile('sched: RT throttling activated.*') message='Real Time throttling activated' event_category= event_priority=, + regex=re.compile('rcu_preempt detected stalls.*') message='RCU preempt detected stalls' event_category= event_priority=, + regex=re.compile('rcu_preempt self-detected stall.*') message='RCU preempt self-detected stall' event_category= event_priority=, + regex=re.compile('qcm fence wait loop timeout.*') message='QCM fence timeout' event_category= event_priority=, + regex=re.compile('(?:[\\w-]+(?:\\[[0-9.]+\\])?\\s+)?general protection fault[^\\n]*') message='General protection fault' event_category= event_priority=, + regex=re.compile('(?:segfault.*in .*\\[)|(?:[Ss]egmentation [Ff]ault.*)|(?:[Ss]egfault.*)') message='Segmentation fault' event_category= event_priority=, + regex=re.compile('amdgpu: Failed to disallow cf state.*') message='Failed to disallow cf state' event_category= event_priority=, + regex=re.compile('\\*ERROR\\* Failed to terminate tmr.*') message='Failed to terminate tmr' event_category= 
event_priority=, + regex=re.compile('\\*ERROR\\* suspend of IP block <\\w+> failed.*') message='Suspend of IP block failed' event_category= event_priority=, + regex=re.compile('(amdgpu \\w{4}:\\w{2}:\\w{2}\\.\\w:\\s+amdgpu:\\s+\\[\\S+\\]\\s*(?:retry|no-retry)? page fault[^\\n]*)(?:\\n[^\\n]*(amdgpu \\w{4}:\\w{2}:\\w{2}\\.\\w:\\s+amdgpu:[^\\n]*))?(?:\\n[^\\n]*(amdgpu \\w{4}:, re.MULTILINE) message='amdgpu Page Fault' event_category= event_priority=, + regex=re.compile('page fault for address.*') message='Page Fault' event_category= event_priority=, + regex=re.compile('(?:amdgpu)(.*Fatal error during GPU init)|(Fatal error during GPU init)') message='Fatal error during GPU init' event_category= event_priority=, + regex=re.compile('(?:pcieport )(.*AER: aer_status.*)|(aer_status.*)') message='PCIe AER Error' event_category= event_priority=, + regex=re.compile('Failed to read journal file.*') message='Failed to read journal file' event_category= event_priority=, + regex=re.compile('journal corrupted or uncleanly shut down.*') message='Journal file corrupted or uncleanly shut down' event_category= event_priority=, + regex=re.compile('ACPI BIOS Error') message='ACPI BIOS Error' event_category= event_priority=, + regex=re.compile('ACPI Error') message='ACPI Error' event_category= event_priority=, + regex=re.compile('EXT4-fs error \\(device .*\\):') message='Filesystem corrupted!' event_category= event_priority=, + regex=re.compile('(Buffer I\\/O error on dev)(?:ice)? 
(\\w+)') message='Error in buffered IO, check filesystem integrity' event_category= event_priority=, + regex=re.compile('pcieport (\\w+:\\w+:\\w+\\.\\w+):\\s+(\\w+):\\s+(Slot\\(\\d+\\)):\\s+(Card not present)') message='PCIe card no longer present' event_category= event_priority=, + regex=re.compile('pcieport (\\w+:\\w+:\\w+\\.\\w+):\\s+(\\w+):\\s+(Slot\\(\\d+\\)):\\s+(Link Down)') message='PCIe Link Down' event_category= event_priority=, + regex=re.compile('pcieport (\\w+:\\w+:\\w+\\.\\w+):\\s+(\\w+):\\s+(current common clock configuration is inconsistent, reconfiguring)') message='Mismatched clock configuration between PCIe device and host' event_category= event_priority=, + regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.* correctable hardware errors detected in total in \\w+ block.*)') message='RAS Correctable Error' event_category= event_priority=, + regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.* uncorrectable hardware errors detected in \\w+ block.*)') message='RAS Uncorrectable Error' event_category= event_priority=, + regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.* deferred hardware errors detected in \\w+ block.*)') message='RAS Deferred Error' event_category= event_priority=, + regex=re.compile('((?:\\[Hardware Error\\]:\\s+)?event severity: corrected.*)\\n.*(\\[Hardware Error\\]:\\s+Error \\d+, type: corrected.*)\\n.*(\\[Hardware Error\\]:\\s+section_type: PCIe error.*)') message='RAS Corrected PCIe Error' event_category= event_priority=, + regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.*GPU reset begin.*)') message='GPU Reset' event_category= event_priority=, + regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.*GPU reset(?:\\(\\d+\\))? failed.*)') message='GPU reset failed' event_category= event_priority=, + regex=re.compile('(Accelerator Check Architecture[^\\n]*)(?:\\n[^\\n]*){0,10}?(amdgpu[ 0-9a-fA-F:.]+:? 
[^\\n]*entry\\[\\d+\\]\\.STATUS=0x[0-9a-fA-F]+)(?:\\n[^\\n]*){0,5}?(amdgpu[ 0-9a-fA-F:.]+:? [^\\n]*entry\\[\\d+\\], re.MULTILINE) message='ACA Error' event_category= event_priority=, + regex=re.compile('(Accelerator Check Architecture[^\\n]*)(?:\\n[^\\n]*){0,10}?(amdgpu[ 0-9a-fA-F:.]+:? [^\\n]*CONTROL=0x[0-9a-fA-F]+)(?:\\n[^\\n]*){0,5}?(amdgpu[ 0-9a-fA-F:.]+:? [^\\n]*STATUS=0x[0-9a-fA-F]+)(?:\\n[^\\, re.MULTILINE) message='ACA Error' event_category= event_priority=, + regex=re.compile('\\[Hardware Error\\]:.+MC\\d+_STATUS.*(?:\\n.*){0,5}') message='MCE Error' event_category= event_priority=, + regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)? (.*Mode2 reset failed.*)') message='Mode 2 Reset Failed' event_category= event_priority=, + regex=re.compile('(?:\\d{4}-\\d+-\\d+T\\d+:\\d+:\\d+,\\d+[+-]\\d+:\\d+)?(.*\\[Hardware Error\\]: Corrected error.*)') message='RAS Corrected Error' event_category= event_priority=, + regex=re.compile('x86/cpu: SGX disabled by BIOS') message='SGX Error' event_category= event_priority=, + regex=re.compile('amdgpu \\w{4}:\\w{2}:\\w{2}.\\w: amdgpu: WARN: GPU is throttled.*') message='GPU Throttled' event_category= event_priority=, + regex=re.compile('(?:\\[[^\\]]+\\]\\s*)?LNetError:.*ko2iblnd:\\s*No matching interfaces', re.IGNORECASE) message='LNet: ko2iblnd has no matching interfaces' event_category= event_priority=, + regex=re.compile('(?:\\[[^\\]]+\\]\\s*)?LNetError:\\s*.*Error\\s*-?\\d+\\s+starting up LNI\\s+\\w+', re.IGNORECASE) message='LNet: Error starting up LNI' event_category= event_priority=, + regex=re.compile('LustreError:.*ptlrpc_init_portals\\(\\).*network initiali[sz]ation failed', re.IGNORECASE) message='Lustre: network initialisation failed' event_category= event_priority= +]` + +### Regex Patterns + +*46 items defined* + +- **Built-in Regexes:** +- - Out of memory error: `(?:oom_kill_process.*)|(?:Out of memory.*)` +- - I/O Page Fault: `IO_PAGE_FAULT` +- - Kernel Panic: `\bkernel panic\b.*` +- - SQ 
Interrupt: `sq_intr` +- - SRAM ECC: `sram_ecc.*` +- - Failed to load driver. IP hardware init error.: `\[amdgpu\]\] \*ERROR\* hw_init of IP block.*` +- - Failed to load driver. IP software init error.: `\[amdgpu\]\] \*ERROR\* sw_init of IP block.*` +- - Real Time throttling activated: `sched: RT throttling activated.*` +- - RCU preempt detected stalls: `rcu_preempt detected stalls.*` +- - RCU preempt self-detected stall: `rcu_preempt self-detected stall.*` +- - QCM fence timeout: `qcm fence wait loop timeout.*` +- - General protection fault: `(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protectio...` +- - Segmentation fault: `(?:segfault.*in .*\[)|(?:[Ss]egmentation [Ff]au...` +- - Failed to disallow cf state: `amdgpu: Failed to disallow cf state.*` +- - Failed to terminate tmr: `\*ERROR\* Failed to terminate tmr.*` +- - Suspend of IP block failed: `\*ERROR\* suspend of IP block <\w+> failed.*` +- - amdgpu Page Fault: `(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S...` +- - Page Fault: `page fault for address.*` +- - Fatal error during GPU init: `(?:amdgpu)(.*Fatal error during GPU init)|(Fata...` +- - PCIe AER Error: `(?:pcieport )(.*AER: aer_status.*)|(aer_status.*)` +- - Failed to read journal file: `Failed to read journal file.*` +- - Journal file corrupted or uncleanly shut down: `journal corrupted or uncleanly shut down.*` +- - ACPI BIOS Error: `ACPI BIOS Error` +- - ACPI Error: `ACPI Error` +- - Filesystem corrupted!: `EXT4-fs error \(device .*\):` +- - Error in buffered IO, check filesystem integrity: `(Buffer I\/O error on dev)(?:ice)? 
(\w+)` +- - PCIe card no longer present: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...` +- - PCIe Link Down: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...` +- - Mismatched clock configuration between PCIe device and host: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(curren...` +- - RAS Correctable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....` +- - RAS Uncorrectable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....` +- - RAS Deferred Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....` +- - RAS Corrected PCIe Error: `((?:\[Hardware Error\]:\s+)?event severity: cor...` +- - GPU Reset: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....` +- - GPU reset failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....` +- - ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...` +- - ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...` +- - MCE Error: `\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}` +- - Mode 2 Reset Failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)? 
(...` +- - RAS Corrected Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....` +- - SGX Error: `x86/cpu: SGX disabled by BIOS` +- - GPU Throttled: `amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU ...` +- - LNet: ko2iblnd has no matching interfaces: `(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No ma...` +- - LNet: Error starting up LNI: `(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\...` +- - Lustre: network initialisation failed: `LustreError:.*ptlrpc_init_portals\(\).*network ...` ## Data Analyzer Class KernelAnalyzer @@ -784,6 +1204,34 @@ Check the package version data against the expected package version data **Link to code**: [package_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/package/package_analyzer.py) +## Data Analyzer Class PcieAnalyzer + +### Description + +Check PCIe Data for errors + + This calls checks the following: + - PCIe link status for each BDF + - This checks if the link speed and width are as expected + - AER uncorrectable errors + - Checks PCIe AER uncorrectable error registers UNCORR_ERR_STAT_REG and reports any errors + - AER correctable errors + - Checks the AERs correctable error registers CORR_ERR_STAT_REG and reports any errors + - PCIe device status errors + - Checks PCIe device status errors reported in fields `CORR_ERR_DET` `NON_FATAL_ERR_DET` `FATAL_ERR_DET` `UR_DET` + - PCIe status errors + - Checks PCIe status errors reported in fields `MSTR_DATA_PAR_ERR` `SIGNALED_TARGET_ABORT` `RCVD_TARGET_ABORT` + `RCVD_MSTR_ABORT` `SIGNALED_SYS_ERR` `DET_PARITY_ERR` + +**Bases**: ['DataAnalyzer'] + +**Link to code**: [pcie_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/pcie/pcie_analyzer.py) + +### Class Variables + +- **GPU_BRIDGE_USP_ID**: `0x1501` +- **GPU_BRIDGE_DSP_ID**: `0x1500` + ## Data Analyzer Class ProcessAnalyzer ### Description @@ -826,6 +1274,32 @@ Check sysctl matches expected sysctl details # Analyzer Args +## Analyzer Args Class 
AmdSmiAnalyzerArgs + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/amdsmi/analyzer_args.py) + +### Annotations / fields + +- **check_static_data**: `bool` +- **expected_gpu_processes**: `Optional[int]` +- **expected_max_power**: `Optional[int]` +- **expected_driver_version**: `Optional[str]` +- **expected_memory_partition_mode**: `Optional[str]` +- **expected_compute_partition_mode**: `Optional[str]` +- **expected_pldm_version**: `Optional[str]` +- **l0_to_recovery_count_error_threshold**: `Optional[int]` +- **l0_to_recovery_count_warning_threshold**: `Optional[int]` +- **vendorid_ep**: `Optional[str]` +- **vendorid_ep_vf**: `Optional[str]` +- **devid_ep**: `Optional[str]` +- **devid_ep_vf**: `Optional[str]` +- **sku_name**: `Optional[str]` +- **expected_xgmi_speed**: `Optional[list[float]]` +- **analysis_range_start**: `Optional[datetime.datetime]` +- **analysis_range_end**: `Optional[datetime.datetime]` + ## Analyzer Args Class BiosAnalyzerArgs **Bases**: ['AnalyzerArgs'] @@ -835,7 +1309,7 @@ Check sysctl matches expected sysctl details ### Annotations / fields - **exp_bios_version**: `list[str]` -- **regex_match**: `` +- **regex_match**: `bool` ## Analyzer Args Class CmdlineAnalyzerArgs @@ -845,8 +1319,20 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **required_cmdline**: `str | list` -- **banned_cmdline**: `str | list` +- **required_cmdline**: `Union[str, list]` +- **banned_cmdline**: `Union[str, list]` + +## Analyzer Args Class DeviceEnumerationAnalyzerArgs + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/device_enumeration/analyzer_args.py) + +### Annotations / fields + +- **cpu_count**: `Optional[list[int]]` +- **gpu_count**: `Optional[list[int]]` +- **vf_count**: `Optional[list[int]]` ## Analyzer Args Class DkmsAnalyzerArgs @@ -856,9 
+1342,9 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **dkms_status**: `str | list` -- **dkms_version**: `str | list` -- **regex_match**: `` +- **dkms_status**: `Union[str, list]` +- **dkms_version**: `Union[str, list]` +- **regex_match**: `bool` ## Analyzer Args Class KernelAnalyzerArgs @@ -868,8 +1354,8 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **exp_kernel**: `str | list` -- **regex_match**: `` +- **exp_kernel**: `Union[str, list]` +- **regex_match**: `bool` ## Analyzer Args Class KernelModuleAnalyzerArgs @@ -882,6 +1368,17 @@ Check sysctl matches expected sysctl details - **kernel_modules**: `dict[str, dict]` - **regex_filter**: `list[str]` +## Analyzer Args Class MemoryAnalyzerArgs + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/memory/analyzer_args.py) + +### Annotations / fields + +- **ratio**: `float` +- **memory_threshold**: `str` + ## Analyzer Args Class OsAnalyzerArgs **Bases**: ['AnalyzerArgs'] @@ -890,8 +1387,8 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **exp_os**: `str | list` -- **exact_match**: `` +- **exp_os**: `Union[str, list]` +- **exact_match**: `bool` ## Analyzer Args Class PackageAnalyzerArgs @@ -901,8 +1398,30 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **exp_package_ver**: `dict[str, str | None]` -- **regex_match**: `` +- **exp_package_ver**: `Dict[str, Optional[str]]` +- **regex_match**: `bool` +- **rocm_regex**: `Optional[str]` +- **enable_rocm_regex**: `bool` + +## Analyzer Args Class PcieAnalyzerArgs + +### Description + +Arguments for PCIe analyzer + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/pcie/analyzer_args.py) + +### Annotations / fields + +- **exp_speed**: `int` +- **exp_width**: `int` +- **exp_sriov_count**: 
`int` +- **exp_gpu_count_override**: `Optional[int]` +- **exp_max_payload_size**: `Union[Dict[int, int], int, NoneType]` +- **exp_max_rd_req_size**: `Union[Dict[int, int], int, NoneType]` +- **exp_ten_bit_tag_req_en**: `Union[Dict[int, int], int, NoneType]` ## Analyzer Args Class ProcessAnalyzerArgs @@ -912,18 +1431,19 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **max_kfd_processes**: `` -- **max_cpu_usage**: `` +- **max_kfd_processes**: `int` +- **max_cpu_usage**: `float` ## Analyzer Args Class RocmAnalyzerArgs -**Bases**: ['BaseModel'] +**Bases**: ['AnalyzerArgs'] **Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/rocm/analyzer_args.py) ### Annotations / fields -- **exp_rocm**: `str | list` +- **exp_rocm**: `Union[str, list]` +- **exp_rocm_latest**: `str` ## Analyzer Args Class SysctlAnalyzerArgs @@ -933,14 +1453,14 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **exp_vm_swappiness**: `typing.Optional[int]` -- **exp_vm_numa_balancing**: `typing.Optional[int]` -- **exp_vm_oom_kill_allocating_task**: `typing.Optional[int]` -- **exp_vm_compaction_proactiveness**: `typing.Optional[int]` -- **exp_vm_compact_unevictable_allowed**: `typing.Optional[int]` -- **exp_vm_extfrag_threshold**: `typing.Optional[int]` -- **exp_vm_zone_reclaim_mode**: `typing.Optional[int]` -- **exp_vm_dirty_background_ratio**: `typing.Optional[int]` -- **exp_vm_dirty_ratio**: `typing.Optional[int]` -- **exp_vm_dirty_writeback_centisecs**: `typing.Optional[int]` -- **exp_kernel_numa_balancing**: `typing.Optional[int]` +- **exp_vm_swappiness**: `Optional[int]` +- **exp_vm_numa_balancing**: `Optional[int]` +- **exp_vm_oom_kill_allocating_task**: `Optional[int]` +- **exp_vm_compaction_proactiveness**: `Optional[int]` +- **exp_vm_compact_unevictable_allowed**: `Optional[int]` +- **exp_vm_extfrag_threshold**: `Optional[int]` +- **exp_vm_zone_reclaim_mode**: `Optional[int]` +- 
**exp_vm_dirty_background_ratio**: `Optional[int]` +- **exp_vm_dirty_ratio**: `Optional[int]` +- **exp_vm_dirty_writeback_centisecs**: `Optional[int]` +- **exp_kernel_numa_balancing**: `Optional[int]` diff --git a/docs/generate_plugin_doc_bundle.py b/docs/generate_plugin_doc_bundle.py index d6d16cc0..43268e2a 100644 --- a/docs/generate_plugin_doc_bundle.py +++ b/docs/generate_plugin_doc_bundle.py @@ -36,7 +36,7 @@ import pkgutil import sys from pathlib import Path -from typing import Any, Iterable, List, Type +from typing import Any, Iterable, List, Optional, Type LINK_BASE_DEFAULT = "https://github.com/amd/node-scraper/blob/HEAD/" REL_ROOT_DEFAULT = "nodescraper/plugins/inband" @@ -50,7 +50,7 @@ def get_attr(obj: Any, name: str, default: Any = None) -> Any: return default -def _slice_from_rel_root(p: Path, rel_root: str | None) -> str | None: +def _slice_from_rel_root(p: Path, rel_root: Optional[str]) -> Optional[str]: if not rel_root: return None parts = list(p.parts) @@ -63,7 +63,7 @@ def _slice_from_rel_root(p: Path, rel_root: str | None) -> str | None: return None -def setup_link(class_data, link_base: str, rel_root: str | None) -> str: +def setup_link(class_data, link_base: str, rel_root: Optional[str]) -> str: try: file_location = Path(inspect.getfile(class_data)).resolve() except Exception: @@ -80,7 +80,7 @@ def setup_link(class_data, link_base: str, rel_root: str | None) -> str: return base + rel_path -def get_own_doc(cls: type) -> str | None: +def get_own_doc(cls: type) -> Optional[str]: """ Return only the __doc__ defined in the class itself, ignore inheritance. 
""" @@ -224,6 +224,57 @@ def add_cmd(s: Any): return cmds +def extract_regexes_and_args_from_analyzer( + analyzer_cls: type, args_cls: Optional[type] +) -> List[str]: + """Extract regex patterns and analyzer args from analyzer class""" + if not inspect.isclass(analyzer_cls): + return [] + + output: List[str] = [] + + # Check for ERROR_REGEX class variable (used by RegexAnalyzer subclasses like DmesgAnalyzer) + error_regex = get_attr(analyzer_cls, "ERROR_REGEX", None) + if error_regex and isinstance(error_regex, list): + output.append("**Built-in Regexes:**") + for item in error_regex: + # ErrorRegex objects have regex, message, event_category attributes + if hasattr(item, "regex"): + pattern = getattr(item.regex, "pattern", None) + message = getattr(item, "message", "") + if pattern: + # Truncate long patterns + pattern_str = pattern if len(pattern) < 50 else pattern[:47] + "..." + output.append(f"- {message}: `{pattern_str}`") + elif hasattr(item, "pattern"): + pattern_str = item.pattern if len(item.pattern) < 50 else item.pattern[:47] + "..." 
+ output.append(f"- `{pattern_str}`") + + # Check for other regex-related attributes + for attr in dir(analyzer_cls): + if "REGEX" in attr.upper() and not attr.startswith("_"): + val = get_attr(analyzer_cls, attr, default=None) + if val is None or attr == "ERROR_REGEX": + continue + + if hasattr(val, "pattern"): + output.append(f"**{attr}**: `{val.pattern}`") + elif isinstance(val, str): + output.append(f"**{attr}**: `{val}`") + + # Extract analyzer args if provided + if inspect.isclass(args_cls): + anns = get_attr(args_cls, "__annotations__", {}) or {} + if anns: + output.append("**Analyzer Args:**") + for key, value in anns.items(): + # Format the type annotation + type_str = format_type_annotation(value) + output.append(f"- `{key}`: {type_str}") + + return output + + def md_header(text: str, level: int = 2) -> str: return f"{'#' * level} {text}\n\n" @@ -243,9 +294,23 @@ def bases_list(cls: type) -> List[str]: return [] +def format_type_annotation(type_ann: Any) -> str: + """ + Format a type annotation for documentation, removing non-deterministic content like function memory addresses. 
+ """ + import re + + type_str = str(type_ann) + type_str = type_str.replace("typing.", "") + type_str = re.sub(r"", r"\1", type_str) + type_str = re.sub(r"", r"\1", type_str) + type_str = re.sub(r"", r"\1", type_str) + return type_str + + def annotations_for_model(model_cls: type) -> List[str]: anns = get_attr(model_cls, "__annotations__", {}) or {} - return [f"**{k}**: `{v}`" for k, v in anns.items()] + return [f"**{k}**: `{format_type_annotation(v)}`" for k, v in anns.items()] def class_vars_dump(cls: type, exclude: set) -> List[str]: @@ -257,7 +322,20 @@ def class_vars_dump(cls: type, exclude: set) -> List[str]: continue if callable(val) or isinstance(val, (staticmethod, classmethod, property)): continue - out.append(f"**{name}**: `{val}`") + + # Format list values with each item on a new line + if isinstance(val, list) and len(val) > 0: + val_str = str(val) + if len(val_str) > 200: + formatted_items = [] + for item in val: + formatted_items.append(f" {item}") + formatted_list = "[\n" + ",\n".join(formatted_items) + "\n]" + out.append(f"**{name}**: `{formatted_list}`") + else: + out.append(f"**{name}**: `{val}`") + else: + out.append(f"**{name}**: `{val}`") return out @@ -279,14 +357,20 @@ def generate_plugin_table_rows(plugins: List[type]) -> List[List[str]]: seen.add(key) uniq.append(c) cmds = uniq + + # Extract regexes and args from analyzer + regex_and_args = [] + if inspect.isclass(an): + regex_and_args = extract_regexes_and_args_from_analyzer(an, args) + rows.append( [ - f"{p.__module__}.{p.__name__}", + p.__name__, + "
".join(cmds).replace("|", "\\|") if cmds else "-", + "
".join(regex_and_args).replace("|", "\\|") if regex_and_args else "-", link_anchor(dm, "model") if inspect.isclass(dm) else "-", link_anchor(col, "collector") if inspect.isclass(col) else "-", link_anchor(an, "analyzer") if inspect.isclass(an) else "-", - link_anchor(args, "args") if inspect.isclass(args) else "-", - "
".join(cmds) if cmds else "-", ] ) return rows @@ -302,7 +386,7 @@ def render_table(headers: List[str], rows: List[List[str]]) -> str: return "".join(out) -def render_collector_section(col: type, link_base: str, rel_root: str | None) -> str: +def render_collector_section(col: type, link_base: str, rel_root: Optional[str]) -> str: hdr = md_header(f"Collector Class {col.__name__}", 2) desc = sanitize_doc(get_own_doc(col) or "") s = hdr @@ -335,7 +419,7 @@ def render_collector_section(col: type, link_base: str, rel_root: str | None) -> return s -def render_analyzer_section(an: type, link_base: str, rel_root: str | None) -> str: +def render_analyzer_section(an: type, link_base: str, rel_root: Optional[str]) -> str: hdr = md_header(f"Data Analyzer Class {an.__name__}", 2) desc = sanitize_doc(get_own_doc(an) or "") s = hdr @@ -350,10 +434,18 @@ def render_analyzer_section(an: type, link_base: str, rel_root: str | None) -> s if cv: s += md_header("Class Variables", 3) + md_list(cv) + # Add regex patterns if present (pass None for args_cls since we don't have context here) + regex_info = extract_regexes_and_args_from_analyzer(an, None) + if regex_info: + s += md_header("Regex Patterns", 3) + if len(regex_info) > 10: + s += f"*{len(regex_info)} items defined*\n\n" + s += md_list(regex_info) + return s -def render_model_section(model: type, link_base: str, rel_root: str | None) -> str: +def render_model_section(model: type, link_base: str, rel_root: Optional[str]) -> str: hdr = md_header(f"{model.__name__} Model", 2) desc = sanitize_doc(get_own_doc(model) or "") s = hdr @@ -368,7 +460,7 @@ def render_model_section(model: type, link_base: str, rel_root: str | None) -> s return s -def render_analyzer_args_section(args_cls: type, link_base: str, rel_root: str | None) -> str: +def render_analyzer_args_section(args_cls: type, link_base: str, rel_root: Optional[str]) -> str: hdr = md_header(f"Analyzer Args Class {args_cls.__name__}", 2) desc = sanitize_doc(get_own_doc(args_cls) or 
"") s = hdr @@ -380,7 +472,7 @@ def render_analyzer_args_section(args_cls: type, link_base: str, rel_root: str | anns = get_attr(args_cls, "__annotations__", {}) or {} if anns: - ann_items = [f"**{k}**: `{v}`" for k, v in anns.items()] + ann_items = [f"**{k}**: `{format_type_annotation(v)}`" for k, v in anns.items()] s += md_header("Annotations / fields", 3) + md_list(ann_items) return s @@ -418,7 +510,14 @@ def all_subclasses(cls: Type) -> set[type]: plugins.sort(key=lambda c: f"{c.__module__}.{c.__name__}".lower()) rows = generate_plugin_table_rows(plugins) - headers = ["Plugin", "DataModel", "Collector", "Analyzer", "AnalyzerArgs", "Cmd(s)"] + headers = [ + "Plugin", + "Collection", + "Analysis", + "DataModel", + "Collector", + "Analyzer", + ] collectors, analyzers, models, args_classes = [], [], [], [] seen_c, seen_a, seen_m, seen_args = set(), set(), set(), set() diff --git a/docs/node-scraper-external/ext_nodescraper_plugins/sample/__init__.py b/docs/node-scraper-external/ext_nodescraper_plugins/sample/__init__.py index e69de29b..fdb6d9c8 100644 --- a/docs/node-scraper-external/ext_nodescraper_plugins/sample/__init__.py +++ b/docs/node-scraper-external/ext_nodescraper_plugins/sample/__init__.py @@ -0,0 +1,25 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### diff --git a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_analyzer.py b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_analyzer.py index f578f3ad..049701cd 100644 --- a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_analyzer.py +++ b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_analyzer.py @@ -1,3 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus from nodescraper.interfaces import DataAnalyzer from nodescraper.models import TaskResult diff --git a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_collector.py b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_collector.py index 3a860ab9..83a6da5c 100644 --- a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_collector.py +++ b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_collector.py @@ -1,3 +1,31 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + +from typing import Optional + from nodescraper.base import InBandDataCollector from nodescraper.enums import ExecutionStatus from nodescraper.models import TaskResult @@ -9,7 +37,7 @@ class SampleCollector(InBandDataCollector[SampleDataModel, None]): DATA_MODEL = SampleDataModel - def collect_data(self, args=None) -> tuple[TaskResult, SampleDataModel | None]: + def collect_data(self, args=None) -> tuple[TaskResult, Optional[SampleDataModel]]: sample_data = SampleDataModel(some_str="example123") self.result.message = "Collector ran successfully" self.result.status = ExecutionStatus.OK diff --git a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_data.py b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_data.py index 85f650de..f58fb4f2 100644 --- a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_data.py +++ b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_data.py @@ -1,3 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + from nodescraper.models import DataModel diff --git a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_plugin.py b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_plugin.py index b30a7648..76efa467 100644 --- a/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_plugin.py +++ b/docs/node-scraper-external/ext_nodescraper_plugins/sample/sample_plugin.py @@ -1,3 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + from nodescraper.base import InBandDataPlugin from .sample_analyzer import SampleAnalyzer diff --git a/nodescraper/__init__.py b/nodescraper/__init__.py index fdb6d9c8..cf977e6d 100644 --- a/nodescraper/__init__.py +++ b/nodescraper/__init__.py @@ -23,3 +23,10 @@ # SOFTWARE. 
# ############################################################################### + +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("node-scraper") +except PackageNotFoundError: + __version__ = "unknown" diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index c5e7abeb..81c18f19 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -32,6 +32,7 @@ import sys from typing import Optional +import nodescraper from nodescraper.cli.constants import DEFAULT_CONFIG, META_VAR_MAP from nodescraper.cli.dynamicparserbuilder import DynamicParserBuilder from nodescraper.cli.helper import ( @@ -79,6 +80,12 @@ def build_parser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parser.add_argument( + "--version", + action="version", + version=f"%(prog)s {nodescraper.__version__}", + ) + parser.add_argument( "--sys-name", default=platform.node(), help="System name", metavar=META_VAR_MAP[str] ) @@ -327,6 +334,7 @@ def process_args( plugin_arg_index = -1 plugin_arg_map = {} + invalid_plugins = [] if plugin_arg_index != -1 and plugin_arg_index != len(raw_arg_input) - 1: top_level_args = raw_arg_input[: plugin_arg_index + 1] plugin_args = raw_arg_input[plugin_arg_index + 1 :] @@ -337,12 +345,26 @@ def process_args( else: cur_plugin = None for arg in plugin_args: - if arg in plugin_names: + # Handle comma-separated plugin names (but not arguments) + if not arg.startswith("-") and "," in arg: + # Split comma-separated plugin names + for potential_plugin in arg.split(","): + potential_plugin = potential_plugin.strip() + if potential_plugin in plugin_names: + plugin_arg_map[potential_plugin] = [] + cur_plugin = potential_plugin + elif potential_plugin: + # Track invalid plugin names to log event later + invalid_plugins.append(potential_plugin) + elif arg in plugin_names: plugin_arg_map[arg] = [] cur_plugin = arg elif cur_plugin: plugin_arg_map[cur_plugin].append(arg) - return (top_level_args, plugin_arg_map) + 
elif not arg.startswith("-"): + # Track invalid plugin names to log event later + invalid_plugins.append(arg) + return (top_level_args, plugin_arg_map, invalid_plugins) def main(arg_input: Optional[list[str]] = None): @@ -360,7 +382,9 @@ def main(arg_input: Optional[list[str]] = None): parser, plugin_subparser_map = build_parser(plugin_reg, config_reg) try: - top_level_args, plugin_arg_map = process_args(arg_input, list(plugin_subparser_map.keys())) + top_level_args, plugin_arg_map, invalid_plugins = process_args( + arg_input, list(plugin_subparser_map.keys()) + ) parsed_args = parser.parse_args(top_level_args) system_info = get_system_info(parsed_args) @@ -380,6 +404,13 @@ def main(arg_input: Optional[list[str]] = None): if log_path: logger.info("Log path: %s", log_path) + # Log warning if invalid plugin names were provided + if invalid_plugins: + logger.warning( + "Invalid plugin name(s) ignored: %s. Use 'describe plugin' to list available plugins.", + ", ".join(invalid_plugins), + ) + if parsed_args.subcmd == "summary": generate_summary(parsed_args.search_path, parsed_args.output_path, logger) sys.exit(0) diff --git a/nodescraper/enums/eventcategory.py b/nodescraper/enums/eventcategory.py index a7e52d88..553119a8 100644 --- a/nodescraper/enums/eventcategory.py +++ b/nodescraper/enums/eventcategory.py @@ -63,6 +63,8 @@ class EventCategory(AutoNameStrEnum): SBIOS/VBIOS/IFWI Errors - INFRASTRUCTURE Network, IT issues, Downtime + - NETWORK + Network configuration, interfaces, routing, neighbors, ethtool data - RUNTIME Framework issues, does not include content failures - UNKNOWN @@ -82,5 +84,6 @@ class EventCategory(AutoNameStrEnum): SW_DRIVER = auto() BIOS = auto() INFRASTRUCTURE = auto() + NETWORK = auto() RUNTIME = auto() UNKNOWN = auto() diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py index effd3029..16d1a70b 100644 --- a/nodescraper/interfaces/task.py +++ b/nodescraper/interfaces/task.py @@ -107,6 +107,8 @@ def _build_event( data = 
{"task_name": self.__class__.__name__, "task_type": self.TASK_TYPE} else: + # Copy to avoid mutating the caller's dict + data = copy.copy(data) data["task_name"] = self.__class__.__name__ data["task_type"] = self.TASK_TYPE diff --git a/nodescraper/models/analyzerargs.py b/nodescraper/models/analyzerargs.py index 209f6a0e..f1782801 100644 --- a/nodescraper/models/analyzerargs.py +++ b/nodescraper/models/analyzerargs.py @@ -23,12 +23,61 @@ # SOFTWARE. # ############################################################################### -from pydantic import BaseModel +from typing import Any + +from pydantic import BaseModel, model_validator class AnalyzerArgs(BaseModel): + """Base class for all analyzer arguments. + + This class provides automatic string stripping for all string values + in analyzer args. All analyzer args classes should inherit from this + directly. + + """ + model_config = {"extra": "forbid", "exclude_none": True} + @model_validator(mode="before") + @classmethod + def strip_string_values(cls, data: Any) -> Any: + """Strip whitespace from all string values in analyzer args. + + This validator recursively processes: + - String values: strips whitespace + - Lists: strips strings in lists + - Dicts: strips string values in dicts + - Other types: left unchanged + + Args: + data: The input data to validate + + Returns: + The data with all string values stripped + """ + if isinstance(data, dict): + return {k: cls._strip_value(v) for k, v in data.items()} + return data + + @classmethod + def _strip_value(cls, value: Any) -> Any: + """Recursively strip string values. 
+ + Args: + value: The value to process + + Returns: + The processed value + """ + if isinstance(value, str): + return value.strip() + elif isinstance(value, list): + return [cls._strip_value(item) for item in value] + elif isinstance(value, dict): + return {k: cls._strip_value(v) for k, v in value.items()} + return value + @classmethod def build_from_model(cls, datamodel): """Build analyzer args instance from data model object diff --git a/nodescraper/models/systeminfo.py b/nodescraper/models/systeminfo.py index d2be4ae4..e82d6212 100644 --- a/nodescraper/models/systeminfo.py +++ b/nodescraper/models/systeminfo.py @@ -41,3 +41,4 @@ class SystemInfo(BaseModel): platform: Optional[str] = None metadata: Optional[dict] = Field(default_factory=dict) location: Optional[SystemLocation] = SystemLocation.LOCAL + vendorid_ep: int = 0x1002 diff --git a/nodescraper/models/taskresult.py b/nodescraper/models/taskresult.py index f4dbb251..dd5990ef 100644 --- a/nodescraper/models/taskresult.py +++ b/nodescraper/models/taskresult.py @@ -103,28 +103,40 @@ def duration(self) -> Optional[str]: return duration def _get_event_summary(self) -> str: - """Get summary string for artifacts + """Get summary string for events Returns: - str: artifact summary + str: event summary with counts and descriptions """ - error_count = 0 - warning_count = 0 + error_msg_counts: dict[str, int] = {} + warning_msg_counts: dict[str, int] = {} for event in self.events: if event.priority == EventPriority.WARNING: - warning_count += 1 + warning_msg_counts[event.description] = ( + warning_msg_counts.get(event.description, 0) + 1 + ) elif event.priority >= EventPriority.ERROR: - error_count += 1 - - summary_list = [] - - if warning_count: - summary_list.append(f"{warning_count} warnings") - if error_count: - summary_list.append(f"{error_count} errors") - - return "|".join(summary_list) + error_msg_counts[event.description] = error_msg_counts.get(event.description, 0) + 1 + + summary_parts = [] + + if 
warning_msg_counts: + total_warnings = sum(warning_msg_counts.values()) + warning_details = [ + f"{msg} (x{count})" if count > 1 else msg + for msg, count in warning_msg_counts.items() + ] + summary_parts.append(f"{total_warnings} warnings: {', '.join(warning_details)}") + + if error_msg_counts: + total_errors = sum(error_msg_counts.values()) + error_details = [ + f"{msg} (x{count})" if count > 1 else msg for msg, count in error_msg_counts.items() + ] + summary_parts.append(f"{total_errors} errors: {', '.join(error_details)}") + + return "; ".join(summary_parts) def _update_status(self) -> None: """Update overall status based on event priority""" diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index e47a6cc8..d03010c6 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -173,6 +173,12 @@ def run_queue(self) -> list[PluginResult]: global_run_args = self.apply_global_args_to_plugin( plugin_inst, plugin_class, self.plugin_config.global_args ) + # Merge analysis_args and collection_args + for args_key in ["analysis_args", "collection_args"]: + if args_key in global_run_args and args_key in run_payload: + # Merge: global args override plugin-specific args keys specified in both global and plugin-specific args + run_payload[args_key].update(global_run_args[args_key]) + del global_run_args[args_key] run_payload.update(global_run_args) except ValueError as ve: self.logger.error( diff --git a/nodescraper/plugins/inband/amdsmi/__init__.py b/nodescraper/plugins/inband/amdsmi/__init__.py new file mode 100644 index 00000000..f117a9fd --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/__init__.py @@ -0,0 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .amdsmi_plugin import AmdSmiPlugin + +__all__ = ["AmdSmiPlugin"] diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py new file mode 100644 index 00000000..085f022f --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -0,0 +1,821 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import io +from collections import defaultdict +from typing import Any, Optional, Union + +from nodescraper.enums import EventCategory, EventPriority +from nodescraper.interfaces import DataAnalyzer +from nodescraper.models import TaskResult + +from .amdsmidata import ( + AmdSmiDataModel, + AmdSmiMetric, + AmdSmiStatic, + AmdSmiTstData, + EccData, + Fw, + Partition, + Processes, + XgmiMetrics, +) +from .analyzer_args import AmdSmiAnalyzerArgs +from .cper import CperAnalysisTaskMixin + + +class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]): + """Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics""" + + DATA_MODEL = AmdSmiDataModel + + def check_expected_max_power( + self, + amdsmi_static_data: list[AmdSmiStatic], + expected_max_power: int, + ): + """Check against expected max power + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model + expected_max_power (int): expected max power + """ + incorrect_max_power_gpus: dict[int, Union[int, str, float]] = {} + for gpu in amdsmi_static_data: + if gpu.limit is None or gpu.limit.max_power is None: + self._log_event( + category=EventCategory.PLATFORM, + description=f"GPU: {gpu.gpu} has no max power limit set", + priority=EventPriority.WARNING, + data={"gpu": gpu.gpu}, + ) + continue + max_power_value = gpu.limit.max_power.value + try: + max_power_float = float(max_power_value) + except ValueError: + self._log_event( + category=EventCategory.PLATFORM, + description=f"GPU: {gpu.gpu} has an invalid max power limit", + priority=EventPriority.ERROR, + data={ + "gpu": gpu.gpu, + "max_power_value": max_power_value, + }, + ) + continue + if max_power_float != expected_max_power: + incorrect_max_power_gpus[gpu.gpu] = max_power_float + if incorrect_max_power_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Max power mismatch", + 
priority=EventPriority.ERROR, + data={ + "gpus": list(incorrect_max_power_gpus.keys()), + "max_power_values": incorrect_max_power_gpus, + "expected_max_power": expected_max_power, + }, + ) + + def check_expected_driver_version( + self, + amdsmi_static_data: list[AmdSmiStatic], + expected_driver_version: str, + ) -> None: + """Check expectecd driver version + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model + expected_driver_version (str): expected driver version + """ + bad_driver_gpus: list[int] = [] + + versions_by_gpu: dict[int, Optional[str]] = {} + for gpu in amdsmi_static_data: + ver: Optional[str] = None + if gpu.driver is not None: + ver = gpu.driver.version + versions_by_gpu[gpu.gpu] = ver + if ver != expected_driver_version: + bad_driver_gpus.append(gpu.gpu) + + if bad_driver_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Driver Version Mismatch", + priority=EventPriority.ERROR, + data={ + "gpus": bad_driver_gpus, + "driver_version": {g: versions_by_gpu[g] for g in bad_driver_gpus}, + "expected_driver_version": expected_driver_version, + }, + ) + + def check_amdsmi_metric_pcie( + self, + amdsmi_metric_data: list[AmdSmiMetric], + l0_to_recovery_count_error_threshold: int, + l0_to_recovery_count_warning_threshold: int, + ): + """Check PCIe metrics for link errors + + Checks for PCIe link width, speed, replays, recoveries, and NAKs. + Expected width/speeds should come from SKU info. 
+ + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + l0_to_recovery_count_error_threshold (int): Threshold for error events + l0_to_recovery_count_warning_threshold (int): Threshold for warning events + """ + for metric in amdsmi_metric_data: + pcie_data = metric.pcie + gpu = metric.gpu + + if pcie_data.width is not None and pcie_data.width != 16: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} PCIe width is not x16", + priority=EventPriority.ERROR, + data={"gpu": gpu, "pcie_width": pcie_data.width, "expected": 16}, + console_log=True, + ) + + if pcie_data.speed is not None and pcie_data.speed.value is not None: + try: + speed_val = float(pcie_data.speed.value) + if speed_val != 32.0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} PCIe link speed is not Gen5 (32 GT/s)", + priority=EventPriority.ERROR, + data={"gpu": gpu, "pcie_speed": speed_val, "expected": 32.0}, + console_log=True, + ) + except (ValueError, TypeError): + pass + + if pcie_data.replay_count is not None and pcie_data.replay_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}", + priority=EventPriority.WARNING, + data={"gpu": gpu, "replay_count": pcie_data.replay_count}, + console_log=True, + ) + + if ( + pcie_data.replay_roll_over_count is not None + and pcie_data.replay_roll_over_count > 0 + ): + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}", + priority=EventPriority.WARNING, + data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count}, + console_log=True, + ) + + if pcie_data.l0_to_recovery_count is not None: + if pcie_data.l0_to_recovery_count > l0_to_recovery_count_error_threshold: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries", + 
priority=EventPriority.ERROR, + data={ + "gpu": gpu, + "l0_to_recovery_count": pcie_data.l0_to_recovery_count, + "error_threshold": l0_to_recovery_count_error_threshold, + }, + console_log=True, + ) + elif pcie_data.l0_to_recovery_count > l0_to_recovery_count_warning_threshold: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "l0_to_recovery_count": pcie_data.l0_to_recovery_count, + "warning_threshold": l0_to_recovery_count_warning_threshold, + }, + console_log=True, + ) + + if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs", + priority=EventPriority.WARNING, + data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count}, + console_log=True, + ) + + if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs", + priority=EventPriority.WARNING, + data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count}, + console_log=True, + ) + + def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]): + """Check ECC totals for all GPUs + + Raises errors for uncorrectable errors, warnings for correctable and deferred. 
+ + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + """ + for metric in amdsmi_metric_data: + ecc_totals = metric.ecc + gpu = metric.gpu + + ecc_checks: list[tuple[EventPriority, Optional[int], str]] = [ + ( + EventPriority.WARNING, + ecc_totals.total_correctable_count, + "Total correctable ECC errors", + ), + ( + EventPriority.ERROR, + ecc_totals.total_uncorrectable_count, + "Total uncorrectable ECC errors", + ), + ( + EventPriority.WARNING, + ecc_totals.total_deferred_count, + "Total deferred ECC errors", + ), + ( + EventPriority.WARNING, + ecc_totals.cache_correctable_count, + "Cache correctable ECC errors", + ), + ( + EventPriority.ERROR, + ecc_totals.cache_uncorrectable_count, + "Cache uncorrectable ECC errors", + ), + ] + + for priority, count, desc in ecc_checks: + if count is not None and count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has {desc}: {count}", + priority=priority, + data={"gpu": gpu, "error_count": count, "error_type": desc}, + console_log=True, + ) + + def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]): + """Check ECC counts in all blocks for all GPUs + + Raises errors for uncorrectable errors, warnings for correctable and deferred. 
+ + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + """ + for metric in amdsmi_metric_data: + gpu = metric.gpu + ecc_blocks = metric.ecc_blocks + + # Skip if ecc_blocks is a string (e.g., "N/A") or empty + if isinstance(ecc_blocks, str) or not ecc_blocks: + continue + + for block_name, ecc_data in ecc_blocks.items(): + if not isinstance(ecc_data, EccData): + continue + + if ecc_data.correctable_count is not None and ecc_data.correctable_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has correctable ECC errors in block {block_name}", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "block": block_name, + "correctable_count": ecc_data.correctable_count, + }, + console_log=True, + ) + + if ecc_data.uncorrectable_count is not None and ecc_data.uncorrectable_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has uncorrectable ECC errors in block {block_name}", + priority=EventPriority.ERROR, + data={ + "gpu": gpu, + "block": block_name, + "uncorrectable_count": ecc_data.uncorrectable_count, + }, + console_log=True, + ) + + if ecc_data.deferred_count is not None and ecc_data.deferred_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has deferred ECC errors in block {block_name}", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "block": block_name, + "deferred_count": ecc_data.deferred_count, + }, + console_log=True, + ) + + def expected_gpu_processes( + self, processes_data: Optional[list[Processes]], max_num_processes: int + ): + """Check the number of GPU processes running + + Args: + processes_data (Optional[list[Processes]]): list of processes per GPU + max_num_processes (int): max number of expected processes + """ + gpu_exceeds_num_processes: dict[int, int] = {} + if processes_data is None or len(processes_data) == 0: + self._log_event( + category=EventCategory.PLATFORM, + description="No GPU processes data 
available", + priority=EventPriority.WARNING, + data={"processes_data": processes_data}, + console_log=True, + ) + return + for process in processes_data: + if len(process.process_list) == 0 or isinstance( + process.process_list[0].process_info, str + ): + # Skip if there are no processes + continue + + process_count = len(process.process_list) + if process_count > max_num_processes: + gpu_exceeds_num_processes[process.gpu] = process_count + + if gpu_exceeds_num_processes: + self._log_event( + category=EventCategory.PLATFORM, + description="Number of processes exceeds max processes", + priority=EventPriority.ERROR, + data={ + "gpu_exceeds_num_processes": gpu_exceeds_num_processes, + }, + console_log=True, + ) + + def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]): + """Check consistency of expected data + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model + """ + consistancy_data: dict[str, Union[set[str], set[int]]] = { + "market_name": {gpu.asic.market_name for gpu in amdsmi_static_data}, + "vendor_id": {gpu.asic.vendor_id for gpu in amdsmi_static_data}, + "vendor_name": {gpu.asic.vendor_name for gpu in amdsmi_static_data}, + "subvendor_id": {gpu.asic.subvendor_id for gpu in amdsmi_static_data}, + "subsystem_id": {gpu.asic.subsystem_id for gpu in amdsmi_static_data}, + "device_id": {gpu.asic.device_id for gpu in amdsmi_static_data}, + "rev_id": {gpu.asic.rev_id for gpu in amdsmi_static_data}, + "num_compute_units": {str(gpu.asic.num_compute_units) for gpu in amdsmi_static_data}, + "target_graphics_version": { + gpu.asic.target_graphics_version for gpu in amdsmi_static_data + }, + } + for key, value in consistancy_data.items(): + if len(value) > 1: + self._log_event( + category=EventCategory.PLATFORM, + description=f"{key} is not consistent across all GPUs", + priority=EventPriority.WARNING, + data={ + "field": key, + "non_consistent_values": value, + }, + ) + + def check_static_data( + self, + amdsmi_static_data: 
list[AmdSmiStatic], + vendor_id: Optional[str], + subvendor_id: Optional[str], + device_id: tuple[Optional[str], Optional[str]], + subsystem_id: tuple[Optional[str], Optional[str]], + sku_name: Optional[str], + ) -> None: + """Check expected static data + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data + vendor_id (Optional[str]): expected vendor_id + subvendor_id (Optional[str]): expected subvendor_id + device_id (tuple[Optional[str], Optional[str]]): expected device_id + subsystem_id (tuple[Optional[str], Optional[str]]): expected subsystem_id + sku_name (Optional[str]): expected sku_name + """ + + mismatches: list[tuple[int, str, str, str]] = [] + + expected_data: dict[str, Optional[str]] = { + "vendor_id": vendor_id, + "subvendor_id": subvendor_id, + "vendor_name": "Advanced Micro Devices Inc", + "market_name": sku_name, + } + + for gpu_data in amdsmi_static_data: + collected_data: dict[str, str] = { + "vendor_id": gpu_data.asic.vendor_id, + "subvendor_id": gpu_data.asic.subvendor_id, + "vendor_name": gpu_data.asic.vendor_name, + "market_name": gpu_data.asic.market_name, + } + + for key, expected in expected_data.items(): + if expected is None: + continue + actual = collected_data[key] + if expected not in actual: + mismatches.append((gpu_data.gpu, key, expected, actual)) + break + + if device_id[0] is not None and device_id[1] is not None: + dev_actual = gpu_data.asic.device_id + if ( + device_id[0].upper() not in dev_actual.upper() + and device_id[1].upper() not in dev_actual.upper() + ): + mismatches.append( + (gpu_data.gpu, "device_id", f"{device_id[0]}|{device_id[1]}", dev_actual) + ) + + if subsystem_id[0] is not None and subsystem_id[1] is not None: + subsys_actual = gpu_data.asic.subsystem_id + if ( + subsystem_id[0].upper() not in subsys_actual.upper() + and subsystem_id[1].upper() not in subsys_actual.upper() + ): + mismatches.append( + ( + gpu_data.gpu, + "subsystem_id", + f"{subsystem_id[0]}|{subsystem_id[1]}", + subsys_actual, 
+ ) + ) + + if mismatches: + payload = self._format_static_mismatch_payload(mismatches) + self._log_event( + category=EventCategory.PLATFORM, + description="amd-smi static data mismatch", + priority=EventPriority.ERROR, + data=payload, + ) + + def _format_static_mismatch_payload( + self, + mismatches: list[tuple[int, str, str, str]], + ) -> dict[str, Any]: + """Helper function for pretty printing mismatch in expected data + + Args: + mismatches (list[tuple[int, str, str, str]]): mismatched data per GPU + + Returns: + dict[str, Any]: dict of mismatched data per GPU + """ + per_gpu: dict[int, list[dict[str, str]]] = defaultdict(list) + field_set: set[str] = set() + + for gpu, field, expected, actual in mismatches: + field_set.add(field) + per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual}) + + per_gpu_list: list[dict[str, Any]] = [ + {"gpu": gpu, "mismatches": entries} + for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0]) + ] + + return { + "summary": { + "gpus_affected": len(per_gpu), + "fields": sorted(field_set), + "total_mismatches": sum(len(v) for v in per_gpu.values()), + }, + "per_gpu": per_gpu_list, + } + + def check_pldm_version( + self, + amdsmi_fw_data: Optional[list[Fw]], + expected_pldm_version: Optional[str], + ): + """Check expected pldm version + + Args: + amdsmi_fw_data (Optional[list[Fw]]): data model + expected_pldm_version (Optional[str]): expected pldm version + """ + PLDM_STRING = "PLDM_BUNDLE" + if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0: + self._log_event( + category=EventCategory.PLATFORM, + description="No AMD SMI firmware data available", + priority=EventPriority.WARNING, + data={"amdsmi_fw_data": amdsmi_fw_data}, + ) + return + mismatched_gpus: list[int] = [] + pldm_missing_gpus: list[int] = [] + for fw_data in amdsmi_fw_data: + gpu = fw_data.gpu + if isinstance(fw_data.fw_list, str): + pldm_missing_gpus.append(gpu) + continue + for fw_info in fw_data.fw_list: + if PLDM_STRING == 
fw_info.fw_id and expected_pldm_version != fw_info.fw_version: + mismatched_gpus.append(gpu) + if PLDM_STRING == fw_info.fw_id: + break + else: + pldm_missing_gpus.append(gpu) + + if mismatched_gpus or pldm_missing_gpus: + self._log_event( + category=EventCategory.FW, + description="PLDM Version Mismatch", + priority=EventPriority.ERROR, + data={ + "mismatched_gpus": mismatched_gpus, + "pldm_missing_gpus": pldm_missing_gpus, + "expected_pldm_version": expected_pldm_version, + }, + ) + + def check_expected_memory_partition_mode( + self, + partition_data: Optional[Partition], + expected_memory_partition_mode: Optional[str], + expected_compute_partition_mode: Optional[str], + ): + """Check expected mem partition mode + + Args: + partition_data (Optional[Partition]): data model + expected_memory_partition_mode (Optional[str]): expected mem partition mode + expected_compute_partition_mode (Optional[str]): expected compute partition mode + """ + if partition_data is None: + self._log_event( + category=EventCategory.PLATFORM, + description="No AMD SMI Partition data not available", + priority=EventPriority.WARNING, + ) + return + bad_memory_partition_mode_gpus = [] + for partition_current in partition_data.memory_partition: + if ( + expected_memory_partition_mode is not None + and partition_current.partition_type != expected_memory_partition_mode + ): + bad_memory_partition_mode_gpus.append( + { + "gpu_id": partition_current.gpu_id, + "memory_partition_mode": partition_current.partition_type, + } + ) + + for compute_current in partition_data.compute_partition: + if ( + expected_compute_partition_mode is not None + and compute_current.partition_type != expected_compute_partition_mode + ): + bad_memory_partition_mode_gpus.append( + { + "gpu_id": compute_current.gpu_id, + "compute_partition_mode": compute_current.partition_type, + } + ) + + if bad_memory_partition_mode_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Partition Mode Mismatch", + 
priority=EventPriority.ERROR, + data={ + "actual_partition_data": bad_memory_partition_mode_gpus, + "expected_memory_partition_mode": expected_memory_partition_mode, + "expected_compute_partition_mode": expected_compute_partition_mode, + }, + ) + + def check_expected_xgmi_link_speed( + self, + xgmi_metric: Optional[list[XgmiMetrics]], + expected_xgmi_speed: Optional[list[float]] = None, + ): + """Check the XGMI link speed for all GPUs + + Args: + xgmi_metric (Optional[list[XgmiMetrics]]): XGMI metrics data + expected_xgmi_speed (Optional[list[float]]): List of expected XGMI speeds (GT/s) + """ + if xgmi_metric is None or len(xgmi_metric) == 0: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed data is not available and cannot be checked", + priority=EventPriority.WARNING, + data={"xgmi_metric": xgmi_metric}, + ) + return + + if expected_xgmi_speed is None or len(expected_xgmi_speed) == 0: + self._log_event( + category=EventCategory.IO, + description="Expected XGMI speed not configured, skipping XGMI link speed check", + priority=EventPriority.WARNING, + ) + return + + for xgmi_data in xgmi_metric: + link_metric = xgmi_data.link_metrics + try: + if link_metric.bit_rate is None or link_metric.bit_rate.value is None: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed is not available", + priority=EventPriority.ERROR, + data={ + "gpu": xgmi_data.gpu, + "xgmi_bit_rate": ( + link_metric.bit_rate.unit if link_metric.bit_rate else "N/A" + ), + }, + ) + continue + + xgmi_float = float(link_metric.bit_rate.value) + except ValueError: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed is not a valid number", + priority=EventPriority.ERROR, + data={ + "gpu": xgmi_data.gpu, + "xgmi_bit_rate": ( + link_metric.bit_rate.value if link_metric.bit_rate else "N/A" + ), + }, + ) + continue + + if xgmi_float not in expected_xgmi_speed: + self._log_event( + category=EventCategory.IO, + description="XGMI 
    def analyze_data(
        self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None
    ) -> TaskResult:
        """Analyze the amdsmi data against expected data.

        Each sub-check only runs when the corresponding data section was
        collected and (where applicable) the matching expectation is set in
        ``args``.

        Args:
            data (AmdSmiDataModel): the AmdSmi data model
            args (Optional[AmdSmiAnalyzerArgs], optional): optional AmdSmi analyzer args. Defaults to None.

        Returns:
            TaskResult: the result of the analysis indicating whether the AmdSmi data model
            matched the expected data
        """

        if args is None:
            args = AmdSmiAnalyzerArgs()

        if data.metric is not None and len(data.metric) > 0:
            # PCIe checks need at least the error threshold; the warning
            # threshold falls back to 1 when unset or zero
            if args.l0_to_recovery_count_error_threshold is not None:
                self.check_amdsmi_metric_pcie(
                    data.metric,
                    args.l0_to_recovery_count_error_threshold,
                    args.l0_to_recovery_count_warning_threshold or 1,
                )
            self.check_amdsmi_metric_ecc_totals(data.metric)
            self.check_amdsmi_metric_ecc(data.metric)

        if args.expected_gpu_processes:
            self.expected_gpu_processes(data.process, args.expected_gpu_processes)

        if data.static is None or len(data.static) == 0:
            self._log_event(
                category=EventCategory.PLATFORM,
                description="No AMD SMI static data available",
                priority=EventPriority.WARNING,
                data={"amdsmi_static_data": data.static},
            )
        else:
            if args.expected_max_power:
                self.check_expected_max_power(data.static, args.expected_max_power)
            if args.expected_driver_version:
                self.check_expected_driver_version(data.static, args.expected_driver_version)

            self.static_consistancy_check(data.static)
            # NOTE(review): the trailing "or args.check_static_data" makes the
            # preceding compound condition redundant -- this branch runs
            # whenever args.check_static_data is truthy; confirm whether the
            # SKU/devid guard was meant to gate it.
            if (
                self.system_info.sku
                and args.devid_ep
                and args.devid_ep_vf
                and args.vendorid_ep
                and args.check_static_data
            ) or args.check_static_data:
                # NOTE(review): vendorid_ep is passed for both vendor_id and
                # subvendor_id, and devid_ep fills both halves of the
                # device_id/subsystem_id tuples while devid_ep_vf (validated
                # above) is never used -- confirm devid_ep_vf was not intended
                # for the second tuple slots.
                self.check_static_data(
                    data.static,
                    args.vendorid_ep,
                    args.vendorid_ep,
                    (args.devid_ep, args.devid_ep),
                    (args.devid_ep, args.devid_ep),
                    sku_name=args.sku_name,
                )

        if args.expected_memory_partition_mode or args.expected_compute_partition_mode:
            self.check_expected_memory_partition_mode(
                data.partition,
                args.expected_memory_partition_mode,
                args.expected_compute_partition_mode,
            )

        if args.expected_pldm_version:
            self.check_pldm_version(data.firmware, args.expected_pldm_version)

        if data.cper_data:
            # CPER files are handed to the CPER analyzer as in-memory byte
            # streams keyed by file name
            self.analyzer_cpers(
                {
                    file_model_obj.file_name: io.BytesIO(file_model_obj.file_contents)
                    for file_model_obj in data.cper_data
                },
                analysis_range_start=args.analysis_range_start,
                analysis_range_end=args.analysis_range_end,
            )

        if data.xgmi_metric and len(data.xgmi_metric) > 0:
            self.check_expected_xgmi_link_speed(
                data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed
            )

        if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0:
            self.check_amdsmitst(data.amdsmitst_data)

        return self.result
    def _run_amd_smi(self, cmd: str) -> Optional[str]:
        """Run an amd-smi command on the SUT and return its stdout.

        Known-benign missing-group warnings are logged at WARNING priority and
        the matching warning lines are stripped from stdout; amd-smi internal
        Python errors and any other failure are logged and yield None.

        Args:
            cmd (str): command arguments to pass to amd-smi

        Returns:
            Optional[str]: stdout from command or None on error
        """
        cmd_ret = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}")

        # Check for known warnings and errors that can be handled.
        # The missing-groups warning may appear on either stream.
        is_group_warning = (
            "User is missing the following required groups" in cmd_ret.stderr
            or "User is missing the following required groups" in cmd_ret.stdout
        )

        # Check for known amd-smi internal errors (Python exception names the
        # tool itself leaks onto stderr)
        is_amdsmi_internal_error = any(
            pattern in cmd_ret.stderr for pattern in ["KeyError:", "AttributeError:", "IndexError:"]
        )

        # Anything on stderr or a non-zero exit triggers classification:
        # internal tool error -> warn and bail, unknown error -> error and
        # bail, missing-group warning -> log and continue with stdout
        if cmd_ret.stderr != "" or cmd_ret.exit_code != 0:
            if is_amdsmi_internal_error:
                self._log_event(
                    category=EventCategory.SW_DRIVER,
                    description="amd-smi internal error detected",
                    data={
                        "command": cmd,
                        "exit_code": cmd_ret.exit_code,
                        "stderr": cmd_ret.stderr,
                    },
                    priority=EventPriority.WARNING,
                    console_log=True,
                )
                return None
            elif not is_group_warning:
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Error running amd-smi command",
                    data={
                        "command": cmd,
                        "exit_code": cmd_ret.exit_code,
                        "stderr": cmd_ret.stderr,
                    },
                    priority=EventPriority.ERROR,
                    console_log=True,
                )
                return None
            else:
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="amd-smi warning (continuing): User missing required groups",
                    data={
                        "command": cmd,
                        "warning": cmd_ret.stderr or cmd_ret.stdout,
                    },
                    priority=EventPriority.WARNING,
                    console_log=False,
                )

        stdout = cmd_ret.stdout
        if is_group_warning and stdout:
            # Strip the warning boilerplate lines so callers (e.g. the JSON
            # parser) see only real command output
            lines = stdout.split("\n")
            cleaned_lines = [
                line
                for line in lines
                if not any(
                    warn in line
                    for warn in [
                        "RuntimeError:",
                        "WARNING: User is missing",
                        "Please add user to these groups",
                    ]
                )
            ]
            stdout = "\n".join(cleaned_lines).strip()

        return stdout
+ """Run amd-smi command with json output + + Args: + cmd (str): command arguments to pass to amd-smi + + Returns: + Optional[Union[dict, list[dict]]]: parsed JSON output or None on error + """ + cmd += " --json" + cmd_ret = self._run_amd_smi(cmd) + if cmd_ret: + try: + # Try to parse as single JSON first + return json.loads(cmd_ret) + except json.JSONDecodeError as e: + # try to extract and parse multiple JSON objects + try: + json_objects = [] + decoder = json.JSONDecoder() + idx = 0 + cmd_ret_stripped = cmd_ret.strip() + + while idx < len(cmd_ret_stripped): + while idx < len(cmd_ret_stripped) and cmd_ret_stripped[idx].isspace(): + idx += 1 + + if idx >= len(cmd_ret_stripped): + break + + if cmd_ret_stripped[idx] not in ["{", "["]: + break + + try: + obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx) + json_objects.append(obj) + idx = end_idx + except json.JSONDecodeError: + break + + if json_objects: + return json_objects if len(json_objects) > 1 else json_objects[0] + else: + raise + + except Exception: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Error parsing command: `{cmd}` json data", + data={ + "cmd": cmd, + "exception": get_exception_traceback(e), + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return None + return None + + def _to_number(self, v: object) -> Optional[Union[int, float]]: + """Helper function to return number from str, float or "N/A" + + Args: + v (object): non number object + + Returns: + Optional[Union[int, float]]: number version of input + """ + if v in (None, "", "N/A"): + return None + try: + if isinstance(v, (int, float)): + return v + if isinstance(v, str): + s = v.strip() + try: + return int(s) + except Exception: + return float(s) + return float(str(v)) + except Exception: + return None + + def _valueunit(self, v: object, unit: str, *, required: bool = False) -> Optional[ValueUnit]: + """Build ValueUnit instance from object + + Args: + v (object): object to be turned into ValueUnit 
+ unit (str): unit of measurement + required (bool, optional): bool to force instance creation. Defaults to False. + + Returns: + Optional[ValueUnit]: ValueUnit Instance + """ + n = self._to_number(v) + if n is None: + return ValueUnit(value=0, unit=unit) if required else None + return ValueUnit(value=n, unit=unit) + + def _valueunit_req(self, v: object, unit: str) -> ValueUnit: + """Helper function to force ValueUnit instance creation + + Args: + v (object): object + unit (str): unit of measurement + + Returns: + ValueUnit: instance of ValueUnit + """ + vu = self._valueunit(v, unit, required=True) + assert vu is not None + return vu + + def _normalize(self, val: object, default: str = "unknown", slot_type: bool = False) -> str: + """Normalize strings + + Args: + val (object): object + default (str, optional): default option. Defaults to "unknown". + slot_type (bool, optional): map to one of {'OAM','PCIE','CEM','Unknown'}. Defaults to False. + + Returns: + str: normalized string + """ + s = str(val).strip() if val is not None else "" + if not s or s.upper() == "N/A": + return "Unknown" if slot_type else default + + if slot_type: + u = s.upper().replace(" ", "").replace("-", "") + if u == "OAM": + return "OAM" + if u in {"PCIE", "PCIEXPRESS", "PCIEXP"} or u.startswith("PCIE"): + return "PCIE" + if u == "CEM": + return "CEM" + return "Unknown" + + return s + + def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: + """Fill in information for AmdSmi data model + + Returns: + Optional[AmdSmiDataModel]: instance of the AmdSmi data model + """ + try: + version = self._get_amdsmi_version() + processes = self.get_process() + partition = self.get_partition() + firmware = self.get_firmware() + gpu_list = self.get_gpu_list() + statics = self.get_static() + cper_data = self.get_cper_data() + except Exception as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amd-smi sub commands", + data={"exception": get_exception_traceback(e)}, + 
priority=EventPriority.ERROR, + console_log=True, + ) + self.result.status = ExecutionStatus.EXECUTION_FAILURE + return None + + try: + return AmdSmiDataModel( + version=version, + gpu_list=gpu_list, + process=processes, + partition=partition, + firmware=firmware, + static=statics, + cper_data=cper_data, + ) + except ValidationError as err: + self.logger.warning("Validation err: %s", err) + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiDataModel", + data={"errors": err.errors(include_url=False)}, + priority=EventPriority.ERROR, + ) + return None + + def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]: + """Get amdsmi version and data + + Returns: + Optional[AmdSmiVersion]: version information or None on error + """ + ret = self._run_amd_smi_dict(self.CMD_VERSION) + if not ret or not isinstance(ret, list) or len(ret) == 0: + return None + + version_data = ret[0] if isinstance(ret, list) else ret + if not isinstance(version_data, dict): + return None + + try: + return AmdSmiVersion( + tool="amdsmi", + version=version_data.get("amdsmi_library_version", ""), + amdsmi_library_version=version_data.get("amdsmi_library_version", ""), + rocm_version=version_data.get("rocm_version", ""), + ) + except ValidationError as err: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiVersion", + data={"errors": err.errors(include_url=False)}, + priority=EventPriority.WARNING, + ) + return None + + def get_gpu_list(self) -> Optional[list[AmdSmiListItem]]: + """Get GPU information from amd-smi list command + + Returns: + Optional[list[AmdSmiListItem]]: list of GPU info items + """ + ret = self._run_amd_smi_dict(self.CMD_LIST) + if not ret: + return [] + + gpu_data = ret if isinstance(ret, list) else [ret] + out: list[AmdSmiListItem] = [] + + def _to_int(x: Any, default: int = 0) -> int: + try: + return int(x) + except Exception: + return default + + for item in gpu_data: + if not 
isinstance(item, dict): + continue + + try: + out.append( + AmdSmiListItem( + gpu=_to_int(item.get("gpu", 0)), + bdf=str(item.get("bdf", "")), + uuid=str(item.get("uuid", "")), + kfd_id=_to_int(item.get("kfd_id", 0)), + node_id=_to_int(item.get("node_id", 0)), + partition_id=_to_int(item.get("partition_id", 0)), + ) + ) + except ValidationError as err: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiListItem", + data={"errors": err.errors(include_url=False), "item": item}, + priority=EventPriority.WARNING, + ) + + return out + + def get_process(self) -> Optional[list[Processes]]: + """Get process information + + Returns: + Optional[list[Processes]]: list of GPU processes + """ + ret = self._run_amd_smi_dict(self.CMD_PROCESS) + if not ret: + return [] + + process_data = ret if isinstance(ret, list) else [ret] + out: list[Processes] = [] + + for item in process_data: + if not isinstance(item, dict): + continue + + gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0 + process_list_raw = item.get("process_list", []) + if not isinstance(process_list_raw, list): + continue + + plist: list[ProcessListItem] = [] + + for entry in process_list_raw: + if not isinstance(entry, dict): + plist.append(ProcessListItem(process_info=str(entry))) + continue + + name = entry.get("name", "N/A") + pid_val = entry.get("pid", 0) + try: + pid = int(pid_val) if pid_val not in (None, "") else 0 + except Exception: + pid = 0 + + mem_vu = self._valueunit(entry.get("mem"), "B") + + mu = entry.get("memory_usage") or {} + mem_usage = ProcessMemoryUsage( + gtt_mem=self._valueunit(mu.get("gtt_mem"), "B"), + cpu_mem=self._valueunit(mu.get("cpu_mem"), "B"), + vram_mem=self._valueunit(mu.get("vram_mem"), "B"), + ) + + eu = entry.get("engine_usage") or {} + usage = ProcessUsage( + gfx=self._valueunit(eu.get("gfx"), "ns"), + enc=self._valueunit(eu.get("enc"), "ns"), + ) + + try: + plist.append( + ProcessListItem( + 
    def get_partition(self) -> Optional[Partition]:
        """Check partition information

        Runs the amd-smi partition sub-command and builds per-GPU memory and
        compute partition entries. Entries that fail model validation are
        logged and skipped rather than aborting the whole collection.

        Returns:
            Optional[Partition]: Partition data if available, None when the
            command produced no output or the final model fails validation
        """
        ret = self._run_amd_smi_dict(self.CMD_PARTITION)
        if not ret:
            return None

        # Normalize to a list so single-dict and list results parse the same way.
        partition_data = ret if isinstance(ret, list) else [ret]
        memparts: list[PartitionMemory] = []
        computeparts: list[PartitionCompute] = []

        # Flatten multi-JSON results (partition command returns multiple JSON arrays)
        # Non-dict/non-list items are silently dropped here by design.
        flattened_data = []
        for item in partition_data:
            if isinstance(item, list):
                flattened_data.extend(item)
            elif isinstance(item, dict):
                flattened_data.append(item)

        for item in flattened_data:
            if not isinstance(item, dict):
                continue

            # Missing/empty gpu index defaults to 0.
            gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0
            mem_pt = item.get("memory_partition")
            comp_pt = item.get("compute_partition")

            # Memory and compute partitions are built independently so that a
            # validation failure in one does not lose the other.
            try:
                memparts.append(
                    PartitionMemory(gpu_id=gpu_idx, partition_type=str(mem_pt) if mem_pt else None)
                )
            except ValidationError as err:
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Failed to build PartitionMemory",
                    data={
                        "errors": err.errors(include_url=False),
                        "gpu_index": gpu_idx,
                        "data": mem_pt,
                    },
                    priority=EventPriority.WARNING,
                )

            try:
                computeparts.append(
                    PartitionCompute(
                        gpu_id=gpu_idx, partition_type=str(comp_pt) if comp_pt else None
                    )
                )
            except ValidationError as err:
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Failed to build PartitionCompute",
                    data={
                        "errors": err.errors(include_url=False),
                        "gpu_index": gpu_idx,
                        "data": comp_pt,
                    },
                    priority=EventPriority.WARNING,
                )

        try:
            return Partition(memory_partition=memparts, compute_partition=computeparts)
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build Partition",
                data={"errors": err.errors(include_url=False)},
                priority=EventPriority.WARNING,
            )
            return None
    def get_static(self) -> Optional[list[AmdSmiStatic]]:
        """Get Static info from amd-smi static command

        Parses the per-GPU "static" sections (asic, board, bus, vbios, driver,
        numa, vram, ras, cache, clock, soc_pstate, xgmi_plpd) into AmdSmiStatic
        models. If the bulk query fails, falls back to querying each GPU
        individually. Items failing validation are logged and skipped.

        Returns:
            Optional[list[AmdSmiStatic]]: list of AmdSmiStatic instances or empty list
        """
        ret = self._run_amd_smi_dict(self.CMD_STATIC)
        if not ret:
            # Bulk query can fail on some setups; retry one GPU at a time.
            self.logger.info("Bulk static query failed, attempting per-GPU fallback")
            gpu_list = self.get_gpu_list()
            if gpu_list:
                fallback_data: list[dict] = []
                for gpu in gpu_list:
                    gpu_data = self._run_amd_smi_dict(self.CMD_STATIC_GPU.format(gpu_id=gpu.gpu))
                    if gpu_data:
                        if isinstance(gpu_data, dict):
                            fallback_data.append(gpu_data)
                        elif isinstance(gpu_data, list):
                            fallback_data.extend(gpu_data)
                if fallback_data:
                    ret = fallback_data
                else:
                    return []
            else:
                return []

        # Some amd-smi outputs nest results under a "gpu_data" key — unwrap it.
        # (Exact versions producing this shape unconfirmed from here.)
        if isinstance(ret, dict) and "gpu_data" in ret:
            ret = ret["gpu_data"]

        static_data = ret if isinstance(ret, list) else [ret]
        out: list[AmdSmiStatic] = []

        for item in static_data:
            # Entries without a "gpu" key are not per-GPU records; skip them.
            if not isinstance(item, dict) or "gpu" not in item:
                continue

            gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0

            # `or {}` guards against explicit null sections in the JSON.
            asic = item.get("asic", {}) or {}
            board = item.get("board", {}) or {}
            bus = item.get("bus", {}) or {}
            vbios = item.get("vbios", {}) or {}
            driver = item.get("driver", {}) or {}
            numa = item.get("numa", {}) or {}
            vram = item.get("vram", {}) or {}
            ras = item.get("ras", {}) or {}
            cache = item.get("cache", {}) or {}
            clock = item.get("clock", {}) or {}
            soc_pstate = item.get("soc_pstate", {}) or {}
            xgmi_plpd = item.get("xgmi_plpd", {}) or {}

            # Bus / PCIe
            bus_model = StaticBus(
                bdf=str(bus.get("bdf", "")),
                max_pcie_width=self._valueunit(bus.get("max_pcie_width"), "x"),
                max_pcie_speed=self._valueunit(bus.get("max_pcie_speed"), "GT/s"),
                pcie_interface_version=self._normalize(bus.get("pcie_interface_version")),
                slot_type=self._normalize(bus.get("slot_type"), slot_type=True),
            )

            # ASIC: oam_id / num_compute_units may arrive as int, str, or "N/A".
            oam_id_raw = asic.get("oam_id")
            if oam_id_raw in (None, "", "N/A"):
                oam_id_val: Union[int, str] = "N/A"
            elif isinstance(oam_id_raw, str):
                oam_id_val = oam_id_raw
            else:
                oam_id_val = int(oam_id_raw) if oam_id_raw is not None else "N/A"

            num_cu_raw = asic.get("num_compute_units")
            if num_cu_raw in (None, "", "N/A"):
                num_cu_val: Union[int, str] = "N/A"
            elif isinstance(num_cu_raw, str):
                num_cu_val = num_cu_raw
            else:
                num_cu_val = int(num_cu_raw) if num_cu_raw is not None else "N/A"

            asic_model = StaticAsic(
                market_name=self._normalize(
                    asic.get("market_name") or asic.get("asic_name"), default=""
                ),
                vendor_id=str(asic.get("vendor_id", "")),
                vendor_name=str(asic.get("vendor_name", "")),
                subvendor_id=str(asic.get("subvendor_id", "")),
                device_id=str(asic.get("device_id", "")),
                subsystem_id=str(asic.get("subsystem_id", "")),
                rev_id=str(asic.get("rev_id", "")),
                asic_serial=str(asic.get("asic_serial", "")),
                oam_id=oam_id_val,
                num_compute_units=num_cu_val,
                target_graphics_version=str(asic.get("target_graphics_version", "")),
            )

            # Board: "amdsmi_model_number" is an alternate key for model_number.
            board_model = StaticBoard(
                model_number=str(
                    board.get("model_number", "") or board.get("amdsmi_model_number", "")
                ),
                product_serial=str(board.get("product_serial", "")),
                fru_id=str(board.get("fru_id", "")),
                product_name=str(board.get("product_name", "")),
                manufacturer_name=str(board.get("manufacturer_name", "")),
            )

            # Driver
            driver_model = StaticDriver(
                name=self._normalize(
                    driver.get("driver_name") if driver else None, default="unknown"
                ),
                version=self._normalize(
                    driver.get("driver_version") if driver else None, default="unknown"
                ),
            )

            # VBIOS: optional section, model stays None when absent.
            vbios_model: Optional[StaticVbios] = None
            if vbios:
                vbios_model = StaticVbios(
                    name=str(vbios.get("vbios_name", "")),
                    build_date=str(vbios.get("vbios_build_date", "")),
                    part_number=str(vbios.get("vbios_part_number", "")),
                    version=str(vbios.get("vbios_version", "")),
                )

            # NUMA: affinity may be int, str, or "N/A" like the ASIC fields.
            numa_node = int(numa.get("node", 0) or 0)
            affinity_raw = numa.get("affinity")
            if affinity_raw in (None, "", "N/A"):
                affinity_val: Union[int, str] = "N/A"
            elif isinstance(affinity_raw, str):
                affinity_val = affinity_raw
            else:
                affinity_val = int(affinity_raw) if affinity_raw is not None else "N/A"

            numa_model = StaticNuma(node=numa_node, affinity=affinity_val)

            # VRAM: size is reported in MB; convert to bytes for the model.
            vram_type = str(vram.get("vram_type", "") or "unknown")
            vram_vendor = vram.get("vram_vendor")
            vram_bits = vram.get("vram_bit_width")
            vram_size_b: Optional[int] = None
            if vram.get("vram_size_mb") is not None:
                try:
                    vram_size_b = int(vram["vram_size_mb"]) * 1024 * 1024
                except Exception:
                    vram_size_b = None

            vram_model = StaticVram(
                type=vram_type,
                vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor),
                size=self._valueunit(vram_size_b, "B"),
                bit_width=self._valueunit(vram_bits, "bit"),
                max_bandwidth=None,
            )

            # SOC P-state
            soc_pstate_model = self._parse_soc_pstate(soc_pstate)

            # XGMI PLPD
            xgmi_plpd_model = self._parse_xgmi_plpd(xgmi_plpd)

            # RAS
            ras_model = self._parse_ras(ras)

            # Cache info
            cache_info_model = self._parse_cache_info(cache)

            # Clock
            clock_dict_model = self._parse_clock_dict(clock)

            try:
                out.append(
                    AmdSmiStatic(
                        gpu=gpu_idx,
                        asic=asic_model,
                        bus=bus_model,
                        vbios=vbios_model,
                        limit=None,
                        driver=driver_model,
                        board=board_model,
                        ras=ras_model,
                        soc_pstate=soc_pstate_model,
                        xgmi_plpd=xgmi_plpd_model,
                        process_isolation="",
                        numa=numa_model,
                        vram=vram_model,
                        cache_info=cache_info_model,
                        partition=None,
                        clock=clock_dict_model,
                    )
                )
            except ValidationError as err:
                self.logger.error(err)
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Failed to build AmdSmiStatic",
                    data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx},
                    priority=EventPriority.WARNING,
                )

        return out
in (None, "") else 0, + policy_description=str(desc), + ) + ) + except ValidationError: + continue + + if not num_supported and not current_id and not plpds: + return None + + try: + return StaticXgmiPlpd( + num_supported=num_supported, + current_id=current_id, + plpds=plpds, + ) + except ValidationError: + return None + + def _parse_ras(self, data: dict) -> StaticRas: + """Parse RAS/ECC data + + Args: + data (dict): RAS data from amd-smi + + Returns: + StaticRas: StaticRas instance with default values if data is missing + """ + if not isinstance(data, dict): + # Return default RAS data + return StaticRas( + eeprom_version="N/A", + parity_schema=EccState.NA, + single_bit_schema=EccState.NA, + double_bit_schema=EccState.NA, + poison_schema=EccState.NA, + ecc_block_state={}, + ) + + def _to_ecc_state(value: Any) -> EccState: + """Convert string to EccState enum""" + if not value or not isinstance(value, str): + return EccState.NA + try: + return EccState(value.upper()) + except (ValueError, AttributeError): + return EccState.NA + + eeprom_version = str(data.get("eeprom_version", "N/A") or "N/A") + parity_schema = _to_ecc_state(data.get("parity_schema")) + single_bit_schema = _to_ecc_state(data.get("single_bit_schema")) + double_bit_schema = _to_ecc_state(data.get("double_bit_schema")) + poison_schema = _to_ecc_state(data.get("poison_schema")) + + ecc_block_state = data.get("ecc_block_state", {}) + ecc_block_state_final: Union[Dict[str, EccState], str] + if isinstance(ecc_block_state, dict): + parsed_blocks = {} + for block_name, block_state in ecc_block_state.items(): + parsed_blocks[block_name] = _to_ecc_state(block_state) + ecc_block_state_final = parsed_blocks + elif isinstance(ecc_block_state, str): + ecc_block_state_final = ecc_block_state + else: + ecc_block_state_final = {} + + try: + return StaticRas( + eeprom_version=eeprom_version, + parity_schema=parity_schema, + single_bit_schema=single_bit_schema, + double_bit_schema=double_bit_schema, + 
poison_schema=poison_schema, + ecc_block_state=ecc_block_state_final, + ) + except ValidationError: + # Return default if validation fails + return StaticRas( + eeprom_version="N/A", + parity_schema=EccState.NA, + single_bit_schema=EccState.NA, + double_bit_schema=EccState.NA, + poison_schema=EccState.NA, + ecc_block_state={}, + ) + + def _parse_cache_info(self, data: dict) -> list[StaticCacheInfoItem]: + """Parse cache info data + + Args: + data (dict): Cache data from amd-smi + + Returns: + list[StaticCacheInfoItem]: list of StaticCacheInfoItem instances + """ + if not isinstance(data, dict) or not isinstance(data.get("cache"), list): + return [] + + items = data["cache"] + + def _as_list_str(v: Any) -> list[str]: + if isinstance(v, list): + return [str(x) for x in v] + if isinstance(v, str): + parts = [p.strip() for p in v.replace(";", ",").split(",")] + return [p for p in parts if p] + return [] + + out: list[StaticCacheInfoItem] = [] + for e in items: + if not isinstance(e, dict): + continue + + cache_level = self._valueunit_req(e.get("cache_level"), "") + max_num_cu_shared = self._valueunit_req(e.get("max_num_cu_shared"), "") + num_cache_instance = self._valueunit_req(e.get("num_cache_instance"), "") + cache_size = self._valueunit(e.get("cache_size"), "", required=False) + cache_props = _as_list_str(e.get("cache_properties")) + + lvl_val = cache_level.value + cache_label_val = ( + f"Label_{int(lvl_val) if isinstance(lvl_val, (int, float)) else lvl_val}" + ) + cache_label = ValueUnit(value=cache_label_val, unit="") + + try: + out.append( + StaticCacheInfoItem( + cache=cache_label, + cache_properties=cache_props, + cache_size=cache_size, + cache_level=cache_level, + max_num_cu_shared=max_num_cu_shared, + num_cache_instance=num_cache_instance, + ) + ) + except ValidationError as err: + self._log_event( + category=EventCategory.APPLICATION, + description="Bad cache info entry from amd-smi; skipping", + data={"entry": repr(e), "errors": 
    def _parse_clock(self, data: dict) -> Optional[StaticClockData]:
        """Parse clock data

        Converts the "frequency" list into up to three "<n> MHz" level strings
        and reads the current level index. Frequencies are heuristically scaled
        to MHz based on magnitude (values that look like Hz or kHz are divided
        down — assumes amd-smi never reports a true MHz value >= 10000;
        TODO confirm against amd-smi output).

        Args:
            data (dict): Clock data from amd-smi

        Returns:
            Optional[StaticClockData]: StaticClockData instance or None when
            the input has no usable frequency list or validation fails
        """
        if not isinstance(data, dict):
            return None

        freqs_raw = data.get("frequency")
        if not isinstance(freqs_raw, list) or not freqs_raw:
            return None

        def _to_mhz(v: object) -> Optional[int]:
            # Magnitude-based unit guess: >=1e7 treated as Hz, >=1e4 as kHz.
            x = self._to_number(v)
            if x is None:
                return None
            xf = float(x)
            if xf >= 1e7:
                return int(round(xf / 1_000_000.0))
            if xf >= 1e4:
                return int(round(xf / 1_000.0))
            return int(round(xf))

        freqs_mhz: list[int] = []
        for v in freqs_raw:
            mhz = _to_mhz(v)
            if mhz is not None:
                freqs_mhz.append(mhz)

        if not freqs_mhz:
            return None

        def _fmt(n: Optional[int]) -> Optional[str]:
            return None if n is None else f"{n} MHz"

        # Level 0 is mandatory in the model; levels 1 and 2 are optional.
        level0: str = _fmt(freqs_mhz[0]) or "0 MHz"
        level1: Optional[str] = _fmt(freqs_mhz[1]) if len(freqs_mhz) > 1 else None
        level2: Optional[str] = _fmt(freqs_mhz[2]) if len(freqs_mhz) > 2 else None

        # Current level index: accept numeric or numeric-string, else None.
        cur_raw = data.get("current")
        current: Optional[int]
        if isinstance(cur_raw, (int, float)):
            current = int(cur_raw)
        elif isinstance(cur_raw, str) and cur_raw.strip() and cur_raw.upper() != "N/A":
            try:
                current = int(cur_raw.strip())
            except Exception:
                current = None
        else:
            current = None

        try:
            levels = StaticFrequencyLevels.model_validate(
                {"Level 0": level0, "Level 1": level1, "Level 2": level2}
            )

            # Use the alias "current level" as defined in the model
            return StaticClockData.model_validate(
                {"frequency_levels": levels, "current level": current}
            )
        except ValidationError:
            return None
Returns: + Optional[dict[str, Union[StaticClockData, None]]]: dictionary of clock data or None + """ + if not isinstance(data, dict): + return None + + clock_dict: dict[str, Union[StaticClockData, None]] = {} + + clock_data = self._parse_clock(data) + if clock_data: + clock_dict["clk"] = clock_data + + return clock_dict if clock_dict else None + + def get_cper_data(self) -> List[FileModel]: + """Collect CPER data from amd-smi ras command + + Returns: + list[FileModel]: List of CPER files or empty list if not supported/available + """ + try: + AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper" + # Ensure the cper folder exists but is empty + self._run_sut_cmd( + f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json", + sudo=False, + ) + # Run amd-smi ras command with sudo to collect CPER data + cper_cmd_ret = self._run_sut_cmd( + f"{self.AMD_SMI_EXE} {self.CMD_RAS.format(folder=AMD_SMI_CPER_FOLDER)}", + sudo=True, + ) + if cper_cmd_ret.exit_code != 0: + # Command failed, return empty list + return [] + cper_cmd = cper_cmd_ret.stdout + # search that a CPER is actually created here + regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd) + if not regex_cper_search: + # Early exit if no CPER files were created + return [] + # tar the cper folder + self._run_sut_cmd( + f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .", + sudo=True, + ) + # Load the tar files + cper_zip = self._read_sut_file( + f"{AMD_SMI_CPER_FOLDER}.tar.gz", encoding=None, strip=False, log_artifact=True + ) + # Since encoding=None, this returns BinaryFileArtifact which has contents: bytes + if hasattr(cper_zip, "contents"): + io_bytes = io.BytesIO(cper_zip.contents) # type: ignore[attr-defined] + else: + return [] + del cper_zip # Free memory after reading the file + try: + with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file: + cper_data = [] + for member in tar_file.getmembers(): + if member.isfile() and 
    def collect_data(
        self,
        args: Any = None,
    ) -> tuple[TaskResult, Optional[AmdSmiDataModel]]:
        """Collect AmdSmi data from system

        Entry point for the collector. Verifies amd-smi is installed, logs the
        tool/ROCm versions, then gathers all sub-command data into a single
        AmdSmiDataModel. On failure the task status is set accordingly and the
        data model is None.

        Args:
            args (Any, optional): optional arguments for data collection. Defaults to None.

        Returns:
            tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and collected data model
        """

        if not self._check_amdsmi_installed():
            # Without the tool there is nothing to collect: mark NOT_RAN, not a failure.
            self._log_event(
                category=EventCategory.APPLICATION,
                description="amd-smi is not installed",
                priority=EventPriority.WARNING,
                console_log=True,
            )
            self.result.status = ExecutionStatus.NOT_RAN
            return self.result, None

        try:
            # Log version info up front for easier debugging of later parse issues.
            version = self._get_amdsmi_version()
            if version is not None:
                self.logger.info("amd-smi version: %s", version.version)
                self.logger.info("ROCm version: %s", version.rocm_version)

            amd_smi_data = self._get_amdsmi_data()

            # _get_amdsmi_data has already logged and set the status on failure.
            if amd_smi_data is None:
                return self.result, None

            return self.result, amd_smi_data
        except Exception as e:
            # Catch-all boundary: any unexpected error marks the task failed.
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Error running amd-smi collector",
                data={"exception": get_exception_traceback(e)},
                priority=EventPriority.ERROR,
                console_log=True,
            )
            self.result.status = ExecutionStatus.EXECUTION_FAILURE
            return self.result, None
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, AmdSmiAnalyzerArgs]):
    """Plugin for collection and analysis of amd-smi data.

    Declarative binding of the amd-smi collector, analyzer, analyzer
    arguments, and data model for the InBandDataPlugin framework.
    """

    # Pydantic model holding everything the collector gathers.
    DATA_MODEL = AmdSmiDataModel

    # Task that runs amd-smi on the target and builds DATA_MODEL.
    COLLECTOR = AmdSmiCollector

    # Task that analyzes a collected DATA_MODEL instance.
    ANALYZER = AmdSmiAnalyzer

    # Argument model accepted by the analyzer.
    ANALYZER_ARGS = AmdSmiAnalyzerArgs
def na_to_none_list(values: list[Union[int, str, None]]) -> list[Union[int, str, None]]:
    """Return a copy of ``values`` with every "N/A" entry replaced by None.

    Args:
        values: list that may contain the amd-smi "N/A" sentinel

    Returns:
        New list where "N/A" entries are None; the input list is not mutated.
    """
    return [None if entry == "N/A" else entry for entry in values]
+ """ + + model_config = ConfigDict( + str_min_length=1, + str_strip_whitespace=True, + populate_by_name=True, + extra="forbid", # Forbid extra fields not defined in the model + ) + + def __init__(self, **data): + # Convert Union[int, str, float] -> ValueUnit + for field_name, field_type in self.__class__.model_fields.items(): + annotation = field_type.annotation + target_type, container = find_annotation_in_container(annotation, ValueUnit) + if target_type is None: + continue + + if field_name in data and isinstance(data[field_name], (int, str, float)): + # If the field is a primitive type, convert it to ValueUnit dict for validator + data[field_name] = { + "value": data[field_name], + "unit": "", + } + + super().__init__(**data) + + +class ValueUnit(BaseModel): + """A model for a value with a unit. + + Accepts: + - dict: {"value": 123, "unit": "W"} + - number: 123 -> unit="" + - string with number+unit: "123 W" -> {"value": 123, "unit": "W"} + - "N/A" / "NA" / "" / None -> None + """ + + value: Union[int, float, str] + unit: str = "" + + @model_validator(mode="before") + @classmethod + def _coerce(cls, v): + # treat N/A as None + def na(x) -> bool: + return x is None or (isinstance(x, str) and x.strip().upper() in {"N/A", "NA", ""}) + + if na(v): + return None + + if isinstance(v, dict): + val = v.get("value") + unit = v.get("unit", "") + if na(val): + return None + if isinstance(val, str): + m = _NUM_UNIT_RE.match(val.strip()) + if m and not unit: + num, u = m.groups() + unit = u or unit or "" + val = float(num) if "." in num else int(num) + return {"value": val, "unit": unit} + + # numbers + if isinstance(v, (int, float)): + return {"value": v, "unit": ""} + + if isinstance(v, str): + s = v.strip() + m = _NUM_UNIT_RE.match(s) + if m: + num, unit = m.groups() + val = float(num) if "." 
in num else int(num) + return {"value": val, "unit": unit or ""} + return {"value": s, "unit": ""} + + return v + + @field_validator("unit") + @classmethod + def _clean_unit(cls, u): + return "" if u is None else str(u).strip() + + +# Process +class ProcessMemoryUsage(BaseModel): + gtt_mem: Optional[ValueUnit] + cpu_mem: Optional[ValueUnit] + vram_mem: Optional[ValueUnit] + + na_validator = field_validator("gtt_mem", "cpu_mem", "vram_mem", mode="before")(na_to_none) + + +class ProcessUsage(BaseModel): + # AMDSMI reports engine usage in nanoseconds + gfx: Optional[ValueUnit] + enc: Optional[ValueUnit] + na_validator = field_validator("gfx", "enc", mode="before")(na_to_none) + + +class ProcessInfo(BaseModel): + name: str + pid: int + memory_usage: ProcessMemoryUsage + mem_usage: Optional[ValueUnit] + usage: ProcessUsage + na_validator = field_validator("mem_usage", mode="before")(na_to_none) + + +class EccState(Enum): + ENABLED = "ENABLED" + DISABLED = "DISABLED" + NONE = "NONE" + PARITY = "PARITY" + SING_C = "SING_C" + MULT_UC = "MULT_UC" + POISON = "POISON" + NA = "N/A" + + +class ProcessListItem(BaseModel): + process_info: Union[ProcessInfo, str] + + +class Processes(BaseModel): + gpu: int + process_list: list[ProcessListItem] + + +# FW +class FwListItem(BaseModel): + fw_id: str + fw_version: str + + +class Fw(BaseModel): + gpu: int + fw_list: Union[list[FwListItem], str] + + +class AmdSmiListItem(BaseModel): + gpu: int + bdf: str + uuid: str + kfd_id: int + node_id: int + partition_id: int + + +class AmdSmiVersion(BaseModel): + """Contains the versioning info for amd-smi""" + + tool: Optional[str] = None + version: Optional[str] = None + amdsmi_library_version: Optional[str] = None + rocm_version: Optional[str] = None + amdgpu_version: Optional[str] = None + amd_hsmp_driver_version: Optional[str] = None + + @field_validator("*", mode="before") + @classmethod + def _stringify(cls, v): + if v is None or isinstance(v, str): + return v + if isinstance(v, (bytes, 
bytearray)): + return v.decode("utf-8", "ignore") + if isinstance(v, (tuple, list)): + return ".".join(str(x) for x in v) + return str(v) + + +class PartitionAccelerator(BaseModel): + """Accelerator partition data""" + + gpu_id: int + memory: Optional[str] = None + accelerator_type: Optional[str] = None + accelerator_profile_index: Optional[Union[str, int]] = None + partition_id: Optional[int] = None + + +class PartitionMemory(BaseModel): + """Memory Partition data""" + + gpu_id: int + partition_type: Optional[str] = None + + +class PartitionCompute(BaseModel): + """Compute Partition data""" + + gpu_id: int + partition_type: Optional[str] = None + + +class Partition(BaseModel): + """Contains the partition info for amd-smi""" + + memory_partition: list[PartitionMemory] = Field(default_factory=list) + compute_partition: list[PartitionCompute] = Field(default_factory=list) + + +### STATIC DATA ### +class StaticAsic(BaseModel): + market_name: str + vendor_id: str + vendor_name: str + subvendor_id: str + device_id: str + subsystem_id: str + rev_id: str + asic_serial: str + oam_id: Union[int, str] # can be N/A + num_compute_units: Union[int, str] # can be N/A + target_graphics_version: str + + +class StaticBus(AmdSmiBaseModel): + bdf: str + max_pcie_width: Optional[ValueUnit] = None + max_pcie_speed: Optional[ValueUnit] = None + pcie_interface_version: str = "unknown" + slot_type: str = "unknown" + + +class StaticVbios(BaseModel): + name: str + build_date: str + part_number: str + version: str + + +class StaticLimit(AmdSmiBaseModel): + max_power: Optional[ValueUnit] = None + min_power: Optional[ValueUnit] = None + socket_power: Optional[ValueUnit] = None + slowdown_edge_temperature: Optional[ValueUnit] = None + slowdown_hotspot_temperature: Optional[ValueUnit] = None + slowdown_vram_temperature: Optional[ValueUnit] = None + shutdown_edge_temperature: Optional[ValueUnit] = None + shutdown_hotspot_temperature: Optional[ValueUnit] = None + shutdown_vram_temperature: 
Optional[ValueUnit] = None + na_validator = field_validator( + "max_power", + "min_power", + "socket_power", + "slowdown_edge_temperature", + "slowdown_hotspot_temperature", + "slowdown_vram_temperature", + "shutdown_edge_temperature", + "shutdown_hotspot_temperature", + "shutdown_vram_temperature", + mode="before", + )(na_to_none) + + +class StaticDriver(BaseModel): + name: str + version: str + + +class StaticBoard(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + + amdsmi_model_number: str = Field( + alias="model_number" + ) # Model number is a reserved keyword for pydantic + product_serial: str + fru_id: str + product_name: str + manufacturer_name: str + + +class StaticRas(BaseModel): + eeprom_version: str + parity_schema: EccState + single_bit_schema: EccState + double_bit_schema: EccState + poison_schema: EccState + ecc_block_state: Union[dict[str, EccState], str] + + +class StaticPartition(BaseModel): + # The name for compute_partition has changed we will support both for now + + compute_partition: str = Field( + validation_alias=AliasChoices("compute_partition", "accelerator_partition") + ) + memory_partition: str + partition_id: int + + +class StaticPolicy(BaseModel): + policy_id: int + policy_description: str + + +class StaticSocPstate(BaseModel): + num_supported: int + current_id: int + policies: list[StaticPolicy] + + +class StaticXgmiPlpd(BaseModel): + num_supported: int + current_id: int + plpds: list[StaticPolicy] + + +class StaticNuma(BaseModel): + node: int + affinity: Union[int, str] # can be N/A + + +class StaticVram(AmdSmiBaseModel): + type: str + vendor: Optional[str] + size: Optional[ValueUnit] + bit_width: Optional[ValueUnit] + max_bandwidth: Optional[ValueUnit] = None + na_validator = field_validator("vendor", "size", "bit_width", "max_bandwidth", mode="before")( + na_to_none + ) + + +class StaticCacheInfoItem(AmdSmiBaseModel): + cache: ValueUnit + cache_properties: list[str] + cache_size: Optional[ValueUnit] + 
cache_level: ValueUnit + max_num_cu_shared: ValueUnit + num_cache_instance: ValueUnit + na_validator = field_validator("cache_size", mode="before")(na_to_none) + + +class StaticFrequencyLevels(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + + Level_0: str = Field(..., alias="Level 0") + Level_1: Optional[str] = Field(default=None, alias="Level 1") + Level_2: Optional[str] = Field(default=None, alias="Level 2") + + +class StaticClockData(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + frequency_levels: StaticFrequencyLevels + + current_level: Optional[int] = Field(..., alias="current level") + na_validator = field_validator("current_level", mode="before")(na_to_none) + + +class AmdSmiStatic(BaseModel): + """Contains all static data""" + + gpu: int + asic: StaticAsic + bus: StaticBus + vbios: Optional[StaticVbios] + limit: Optional[StaticLimit] + driver: StaticDriver + board: StaticBoard + ras: StaticRas + soc_pstate: Optional[StaticSocPstate] + xgmi_plpd: Optional[StaticXgmiPlpd] + process_isolation: str + numa: StaticNuma + vram: StaticVram + cache_info: list[StaticCacheInfoItem] + partition: Optional[StaticPartition] = None # This has been removed in Amd-smi 26.0.0+d30a0afe+ + clock: Optional[dict[str, Union[StaticClockData, None]]] = None + na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) + na_validator = field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")( + na_to_none + ) + + +# PAGES +class PageData(BaseModel): + page_address: Union[int, str] + page_size: Union[int, str] + status: str + value: Optional[int] + + +class BadPages(BaseModel): + gpu: int + retired: list[PageData] + + +# Metric Data +class MetricUsage(BaseModel): + gfx_activity: Optional[ValueUnit] + umc_activity: Optional[ValueUnit] + mm_activity: Optional[ValueUnit] + vcn_activity: list[Optional[Union[ValueUnit, str]]] + jpeg_activity: list[Optional[Union[ValueUnit, str]]] + gfx_busy_inst: 
Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]] + jpeg_busy: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]] + vcn_busy: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]] + na_validator_list = field_validator("vcn_activity", "jpeg_activity", mode="before")( + na_to_none_list + ) + na_validator = field_validator( + "gfx_activity", + "umc_activity", + "mm_activity", + "gfx_busy_inst", + "jpeg_busy", + "vcn_busy", + mode="before", + )(na_to_none) + + +class MetricPower(BaseModel): + socket_power: Optional[ValueUnit] + gfx_voltage: Optional[ValueUnit] + soc_voltage: Optional[ValueUnit] + mem_voltage: Optional[ValueUnit] + throttle_status: Optional[str] + power_management: Optional[str] + na_validator = field_validator( + "socket_power", + "gfx_voltage", + "soc_voltage", + "mem_voltage", + "throttle_status", + "power_management", + mode="before", + )(na_to_none) + + +class MetricClockData(BaseModel): + clk: Optional[ValueUnit] + min_clk: Optional[ValueUnit] + max_clk: Optional[ValueUnit] + clk_locked: Optional[Union[int, str, dict]] + deep_sleep: Optional[Union[int, str, dict]] + na_validator = field_validator( + "clk", "min_clk", "max_clk", "clk_locked", "deep_sleep", mode="before" + )(na_to_none) + + +class MetricTemperature(BaseModel): + edge: Optional[ValueUnit] + hotspot: Optional[ValueUnit] + mem: Optional[ValueUnit] + na_validator = field_validator("edge", "hotspot", "mem", mode="before")(na_to_none) + + +class MetricPcie(BaseModel): + width: Optional[int] + speed: Optional[ValueUnit] + bandwidth: Optional[ValueUnit] + replay_count: Optional[int] + l0_to_recovery_count: Optional[int] + replay_roll_over_count: Optional[int] + nak_sent_count: Optional[int] + nak_received_count: Optional[int] + current_bandwidth_sent: Optional[int] + current_bandwidth_received: Optional[int] + max_packet_size: Optional[int] + lc_perf_other_end_recovery: Optional[int] + na_validator = field_validator( + "width", + "speed", + "bandwidth", + 
"replay_count", + "l0_to_recovery_count", + "replay_roll_over_count", + "nak_sent_count", + "nak_received_count", + "current_bandwidth_sent", + "current_bandwidth_received", + "max_packet_size", + "lc_perf_other_end_recovery", + mode="before", + )(na_to_none) + + +class MetricEccTotals(BaseModel): + total_correctable_count: Optional[int] + total_uncorrectable_count: Optional[int] + total_deferred_count: Optional[int] + cache_correctable_count: Optional[int] + cache_uncorrectable_count: Optional[int] + na_validator = field_validator( + "total_correctable_count", + "total_uncorrectable_count", + "total_deferred_count", + "cache_correctable_count", + "cache_uncorrectable_count", + mode="before", + )(na_to_none) + + +class MetricErrorCounts(BaseModel): + correctable_count: Optional[str] + uncorrectable_count: Optional[str] + deferred_count: Optional[str] + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class MetricFan(BaseModel): + speed: Optional[ValueUnit] + max: Optional[ValueUnit] + rpm: Optional[ValueUnit] + usage: Optional[ValueUnit] + na_validator = field_validator("speed", "max", "rpm", "usage", mode="before")(na_to_none) + + +class MetricVoltageCurve(BaseModel): + point_0_frequency: Optional[ValueUnit] + point_0_voltage: Optional[ValueUnit] + point_1_frequency: Optional[ValueUnit] + point_1_voltage: Optional[ValueUnit] + point_2_frequency: Optional[ValueUnit] + point_2_voltage: Optional[ValueUnit] + + na_validator = field_validator( + "point_0_frequency", + "point_0_voltage", + "point_1_frequency", + "point_1_voltage", + "point_2_frequency", + "point_2_voltage", + mode="before", + )(na_to_none) + + +class MetricEnergy(BaseModel): + total_energy_consumption: Optional[ValueUnit] + na_validator = field_validator("total_energy_consumption", mode="before")(na_to_none) + + +class MetricMemUsage(BaseModel): + total_vram: Optional[ValueUnit] + used_vram: Optional[ValueUnit] + 
free_vram: Optional[ValueUnit] + total_visible_vram: Optional[ValueUnit] + used_visible_vram: Optional[ValueUnit] + free_visible_vram: Optional[ValueUnit] + total_gtt: Optional[ValueUnit] + used_gtt: Optional[ValueUnit] + free_gtt: Optional[ValueUnit] + na_validator = field_validator( + "total_vram", + "used_vram", + "free_vram", + "total_visible_vram", + "used_visible_vram", + "free_visible_vram", + "total_gtt", + "used_gtt", + "free_gtt", + mode="before", + )(na_to_none) + + +class MetricThrottleVu(BaseModel): + xcp_0: Optional[list[Optional[Union[ValueUnit, str]]]] = None + # Deprecated below + value: Optional[dict[str, list[Union[int, str]]]] = Field(deprecated=True, default=None) + unit: str = Field(deprecated=True, default="") + + +class MetricThrottle(AmdSmiBaseModel): + accumulation_counter: Optional[Union[MetricThrottleVu, ValueUnit]] = None + + gfx_clk_below_host_limit_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + gfx_clk_below_host_limit_power_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + gfx_clk_below_host_limit_power_violation_activity: Optional[ + Union[MetricThrottleVu, ValueUnit] + ] = None + gfx_clk_below_host_limit_power_violation_status: Optional[ + Union[MetricThrottleVu, ValueUnit] + ] = None + gfx_clk_below_host_limit_violation_activity: Optional[Union[MetricThrottleVu, ValueUnit]] = None + gfx_clk_below_host_limit_violation_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = ( + None + ) + gfx_clk_below_host_limit_violation_status: Optional[Union[MetricThrottleVu, ValueUnit]] = None + gfx_clk_below_host_limit_thermal_violation_accumulated: Optional[ + Union[MetricThrottleVu, ValueUnit] + ] = None + gfx_clk_below_host_limit_thermal_violation_activity: Optional[ + Union[MetricThrottleVu, ValueUnit] + ] = None + gfx_clk_below_host_limit_thermal_violation_status: Optional[ + Union[MetricThrottleVu, ValueUnit] + ] = None + gfx_clk_below_host_limit_thermal_accumulated: 
Optional[Union[MetricThrottleVu, ValueUnit]] = ( + None + ) + + hbm_thermal_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + hbm_thermal_violation_activity: Optional[Union[MetricThrottleVu, ValueUnit]] = None + hbm_thermal_violation_status: Optional[Union[MetricThrottleVu, ValueUnit]] = None + low_utilization_violation_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + low_utilization_violation_activity: Optional[Union[MetricThrottleVu, ValueUnit]] = None + low_utilization_violation_status: Optional[Union[MetricThrottleVu, ValueUnit]] = None + ppt_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + ppt_violation_activity: Optional[Union[MetricThrottleVu, ValueUnit]] = None + ppt_violation_status: Optional[Union[MetricThrottleVu, ValueUnit]] = None + prochot_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + prochot_violation_activity: Optional[Union[MetricThrottleVu, ValueUnit]] = None + prochot_violation_status: Optional[Union[MetricThrottleVu, ValueUnit]] = None + socket_thermal_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + socket_thermal_violation_activity: Optional[Union[MetricThrottleVu, ValueUnit]] = None + socket_thermal_violation_status: Optional[Union[MetricThrottleVu, ValueUnit]] = None + vr_thermal_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + vr_thermal_violation_activity: Optional[Union[MetricThrottleVu, ValueUnit]] = None + vr_thermal_violation_status: Optional[Union[MetricThrottleVu, ValueUnit]] = None + + total_gfx_clk_below_host_limit_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + low_utilization_accumulated: Optional[Union[MetricThrottleVu, ValueUnit]] = None + total_gfx_clk_below_host_limit_violation_status: Optional[ + Union[MetricThrottleVu, ValueUnit] + ] = None + total_gfx_clk_below_host_limit_violation_activity: Optional[ + Union[MetricThrottleVu, ValueUnit] + ] = None + + na_validator = field_validator( + 
"accumulation_counter", + "gfx_clk_below_host_limit_accumulated", + "gfx_clk_below_host_limit_power_accumulated", + "gfx_clk_below_host_limit_power_violation_activity", + "gfx_clk_below_host_limit_power_violation_status", + "gfx_clk_below_host_limit_violation_activity", + "gfx_clk_below_host_limit_violation_accumulated", + "gfx_clk_below_host_limit_violation_status", + "gfx_clk_below_host_limit_thermal_violation_accumulated", + "gfx_clk_below_host_limit_thermal_violation_activity", + "gfx_clk_below_host_limit_thermal_violation_status", + "gfx_clk_below_host_limit_thermal_accumulated", + "hbm_thermal_accumulated", + "hbm_thermal_violation_activity", + "hbm_thermal_violation_status", + "low_utilization_violation_accumulated", + "low_utilization_violation_activity", + "low_utilization_violation_status", + "ppt_accumulated", + "ppt_violation_activity", + "ppt_violation_status", + "prochot_accumulated", + "prochot_violation_activity", + "prochot_violation_status", + "socket_thermal_accumulated", + "socket_thermal_violation_activity", + "socket_thermal_violation_status", + "vr_thermal_accumulated", + "vr_thermal_violation_activity", + "vr_thermal_violation_status", + "total_gfx_clk_below_host_limit_accumulated", + "low_utilization_accumulated", + "total_gfx_clk_below_host_limit_violation_status", + "total_gfx_clk_below_host_limit_violation_activity", + mode="before", + )(na_to_none) + + +class EccData(BaseModel): + "ECC counts collected per ecc block" + + correctable_count: Optional[int] = 0 + uncorrectable_count: Optional[int] = 0 + deferred_count: Optional[int] = 0 + + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class AmdSmiMetric(BaseModel): + gpu: int + usage: MetricUsage + power: MetricPower + clock: dict[str, MetricClockData] + temperature: MetricTemperature + pcie: MetricPcie + ecc: MetricEccTotals + ecc_blocks: Union[dict[str, EccData], str] + fan: MetricFan + voltage_curve: 
Optional[MetricVoltageCurve] + perf_level: Optional[Union[str, dict]] + xgmi_err: Optional[Union[str, dict]] + energy: Optional[MetricEnergy] + mem_usage: MetricMemUsage + throttle: MetricThrottle + + na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) + + @field_validator("ecc_blocks", mode="before") + @classmethod + def validate_ecc_blocks(cls, value: Union[dict[str, EccData], str]) -> dict[str, EccData]: + """Validate the ecc_blocks field.""" + if isinstance(value, str): + # If it's a string, we assume it's "N/A" and return an empty dict + return {} + return value + + @field_validator("energy", mode="before") + @classmethod + def validate_energy(cls, value: Optional[Any]) -> Optional[MetricEnergy]: + """Validate the energy field.""" + if value == "N/A" or value is None: + return None + return value + + +### LINK DATA ### + + +class LinkStatusTable(Enum): + UP = "U" + DOWN = "D" + DISABLED = "X" + + +class BiDirectionalTable(Enum): + SELF = "SELF" + TRUE = "T" + + +class DmaTable(Enum): + SELF = "SELF" + TRUE = "T" + + +class AtomicsTable(Enum): + SELF = "SELF" + TRUE = "64,32" + THIRTY_TWO = "32" + SIXTY_FOUR = "64" + + +class LinkTypes(Enum): + XGMI = "XGMI" + PCIE = "PCIE" + SELF = "SELF" + + +class AccessTable(Enum): + ENABLED = "ENABLED" + DISABLED = "DISABLED" + + +# XGMI +class XgmiLink(BaseModel): + gpu: int + bdf: str + read: Optional[ValueUnit] + write: Optional[ValueUnit] + na_validator = field_validator("read", "write", mode="before")(na_to_none) + + +class XgmiLinkMetrics(BaseModel): + bit_rate: Optional[ValueUnit] + max_bandwidth: Optional[ValueUnit] + link_type: str + links: list[XgmiLink] + na_validator = field_validator("max_bandwidth", "bit_rate", mode="before")(na_to_none) + + +class XgmiMetrics(BaseModel): + gpu: int + bdf: str + link_metrics: XgmiLinkMetrics + + +class XgmiLinks(BaseModel): + gpu: int + bdf: str + link_status: list[LinkStatusTable] + + +class CoherentTable(Enum): + COHERANT = "C" + NON_COHERANT 
= "NC" + SELF = "SELF" + + +# TOPO + + +class TopoLink(BaseModel): + gpu: int + bdf: str + weight: int + link_status: AccessTable + link_type: LinkTypes + num_hops: int + bandwidth: str + # The below fields are sometimes missing, so we use Optional + coherent: Optional[CoherentTable] = None + atomics: Optional[AtomicsTable] = None + dma: Optional[DmaTable] = None + bi_dir: Optional[BiDirectionalTable] = None + + @computed_field + def bandwidth_from(self) -> Optional[int]: + """Get the bandwidth from the link.""" + bw_split = self.bandwidth.split("-") + if len(bw_split) == 2: + return int(bw_split[0]) + else: + # If the bandwidth is not in the expected format, return None + return None + + @computed_field + def bandwidth_to(self) -> Optional[int]: + """Get the bandwidth to the link.""" + bw_split = self.bandwidth.split("-") + if len(bw_split) == 2: + return int(bw_split[1]) + else: + # If the bandwidth is not in the expected format, return None + return None + + +class Topo(BaseModel): + gpu: int + bdf: str + links: list[TopoLink] + + +class AmdSmiTstData(BaseModel): + "Summary of amdsmitst results, with list and count of passing/skipped/failed tests" + + passed_tests: list[str] = Field(default_factory=list) + skipped_tests: list[str] = Field(default_factory=list) + failed_tests: list[str] = Field(default_factory=list) + passed_test_count: int = 0 + skipped_test_count: int = 0 + failed_test_count: int = 0 + + +class AmdSmiDataModel(DataModel): + """Data model for amd-smi data. + + Optionals are used to allow for the data to be missing, + This makes the data class more flexible for the analyzer + which consumes only the required data. If any more data is + required for the analyzer then they should not be set to + default. 
+ """ + + model_config = ConfigDict( + str_min_length=1, + str_strip_whitespace=True, + populate_by_name=True, + ) + + version: Optional[AmdSmiVersion] = None + gpu_list: Optional[list[AmdSmiListItem]] = Field(default_factory=list) + partition: Optional[Partition] = None + process: Optional[list[Processes]] = Field(default_factory=list) + topology: Optional[list[Topo]] = Field(default_factory=list) + firmware: Optional[list[Fw]] = Field(default_factory=list) + bad_pages: Optional[list[BadPages]] = Field(default_factory=list) + static: Optional[list[AmdSmiStatic]] = Field(default_factory=list) + metric: Optional[list[AmdSmiMetric]] = Field(default_factory=list) + xgmi_metric: Optional[list[XgmiMetrics]] = Field(default_factory=list) + xgmi_link: Optional[list[XgmiLinks]] = Field(default_factory=list) + cper_data: Optional[list[FileModel]] = Field(default_factory=list) + amdsmitst_data: AmdSmiTstData = Field(default_factory=AmdSmiTstData) + + def get_list(self, gpu: int) -> Optional[AmdSmiListItem]: + """Get the gpu list item for the given gpu id.""" + if self.gpu_list is None: + return None + for item in self.gpu_list: + if item.gpu == gpu: + return item + return None + + def get_process(self, gpu: int) -> Optional[Processes]: + """Get the process data for the given gpu id.""" + if self.process is None: + return None + for item in self.process: + if item.gpu == gpu: + return item + return None + + def get_firmware(self, gpu: int) -> Optional[Fw]: + """Get the firmware data for the given gpu id.""" + if self.firmware is None: + return None + for item in self.firmware: + if item.gpu == gpu: + return item + return None + + def get_static(self, gpu: int) -> Optional[AmdSmiStatic]: + """Get the static data for the given gpu id.""" + if self.static is None: + return None + for item in self.static: + if item.gpu == gpu: + return item + return None + + def get_bad_pages(self, gpu: int) -> Optional[BadPages]: + """Get the bad pages data for the given gpu id.""" + if 
self.bad_pages is None: + return None + for item in self.bad_pages: + if item.gpu == gpu: + return item + return None diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py new file mode 100644 index 00000000..333f37ae --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -0,0 +1,50 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
class AmdSmiAnalyzerArgs(AnalyzerArgs):
    """Expected values and thresholds for amd-smi data analysis.

    Every expectation field is optional; a value of None presumably means
    "no expectation supplied" and the corresponding check is skipped —
    verify against the analyzer implementation.
    """

    # When True, static data (asic/board/driver/etc.) is checked as well
    check_static_data: bool = False
    # Expected number of processes running on the GPUs
    expected_gpu_processes: Optional[int] = None
    # Expected maximum power limit
    expected_max_power: Optional[int] = None
    # Expected GPU driver version string
    expected_driver_version: Optional[str] = None
    # Expected memory / compute partition modes
    expected_memory_partition_mode: Optional[str] = None
    expected_compute_partition_mode: Optional[str] = None
    # Expected PLDM version string
    expected_pldm_version: Optional[str] = None
    # Thresholds for the PCIe l0_to_recovery_count metric
    # (error event at >= 3 by default, warning at >= 1 — confirm comparison in analyzer)
    l0_to_recovery_count_error_threshold: Optional[int] = 3
    l0_to_recovery_count_warning_threshold: Optional[int] = 1
    # Expected PCIe vendor/device IDs for the endpoint and endpoint VF
    vendorid_ep: Optional[str] = None
    vendorid_ep_vf: Optional[str] = None
    devid_ep: Optional[str] = None
    devid_ep_vf: Optional[str] = None
    # SKU name of the device under test
    sku_name: Optional[str] = None
    # Acceptable XGMI link speeds
    expected_xgmi_speed: Optional[list[float]] = None
    # Optional time window restricting analysis
    analysis_range_start: Optional[datetime] = None
    analysis_range_end: Optional[datetime] = None
class CperAnalysisTaskMixin:
    """Mixin adding CPER (Common Platform Error Record) analysis support.

    The host class must provide a ``_log_event`` method; ``analyzer_cpers``
    raises ``NotImplementedError`` when it is absent.
    """

    def analyzer_cpers(
        self,
        cper_data: Dict[str, io.BytesIO],
        analysis_range_start: Optional[datetime],
        analysis_range_end: Optional[datetime],
    ):
        """Generate Events from CPER data.

        Note: CPER analysis is not currently implemented. This is a stub
        that logs a single warning event when CPER data is present.

        Args:
            cper_data (Dict[str, io.BytesIO]): Dictionary of CPER file names to file contents
            analysis_range_start (Optional[datetime]): Optional start time for analysis range (currently unused)
            analysis_range_end (Optional[datetime]): Optional end time for analysis range (currently unused)
        """
        # Guard: the mixin is unusable without a host-supplied _log_event
        log_event = getattr(self, "_log_event", None)
        if not callable(log_event):
            raise NotImplementedError("The class must implement the _log_event method.")

        # Nothing to report when no CPER files were collected
        if not cper_data:
            return

        log_event(
            category=EventCategory.RAS,
            priority=EventPriority.WARNING,
            description="CPER data found but analysis is not implemented",
            data={
                "cper_file_count": len(cper_data),
                "cper_files": list(cper_data.keys()),
                "note": "CPER analysis requires additional dependencies not currently available",
            },
        )
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .analyzer_args import DeviceEnumerationAnalyzerArgs +from .device_enumeration_plugin import DeviceEnumerationPlugin + +__all__ = ["DeviceEnumerationPlugin", "DeviceEnumerationAnalyzerArgs"] diff --git a/nodescraper/plugins/inband/device_enumeration/analyzer_args.py b/nodescraper/plugins/inband/device_enumeration/analyzer_args.py new file mode 100644 index 00000000..8f74ed00 --- /dev/null +++ b/nodescraper/plugins/inband/device_enumeration/analyzer_args.py @@ -0,0 +1,73 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Any, Optional + +from pydantic import field_validator + +from nodescraper.models import AnalyzerArgs + +from .deviceenumdata import DeviceEnumerationDataModel + + +class DeviceEnumerationAnalyzerArgs(AnalyzerArgs): + cpu_count: Optional[list[int]] = None + gpu_count: Optional[list[int]] = None + vf_count: Optional[list[int]] = None + + @field_validator("cpu_count", "gpu_count", "vf_count", mode="before") + @classmethod + def normalize_to_list(cls, v: Any) -> Optional[list[int]]: + """Convert single integer values to lists for consistent handling. + + Args: + v: The input value (can be int, list[int], or None). + + Returns: + Optional[list[int]]: The normalized list value or None. 
+ """ + if v is None: + return None + if isinstance(v, int): + return [v] + return v + + @classmethod + def build_from_model( + cls, datamodel: DeviceEnumerationDataModel + ) -> "DeviceEnumerationAnalyzerArgs": + """build analyzer args from data model + + Args: + datamodel (DeviceEnumerationDataModel): data model for plugin + + Returns: + DeviceEnumerationAnalyzerArgs: instance of analyzer args class + """ + return cls( + cpu_count=[datamodel.cpu_count] if datamodel.cpu_count is not None else None, + gpu_count=[datamodel.gpu_count] if datamodel.gpu_count is not None else None, + vf_count=[datamodel.vf_count] if datamodel.vf_count is not None else None, + ) diff --git a/nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py b/nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py new file mode 100644 index 00000000..7cf39335 --- /dev/null +++ b/nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py @@ -0,0 +1,81 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Optional + +from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus +from nodescraper.interfaces import DataAnalyzer +from nodescraper.models import TaskResult + +from .analyzer_args import DeviceEnumerationAnalyzerArgs +from .deviceenumdata import DeviceEnumerationDataModel + + +class DeviceEnumerationAnalyzer( + DataAnalyzer[DeviceEnumerationDataModel, DeviceEnumerationAnalyzerArgs] +): + """Check Device Enumeration matches expected cpu and gpu count + supported by all OSs, SKUs, and platforms.""" + + DATA_MODEL = DeviceEnumerationDataModel + + def analyze_data( + self, data: DeviceEnumerationDataModel, args: Optional[DeviceEnumerationAnalyzerArgs] = None + ) -> TaskResult: + + if args is None: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = ( + "Expected Device Enumeration data not provided, skipping analysis." + ) + return self.result + + checks = {} + if args.cpu_count is not None and args.cpu_count != []: + checks["cpu_count"] = args.cpu_count + if args.gpu_count is not None and args.gpu_count != []: + checks["gpu_count"] = args.gpu_count + if args.vf_count is not None and args.vf_count != []: + checks["vf_count"] = args.vf_count + + self.result.message = "" + for check, accepted_counts in checks.items(): + actual_count = getattr(data, check) + if actual_count not in accepted_counts: + message = f"Expected {check} in {accepted_counts}, but got {actual_count}. 
" + self.result.message += message + self.result.status = ExecutionStatus.ERROR + self._log_event( + category=EventCategory.PLATFORM, + description=message, + data={check: actual_count}, + priority=EventPriority.CRITICAL, + console_log=True, + ) + if self.result.message == "": + self.result.status = ExecutionStatus.OK + self.result.message = f"Device Enumeration validated on {checks.keys()}." + + return self.result diff --git a/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py b/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py new file mode 100644 index 00000000..82a82f91 --- /dev/null +++ b/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py @@ -0,0 +1,176 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from typing import Optional + +from nodescraper.base import InBandDataCollector +from nodescraper.connection.inband.inband import CommandArtifact, TextFileArtifact +from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily +from nodescraper.models import TaskResult + +from .deviceenumdata import DeviceEnumerationDataModel + + +class DeviceEnumerationCollector(InBandDataCollector[DeviceEnumerationDataModel, None]): + """Collect CPU and GPU count""" + + DATA_MODEL = DeviceEnumerationDataModel + + CMD_GPU_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'VGA\\|Display\\|3D' | wc -l" + CMD_VF_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l" + CMD_LSCPU_LINUX = "lscpu" + CMD_LSHW_LINUX = "lshw" + + CMD_CPU_COUNT_WINDOWS = ( + 'powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"' + ) + CMD_GPU_COUNT_WINDOWS = 'powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"' + CMD_VF_COUNT_WINDOWS = ( + 'powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count"' + ) + + def _warning( + self, + description: str, + command: CommandArtifact, + category: EventCategory = EventCategory.PLATFORM, + ): + self._log_event( + category=category, + description=description, + data={ + "command": command.command, + "exit_code": command.exit_code, + "stderr": command.stderr, + }, + priority=EventPriority.WARNING, + ) + + def collect_data(self, args=None) -> tuple[TaskResult, Optional[DeviceEnumerationDataModel]]: + """ + Read CPU and GPU count + On Linux, use lscpu and lspci + On Windows, use WMI and hyper-v cmdlets + """ + if self.system_info.os_family == OSFamily.LINUX: + lscpu_res = self._run_sut_cmd(self.CMD_LSCPU_LINUX, log_artifact=False) + + # Count all AMD GPUs + vendor_id = format(self.system_info.vendorid_ep, "x") + gpu_count_res = self._run_sut_cmd( + 
self.CMD_GPU_COUNT_LINUX.format(vendorid_ep=vendor_id) + ) + + # Count AMD Virtual Functions + vf_count_res = self._run_sut_cmd(self.CMD_VF_COUNT_LINUX.format(vendorid_ep=vendor_id)) + + # Collect lshw output + lshw_res = self._run_sut_cmd(self.CMD_LSHW_LINUX, sudo=True, log_artifact=False) + else: + cpu_count_res = self._run_sut_cmd(self.CMD_CPU_COUNT_WINDOWS) + gpu_count_res = self._run_sut_cmd(self.CMD_GPU_COUNT_WINDOWS) + vf_count_res = self._run_sut_cmd(self.CMD_VF_COUNT_WINDOWS) + + device_enum = DeviceEnumerationDataModel() + + if self.system_info.os_family == OSFamily.LINUX: + if lscpu_res.exit_code == 0 and lscpu_res.stdout: + # Extract socket count from lscpu output + for line in lscpu_res.stdout.splitlines(): + if line.startswith("Socket(s):"): + try: + device_enum.cpu_count = int(line.split(":")[1].strip()) + break + except (ValueError, IndexError): + self._warning( + description="Cannot parse CPU count from lscpu output", + command=lscpu_res, + ) + device_enum.lscpu_output = lscpu_res.stdout + self._log_event( + category=EventCategory.PLATFORM, + description="Collected lscpu output", + priority=EventPriority.INFO, + ) + else: + self._warning(description="Cannot collect lscpu output", command=lscpu_res) + else: + if cpu_count_res.exit_code == 0: + device_enum.cpu_count = int(cpu_count_res.stdout) + else: + self._warning(description="Cannot determine CPU count", command=cpu_count_res) + + if gpu_count_res.exit_code == 0: + device_enum.gpu_count = int(gpu_count_res.stdout) + else: + self._warning(description="Cannot determine GPU count", command=gpu_count_res) + + if vf_count_res.exit_code == 0: + device_enum.vf_count = int(vf_count_res.stdout) + else: + self._warning( + description="Cannot determine VF count", + command=vf_count_res, + category=EventCategory.SW_DRIVER, + ) + + # Collect lshw output on Linux + if self.system_info.os_family == OSFamily.LINUX: + if lshw_res.exit_code == 0 and lshw_res.stdout: + device_enum.lshw_output = lshw_res.stdout + 
self.result.artifacts.append( + TextFileArtifact(filename="lshw.txt", contents=lshw_res.stdout) + ) + self._log_event( + category=EventCategory.PLATFORM, + description="Collected lshw output", + priority=EventPriority.INFO, + ) + else: + self._warning(description="Cannot collect lshw output", command=lshw_res) + + if device_enum.cpu_count or device_enum.gpu_count or device_enum.vf_count: + log_data = device_enum.model_dump( + exclude_none=True, + exclude={"lscpu_output", "lshw_output", "task_name", "task_type", "parent"}, + ) + self._log_event( + category=EventCategory.PLATFORM, + description=f"Counted {device_enum.cpu_count} CPUs, {device_enum.gpu_count} GPUs, {device_enum.vf_count} VFs", + data=log_data, + priority=EventPriority.INFO, + ) + self.result.message = f"Device Enumeration: {log_data}" + self.result.status = ExecutionStatus.OK + return self.result, device_enum + else: + self.result.message = "Device Enumeration info not found" + self.result.status = ExecutionStatus.EXECUTION_FAILURE + self._log_event( + category=EventCategory.SW_DRIVER, + description=self.result.message, + priority=EventPriority.CRITICAL, + ) + return self.result, None diff --git a/nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py b/nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py new file mode 100644 index 00000000..baf2aa2d --- /dev/null +++ b/nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py @@ -0,0 +1,45 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from nodescraper.base import InBandDataPlugin + +from .analyzer_args import DeviceEnumerationAnalyzerArgs +from .device_enumeration_analyzer import DeviceEnumerationAnalyzer +from .device_enumeration_collector import DeviceEnumerationCollector +from .deviceenumdata import DeviceEnumerationDataModel + + +class DeviceEnumerationPlugin( + InBandDataPlugin[DeviceEnumerationDataModel, None, DeviceEnumerationAnalyzerArgs] +): + """Plugin for collection and analysis of Device Enumeration data""" + + DATA_MODEL = DeviceEnumerationDataModel + + COLLECTOR = DeviceEnumerationCollector + + ANALYZER = DeviceEnumerationAnalyzer + + ANALYZER_ARGS = DeviceEnumerationAnalyzerArgs diff --git a/nodescraper/plugins/inband/device_enumeration/deviceenumdata.py b/nodescraper/plugins/inband/device_enumeration/deviceenumdata.py new file mode 100644 index 00000000..bef13492 --- /dev/null +++ b/nodescraper/plugins/inband/device_enumeration/deviceenumdata.py @@ -0,0 +1,36 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Optional + +from nodescraper.models import DataModel + + +class DeviceEnumerationDataModel(DataModel): + cpu_count: Optional[int] = None + gpu_count: Optional[int] = None + vf_count: Optional[int] = None + lscpu_output: Optional[str] = None + lshw_output: Optional[str] = None diff --git a/nodescraper/plugins/inband/dimm/dimm_collector.py b/nodescraper/plugins/inband/dimm/dimm_collector.py index 167913b5..a3ee84ee 100644 --- a/nodescraper/plugins/inband/dimm/dimm_collector.py +++ b/nodescraper/plugins/inband/dimm/dimm_collector.py @@ -26,6 +26,7 @@ from typing import Optional from nodescraper.base import InBandDataCollector +from nodescraper.connection.inband import TextFileArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult @@ -40,6 +41,7 @@ class DimmCollector(InBandDataCollector[DimmDataModel, DimmCollectorArgs]): CMD_WINDOWS = "wmic memorychip get Capacity" CMD = """sh -c 'dmidecode -t 17 | tr -s " " | grep -v "Volatile\\|None\\|Module" | grep Size' 2>/dev/null""" + CMD_DMIDECODE_FULL = "dmidecode" def collect_data( self, @@ -72,6 +74,25 @@ def collect_data( self.result.message = "Skipping sudo plugin" self.result.status = ExecutionStatus.NOT_RAN return self.result, None + + # Collect full dmidecode output as artifact + dmidecode_full_res = self._run_sut_cmd(self.CMD_DMIDECODE_FULL, sudo=True) + if dmidecode_full_res.exit_code == 0 and dmidecode_full_res.stdout: + self.result.artifacts.append( + TextFileArtifact(filename="dmidecode.txt", contents=dmidecode_full_res.stdout) + ) + else: + self._log_event( + 
category=EventCategory.OS, + description="Could not collect full dmidecode output", + data={ + "command": dmidecode_full_res.command, + "exit_code": dmidecode_full_res.exit_code, + "stderr": dmidecode_full_res.stderr, + }, + priority=EventPriority.WARNING, + ) + res = self._run_sut_cmd(self.CMD, sudo=True) if res.exit_code == 0: total = 0 diff --git a/nodescraper/plugins/inband/dmesg/collector_args.py b/nodescraper/plugins/inband/dmesg/collector_args.py index a2313c54..22d85f17 100644 --- a/nodescraper/plugins/inband/dmesg/collector_args.py +++ b/nodescraper/plugins/inband/dmesg/collector_args.py @@ -36,3 +36,4 @@ class DmesgCollectorArgs(CollectorArgs): collect_rotated_logs: bool = False skip_sudo: bool = False + log_dmesg_data: bool = True diff --git a/nodescraper/plugins/inband/dmesg/dmesg_collector.py b/nodescraper/plugins/inband/dmesg/dmesg_collector.py index 5e7148f1..c280d7d2 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_collector.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_collector.py @@ -155,7 +155,9 @@ def collect_data( self._collect_dmesg_rotations() if dmesg_content: - dmesg_data = DmesgData(dmesg_content=dmesg_content) + dmesg_data = DmesgData( + dmesg_content=dmesg_content, skip_log_file=not args.log_dmesg_data + ) self.result.message = "Dmesg data collected" return self.result, dmesg_data diff --git a/nodescraper/plugins/inband/dmesg/dmesgdata.py b/nodescraper/plugins/inband/dmesg/dmesgdata.py index 541b3ea0..26c7f9f3 100644 --- a/nodescraper/plugins/inband/dmesg/dmesgdata.py +++ b/nodescraper/plugins/inband/dmesg/dmesgdata.py @@ -35,6 +35,7 @@ class DmesgData(DataModel): """Data model for in band dmesg log""" dmesg_content: str + skip_log_file: bool = False @classmethod def get_new_dmesg_lines(cls, current_dmesg: str, new_dmesg: str) -> str: @@ -83,6 +84,8 @@ def log_model(self, log_path: str): Args: log_path (str): log path """ + if self.skip_log_file: + return log_name = os.path.join(log_path, get_unique_filename(log_path, "dmesg.log")) 
with open(log_name, "w", encoding="utf-8") as log_file: log_file.write(self.dmesg_content) diff --git a/nodescraper/plugins/inband/journal/collector_args.py b/nodescraper/plugins/inband/journal/collector_args.py new file mode 100644 index 00000000..583c94ab --- /dev/null +++ b/nodescraper/plugins/inband/journal/collector_args.py @@ -0,0 +1,33 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +from typing import Optional + +from nodescraper.models import CollectorArgs + + +class JournalCollectorArgs(CollectorArgs): + boot: Optional[int] = None diff --git a/nodescraper/plugins/inband/journal/journal_collector.py b/nodescraper/plugins/inband/journal/journal_collector.py index ecf48e0a..6b41dcc1 100644 --- a/nodescraper/plugins/inband/journal/journal_collector.py +++ b/nodescraper/plugins/inband/journal/journal_collector.py @@ -25,27 +25,50 @@ ############################################################################### from typing import Optional +from pydantic import ValidationError + from nodescraper.base import InBandDataCollector from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult +from nodescraper.utils import get_exception_details +from .collector_args import JournalCollectorArgs from .journaldata import JournalData -class JournalCollector(InBandDataCollector[JournalData, None]): +class JournalCollector(InBandDataCollector[JournalData, JournalCollectorArgs]): """Read journal log via journalctl.""" SUPPORTED_OS_FAMILY = {OSFamily.LINUX} DATA_MODEL = JournalData CMD = "journalctl --no-pager --system --output=short-iso" - def _read_with_journalctl(self): + def _read_with_journalctl(self, args: Optional[JournalCollectorArgs] = None): """Read journal logs using journalctl Returns: str|None: system journal read """ - res = self._run_sut_cmd(self.CMD, sudo=True, log_artifact=False, strip=False) + + cmd = "journalctl --no-pager --system --output=short-iso" + try: + # safe check for args.boot + if args is not None and getattr(args, "boot", None): + cmd = f"journalctl --no-pager -b {args.boot} --system --output=short-iso" + + res = self._run_sut_cmd(cmd, sudo=True, log_artifact=False, strip=False) + + except ValidationError as val_err: + self._log_event( + category=EventCategory.OS, + 
description="Exception while running journalctl", + data=get_exception_details(val_err), + priority=EventPriority.ERROR, + console_log=True, + ) + self.result.message = "Could not read journalctl data" + self.result.status = ExecutionStatus.ERROR + return None if res.exit_code != 0: self._log_event( @@ -61,16 +84,22 @@ def _read_with_journalctl(self): return res.stdout - def collect_data(self, args=None) -> tuple[TaskResult, Optional[JournalData]]: + def collect_data( + self, + args: Optional[JournalCollectorArgs] = None, + ) -> tuple[TaskResult, Optional[JournalData]]: """Collect journal logs Args: args (_type_, optional): Collection args. Defaults to None. Returns: - tuple[TaskResult, Optional[JournalData, None]]: Tuple of results and data model or none. + tuple[TaskResult, Optional[JournalData]]: Tuple of results and data model or none. """ - journal_log = self._read_with_journalctl() + if args is None: + args = JournalCollectorArgs() + + journal_log = self._read_with_journalctl(args) if journal_log: data = JournalData(journal_log=journal_log) self.result.message = self.result.message or "Journal data collected" diff --git a/nodescraper/plugins/inband/journal/journal_plugin.py b/nodescraper/plugins/inband/journal/journal_plugin.py index 72ccca5a..a3044fbe 100644 --- a/nodescraper/plugins/inband/journal/journal_plugin.py +++ b/nodescraper/plugins/inband/journal/journal_plugin.py @@ -25,13 +25,16 @@ ############################################################################### from nodescraper.base import InBandDataPlugin +from .collector_args import JournalCollectorArgs from .journal_collector import JournalCollector from .journaldata import JournalData -class JournalPlugin(InBandDataPlugin[JournalData, None, None]): +class JournalPlugin(InBandDataPlugin[JournalData, JournalCollectorArgs, None]): """Plugin for collection of journal data""" DATA_MODEL = JournalData COLLECTOR = JournalCollector + + COLLECTOR_ARGS = JournalCollectorArgs diff --git 
a/nodescraper/plugins/inband/kernel/kernel_collector.py b/nodescraper/plugins/inband/kernel/kernel_collector.py index e84973ff..a9ac81ad 100644 --- a/nodescraper/plugins/inband/kernel/kernel_collector.py +++ b/nodescraper/plugins/inband/kernel/kernel_collector.py @@ -23,6 +23,7 @@ # SOFTWARE. # ############################################################################### +import re from typing import Optional from nodescraper.base import InBandDataCollector @@ -37,7 +38,31 @@ class KernelCollector(InBandDataCollector[KernelDataModel, None]): DATA_MODEL = KernelDataModel CMD_WINDOWS = "wmic os get Version /Value" - CMD = "sh -c 'uname -r'" + CMD = "sh -c 'uname -a'" + + def _parse_kernel_version(self, uname_a: str) -> Optional[str]: + """Extract the kernel release from `uname -a` output. + + Args: + uname_a (str): The full output string from the `uname -a` command. + + Returns: + Optional[str]: The parsed kernel release (e.g., "5.13.0-30-generic") + if found, otherwise None. + """ + if not uname_a: + return None + + result = uname_a.strip().split() + if len(result) >= 3: + return result[2] + + # if some change in output look for a version-like string (e.g. 
4.18.0-553.el8_10.x86_64) + match = re.search(r"\d+\.\d+\.\d+[\w\-\.]*", uname_a) + if match: + return match.group(0) + + return None def collect_data( self, @@ -51,16 +76,28 @@ def collect_data( """ kernel = None + kernel_info = None + if self.system_info.os_family == OSFamily.WINDOWS: res = self._run_sut_cmd(self.CMD_WINDOWS) if res.exit_code == 0: + kernel_info = res.stdout kernel = [line for line in res.stdout.splitlines() if "Version=" in line][0].split( "=" )[1] else: res = self._run_sut_cmd(self.CMD) if res.exit_code == 0: - kernel = res.stdout + kernel_info = res.stdout + kernel = self._parse_kernel_version(kernel_info) + if not kernel: + self._log_event( + category=EventCategory.OS, + description="Could not extract kernel version from 'uname -a'", + data={"command": res.command, "exit_code": res.exit_code}, + priority=EventPriority.ERROR, + console_log=True, + ) if res.exit_code != 0: self._log_event( @@ -71,8 +108,9 @@ def collect_data( console_log=True, ) - if kernel: - kernel_data = KernelDataModel(kernel_version=kernel) + if kernel_info and kernel: + + kernel_data = KernelDataModel(kernel_info=kernel_info, kernel_version=kernel) self._log_event( category="KERNEL_READ", description="Kernel version read", @@ -82,6 +120,10 @@ def collect_data( else: kernel_data = None - self.result.message = f"Kernel: {kernel}" if kernel else "Kernel not found" - self.result.status = ExecutionStatus.OK if kernel else ExecutionStatus.ERROR + self.result.message = ( + "Kernel not found" + if not kernel_info + else f"Kernel info: {kernel_info} | Kernel version: {kernel if kernel else 'Kernel version not found'}" + ) + self.result.status = ExecutionStatus.OK if kernel_info else ExecutionStatus.ERROR return self.result, kernel_data diff --git a/nodescraper/plugins/inband/kernel/kerneldata.py b/nodescraper/plugins/inband/kernel/kerneldata.py index edaa30e5..45f521eb 100644 --- a/nodescraper/plugins/inband/kernel/kerneldata.py +++ 
b/nodescraper/plugins/inband/kernel/kerneldata.py @@ -23,8 +23,10 @@ # SOFTWARE. # ############################################################################### + from nodescraper.models import DataModel class KernelDataModel(DataModel): + kernel_info: str kernel_version: str diff --git a/nodescraper/plugins/inband/kernel_module/kernel_module_collector.py b/nodescraper/plugins/inband/kernel_module/kernel_module_collector.py index 48d4de91..5eb9ff2e 100644 --- a/nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +++ b/nodescraper/plugins/inband/kernel_module/kernel_module_collector.py @@ -26,11 +26,12 @@ from typing import Optional from nodescraper.base import InBandDataCollector +from nodescraper.connection.inband import TextFileArtifact from nodescraper.connection.inband.inband import CommandArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult -from .kernel_module_data import KernelModuleDataModel +from .kernel_module_data import KernelModuleDataModel, ModuleInfo, ModuleParameter class KernelModuleCollector(InBandDataCollector[KernelModuleDataModel, None]): @@ -39,6 +40,7 @@ class KernelModuleCollector(InBandDataCollector[KernelModuleDataModel, None]): DATA_MODEL = KernelModuleDataModel CMD_WINDOWS = "wmic os get Version /Value" CMD = "cat /proc/modules" + CMD_MODINFO_AMDGPU = "modinfo amdgpu" def parse_proc_modules(self, output: dict) -> dict: """Parse command output and return dict of modules @@ -60,6 +62,77 @@ def parse_proc_modules(self, output: dict) -> dict: } return modules + def _parse_modinfo(self, output: str) -> Optional[ModuleInfo]: + """Parse modinfo command output into structured ModuleInfo + + Args: + output (str): modinfo command output + + Returns: + Optional[ModuleInfo]: parsed module information or None if parsing fails + """ + if not output or not output.strip(): + return None + + module_info = ModuleInfo() + + for line in output.splitlines(): 
+ line = line.strip() + if not line or ":" not in line: + continue + + field, _, value = line.partition(":") + field = field.strip() + value = value.strip() + + if field == "filename": + module_info.filename = value + elif field == "version": + module_info.version = value + elif field == "license": + module_info.license = value + elif field == "description": + module_info.description = value + elif field == "author": + module_info.author.append(value) + elif field == "firmware": + module_info.firmware.append(value) + elif field == "srcversion": + module_info.srcversion = value + elif field == "depends": + if value: + module_info.depends = [dep.strip() for dep in value.split(",") if dep.strip()] + elif field == "name": + module_info.name = value + elif field == "vermagic": + module_info.vermagic = value + elif field == "sig_id": + module_info.sig_id = value + elif field == "signer": + module_info.signer = value + elif field == "sig_key": + module_info.sig_key = value + elif field == "sig_hashalgo": + module_info.sig_hashalgo = value + elif field == "parm": + param_name, param_desc = value.split(":", 1) if ":" in value else (value, "") + param_name = param_name.strip() + param_desc = param_desc.strip() + + param_type = None + if param_desc and "(" in param_desc and ")" in param_desc: + type_start = param_desc.rfind("(") + type_end = param_desc.rfind(")") + if type_start < type_end: + param_type = param_desc[type_start + 1 : type_end].strip() + param_desc = param_desc[:type_start].strip() + + module_info.parm.append( + ModuleParameter(name=param_name, type=param_type, description=param_desc) + ) + + return module_info + def get_module_parameters(self, module_name: str) -> dict: """Fetches parameter names and values for a given kernel module using _run_sut_cmd @@ -143,8 +216,39 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[KernelModuleData else: kernel_modules = self.collect_all_module_info() + amdgpu_modinfo = None + if self.system_info.os_family 
!= OSFamily.WINDOWS: + # Collect and parse modinfo amdgpu output + modinfo_res = self._run_sut_cmd(self.CMD_MODINFO_AMDGPU) + if modinfo_res.exit_code == 0 and modinfo_res.stdout: + amdgpu_modinfo = self._parse_modinfo(modinfo_res.stdout) + if amdgpu_modinfo: + self.result.artifacts.append( + TextFileArtifact(filename="modinfo_amdgpu.txt", contents=modinfo_res.stdout) + ) + else: + self._log_event( + category=EventCategory.OS, + description="Could not parse modinfo amdgpu output", + data={"command": modinfo_res.command}, + priority=EventPriority.WARNING, + ) + else: + self._log_event( + category=EventCategory.OS, + description="Could not collect modinfo amdgpu output", + data={ + "command": modinfo_res.command, + "exit_code": modinfo_res.exit_code, + "stderr": modinfo_res.stderr, + }, + priority=EventPriority.WARNING, + ) + if kernel_modules: - km_data = KernelModuleDataModel(kernel_modules=kernel_modules) + km_data = KernelModuleDataModel( + kernel_modules=kernel_modules, amdgpu_modinfo=amdgpu_modinfo + ) self._log_event( category="KERNEL_READ", description="Kernel modules read", diff --git a/nodescraper/plugins/inband/kernel_module/kernel_module_data.py b/nodescraper/plugins/inband/kernel_module/kernel_module_data.py index f9f91b61..8483a14f 100644 --- a/nodescraper/plugins/inband/kernel_module/kernel_module_data.py +++ b/nodescraper/plugins/inband/kernel_module/kernel_module_data.py @@ -24,8 +24,37 @@ # ############################################################################### +from typing import Optional + +from pydantic import BaseModel, Field + from nodescraper.models import DataModel +class ModuleParameter(BaseModel): + name: str + type: Optional[str] = None + description: Optional[str] = None + + +class ModuleInfo(BaseModel): + filename: Optional[str] = None + version: Optional[str] = None + license: Optional[str] = None + description: Optional[str] = None + author: list[str] = Field(default_factory=list) + firmware: list[str] = 
Field(default_factory=list) + srcversion: Optional[str] = None + depends: list[str] = Field(default_factory=list) + name: Optional[str] = None + vermagic: Optional[str] = None + sig_id: Optional[str] = None + signer: Optional[str] = None + sig_key: Optional[str] = None + sig_hashalgo: Optional[str] = None + parm: list[ModuleParameter] = Field(default_factory=list) + + class KernelModuleDataModel(DataModel): kernel_modules: dict + amdgpu_modinfo: Optional[ModuleInfo] = None diff --git a/nodescraper/plugins/inband/memory/analyzer_args.py b/nodescraper/plugins/inband/memory/analyzer_args.py index cc5f0ef4..968641ca 100644 --- a/nodescraper/plugins/inband/memory/analyzer_args.py +++ b/nodescraper/plugins/inband/memory/analyzer_args.py @@ -23,9 +23,23 @@ # SOFTWARE. # ############################################################################### -from pydantic import BaseModel +from nodescraper.models.analyzerargs import AnalyzerArgs +from .memorydata import MemoryDataModel -class MemoryAnalyzerArgs(BaseModel): + +class MemoryAnalyzerArgs(AnalyzerArgs): ratio: float = 0.66 memory_threshold: str = "30Gi" + + @classmethod + def build_from_model(cls, datamodel: MemoryDataModel) -> "MemoryAnalyzerArgs": + """build analyzer args from data model + + Args: + datamodel (MemoryDataModel): data model for plugin + + Returns: + MemoryAnalyzerArgs: instance of analyzer args class + """ + return cls(memory_threshold=datamodel.mem_total) diff --git a/nodescraper/plugins/inband/memory/memory_collector.py b/nodescraper/plugins/inband/memory/memory_collector.py index 7f768c65..43dd39ad 100644 --- a/nodescraper/plugins/inband/memory/memory_collector.py +++ b/nodescraper/plugins/inband/memory/memory_collector.py @@ -30,7 +30,15 @@ from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult -from .memorydata import MemoryDataModel +from .memorydata import ( + LsmemData, + MemoryBlock, + MemoryDataModel, + MemorySummary, + 
NumaDistance, + NumaNode, + NumaTopology, +) class MemoryCollector(InBandDataCollector[MemoryDataModel, None]): @@ -42,6 +50,8 @@ class MemoryCollector(InBandDataCollector[MemoryDataModel, None]): "wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value" ) CMD = "free -b" + CMD_LSMEM = "lsmem" + CMD_NUMACTL = "numactl -H" def collect_data(self, args=None) -> tuple[TaskResult, Optional[MemoryDataModel]]: """ @@ -78,15 +88,91 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[MemoryDataModel] console_log=True, ) + lsmem_data = None + if self.system_info.os_family != OSFamily.WINDOWS: + lsmem_cmd = self._run_sut_cmd(self.CMD_LSMEM) + if lsmem_cmd.exit_code == 0: + lsmem_data = self._parse_lsmem_output(lsmem_cmd.stdout) + if lsmem_data: + self._log_event( + category=EventCategory.OS, + description="lsmem output collected", + data={ + "memory_blocks": len(lsmem_data.memory_blocks), + "total_online_memory": lsmem_data.summary.total_online_memory, + }, + priority=EventPriority.INFO, + ) + else: + self._log_event( + category=EventCategory.OS, + description="Failed to parse lsmem output", + priority=EventPriority.WARNING, + console_log=False, + ) + else: + self._log_event( + category=EventCategory.OS, + description="Error running lsmem command", + data={ + "command": lsmem_cmd.command, + "exit_code": lsmem_cmd.exit_code, + "stderr": lsmem_cmd.stderr, + }, + priority=EventPriority.WARNING, + console_log=False, + ) + + # Collect NUMA topology information + numa_topology = None + if self.system_info.os_family != OSFamily.WINDOWS: + numactl_cmd = self._run_sut_cmd(self.CMD_NUMACTL) + if numactl_cmd.exit_code == 0: + numa_topology = self._parse_numactl_hardware(numactl_cmd.stdout) + if numa_topology: + self._log_event( + category=EventCategory.MEMORY, + description="NUMA topology collected", + data={ + "available_nodes": numa_topology.available_nodes, + "node_count": len(numa_topology.nodes), + }, + priority=EventPriority.INFO, + 
) + else: + self._log_event( + category=EventCategory.MEMORY, + description="Failed to parse numactl output", + priority=EventPriority.WARNING, + console_log=False, + ) + else: + self._log_event( + category=EventCategory.MEMORY, + description="Error running numactl command", + data={ + "command": numactl_cmd.command, + "exit_code": numactl_cmd.exit_code, + "stderr": numactl_cmd.stderr, + }, + priority=EventPriority.WARNING, + console_log=False, + ) + if mem_free and mem_total: - mem_data = MemoryDataModel(mem_free=mem_free, mem_total=mem_total) + mem_data = MemoryDataModel( + mem_free=mem_free, + mem_total=mem_total, + lsmem_data=lsmem_data, + numa_topology=numa_topology, + ) self._log_event( category=EventCategory.OS, description="Free and total memory read", data=mem_data.model_dump(), priority=EventPriority.INFO, ) - self.result.message = f"Memory: {mem_data.model_dump()}" + self.result.message = f"Memory: mem_free={mem_free}, mem_total={mem_total}" self.result.status = ExecutionStatus.OK else: mem_data = None @@ -94,3 +180,151 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[MemoryDataModel] self.result.status = ExecutionStatus.ERROR return self.result, mem_data + + def _parse_lsmem_output(self, output: str): + """ + Parse lsmem command output into a structured LsmemData object. 
+ + Args: + output: Raw stdout from lsmem command + + Returns: + LsmemData: Parsed lsmem data with memory blocks and summary information + """ + lines = output.strip().split("\n") + memory_blocks = [] + summary_dict = {} + + for line in lines: + line = line.strip() + if not line: + continue + + # Parse mem range lines (sample: "0x0000000000000000-0x000000007fffffff 2G online yes 0-15") + if line.startswith("0x"): + parts = line.split() + if len(parts) >= 4: + memory_blocks.append( + MemoryBlock( + range=parts[0], + size=parts[1], + state=parts[2], + removable=parts[3] if len(parts) > 3 else None, + block=parts[4] if len(parts) > 4 else None, + ) + ) + # Parse summary lines + elif ":" in line: + key, value = line.split(":", 1) + summary_dict[key.strip().lower().replace(" ", "_")] = value.strip() + + summary = MemorySummary( + memory_block_size=summary_dict.get("memory_block_size"), + total_online_memory=summary_dict.get("total_online_memory"), + total_offline_memory=summary_dict.get("total_offline_memory"), + ) + + if not memory_blocks: + return None + + return LsmemData(memory_blocks=memory_blocks, summary=summary) + + def _parse_numactl_hardware(self, output: str): + """ + Parse 'numactl -H' output into NumaTopology structure. 
+ + Args: + output: Raw stdout from numactl -H command + + Returns: + NumaTopology object or None if parsing fails + """ + lines = output.strip().split("\n") + available_nodes = [] + nodes = [] + distances = [] + distance_matrix = {} + + current_section = None + + for line in lines: + line = line.strip() + if not line: + continue + + # Parse available nodes line + if line.startswith("available:"): + match = re.search(r"available:\s*(\d+)\s+nodes?\s*\(([^)]+)\)", line) + if match: + node_range = match.group(2) + if "-" in node_range: + start, end = node_range.split("-") + available_nodes = list(range(int(start), int(end) + 1)) + else: + available_nodes = [int(x.strip()) for x in node_range.split()] + + # Parse node CPU line + elif line.startswith("node") and "cpus:" in line: + match = re.search(r"node\s+(\d+)\s+cpus:\s*(.+)", line) + if match: + node_id = int(match.group(1)) + cpu_list_str = match.group(2).strip() + if cpu_list_str: + cpus = [int(x) for x in cpu_list_str.split()] + else: + cpus = [] + nodes.append(NumaNode(node_id=node_id, cpus=cpus)) + + # Parse node memory size + elif line.startswith("node") and "size:" in line: + match = re.search(r"node\s+(\d+)\s+size:\s*(\d+)\s*MB", line) + if match: + node_id = int(match.group(1)) + size_mb = int(match.group(2)) + # Find existing node and update + for node in nodes: + if node.node_id == node_id: + node.memory_size_mb = size_mb + break + + # Parse node free memory + elif line.startswith("node") and "free:" in line: + match = re.search(r"node\s+(\d+)\s+free:\s*(\d+)\s*MB", line) + if match: + node_id = int(match.group(1)) + free_mb = int(match.group(2)) + # Find existing node and update + for node in nodes: + if node.node_id == node_id: + node.memory_free_mb = free_mb + break + + # Parse distance matrix + elif line.startswith("node distances:"): + current_section = "distances" + + elif current_section == "distances": + if line.startswith("node") and ":" not in line: + continue + elif ":" in line: + parts = 
line.split(":") + if len(parts) == 2: + from_node = int(parts[0].strip()) + dist_values = [int(x) for x in parts[1].split()] + + distance_matrix[from_node] = {} + for to_node, dist in enumerate(dist_values): + distance_matrix[from_node][to_node] = dist + distances.append( + NumaDistance(from_node=from_node, to_node=to_node, distance=dist) + ) + + if not nodes: + return None + + return NumaTopology( + available_nodes=available_nodes if available_nodes else [n.node_id for n in nodes], + nodes=nodes, + distances=distances, + distance_matrix=distance_matrix if distance_matrix else None, + ) diff --git a/nodescraper/plugins/inband/memory/memory_plugin.py b/nodescraper/plugins/inband/memory/memory_plugin.py index 84a04de3..9162bd0a 100644 --- a/nodescraper/plugins/inband/memory/memory_plugin.py +++ b/nodescraper/plugins/inband/memory/memory_plugin.py @@ -39,3 +39,5 @@ class MemoryPlugin(InBandDataPlugin[MemoryDataModel, None, MemoryAnalyzerArgs]): COLLECTOR = MemoryCollector ANALYZER = MemoryAnalyzer + + ANALYZER_ARGS = MemoryAnalyzerArgs diff --git a/nodescraper/plugins/inband/memory/memorydata.py b/nodescraper/plugins/inband/memory/memorydata.py index f500ee2e..2687beaf 100644 --- a/nodescraper/plugins/inband/memory/memorydata.py +++ b/nodescraper/plugins/inband/memory/memorydata.py @@ -23,9 +23,68 @@ # SOFTWARE. 
# ############################################################################### +from typing import Optional + +from pydantic import BaseModel + from nodescraper.models import DataModel +class MemoryBlock(BaseModel): + """Memory block information from lsmem""" + + range: str + size: str + state: str + removable: Optional[str] = None + block: Optional[str] = None + + +class MemorySummary(BaseModel): + """Summary information from lsmem""" + + memory_block_size: Optional[str] = None + total_online_memory: Optional[str] = None + total_offline_memory: Optional[str] = None + + +class LsmemData(BaseModel): + """Complete lsmem output data""" + + memory_blocks: list[MemoryBlock] + summary: MemorySummary + + +class NumaNode(BaseModel): + """NUMA node information""" + + node_id: int + cpus: list[int] + memory_size_mb: Optional[int] = None + memory_free_mb: Optional[int] = None + + +class NumaDistance(BaseModel): + """Distance between two NUMA nodes""" + + from_node: int + to_node: int + distance: int + + +class NumaTopology(BaseModel): + """Complete NUMA topology from 'numactl --hardware'""" + + available_nodes: list[int] + nodes: list[NumaNode] + distances: list[NumaDistance] + distance_matrix: Optional[dict[int, dict[int, int]]] = None + + class MemoryDataModel(DataModel): + """Memory data model""" + mem_free: str mem_total: str + lsmem_data: Optional[LsmemData] = None + numa_topology: Optional[NumaTopology] = None diff --git a/nodescraper/plugins/inband/network/__init__.py b/nodescraper/plugins/inband/network/__init__.py new file mode 100644 index 00000000..b3119397 --- /dev/null +++ b/nodescraper/plugins/inband/network/__init__.py @@ -0,0 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .network_plugin import NetworkPlugin + +__all__ = ["NetworkPlugin"] diff --git a/nodescraper/plugins/inband/network/network_collector.py b/nodescraper/plugins/inband/network/network_collector.py new file mode 100644 index 00000000..0f96e7c8 --- /dev/null +++ b/nodescraper/plugins/inband/network/network_collector.py @@ -0,0 +1,579 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import re +from typing import Dict, List, Optional, Tuple + +from nodescraper.base import InBandDataCollector +from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus +from nodescraper.models import TaskResult + +from .networkdata import ( + EthtoolInfo, + IpAddress, + Neighbor, + NetworkDataModel, + NetworkInterface, + Route, + RoutingRule, +) + + +class NetworkCollector(InBandDataCollector[NetworkDataModel, None]): + """Collect network configuration details using ip command""" + + DATA_MODEL = NetworkDataModel + CMD_ADDR = "ip addr show" + CMD_ROUTE = "ip route show" + CMD_RULE = "ip rule show" + CMD_NEIGHBOR = "ip neighbor show" + CMD_ETHTOOL_TEMPLATE = "sudo ethtool {interface}" + + def _parse_ip_addr(self, output: str) -> List[NetworkInterface]: + """Parse 'ip addr show' output into NetworkInterface objects. + + Args: + output: Raw output from 'ip addr show' command + + Returns: + List of NetworkInterface objects + """ + interfaces = {} + current_interface = None + + for line in output.splitlines(): + # Check if this is an interface header line + # Format: 1: lo: mtu 65536 qdisc noqueue state UNKNOWN ... 
+ if re.match(r"^\d+:", line): + parts = line.split() + + # Extract interface index and name + idx_str = parts[0].rstrip(":") + try: + index = int(idx_str) + except ValueError: + index = None + + ifname = parts[1].rstrip(":") + current_interface = ifname + + # Extract flags + flags: List[str] = [] + if "<" in line: + flag_match = re.search(r"<([^>]+)>", line) + if flag_match: + flags = flag_match.group(1).split(",") + + # Extract other attributes + mtu = None + qdisc = None + state = None + + # Known keyword-value pairs + keyword_value_pairs = ["mtu", "qdisc", "state"] + + for i, part in enumerate(parts): + if part in keyword_value_pairs and i + 1 < len(parts): + if part == "mtu": + try: + mtu = int(parts[i + 1]) + except ValueError: + pass + elif part == "qdisc": + qdisc = parts[i + 1] + elif part == "state": + state = parts[i + 1] + + interfaces[ifname] = NetworkInterface( + name=ifname, + index=index, + state=state, + mtu=mtu, + qdisc=qdisc, + flags=flags, + ) + + # Check if this is a link line (contains MAC address) + # Format: link/ether 00:40:a6:96:d7:5a brd ff:ff:ff:ff:ff:ff + elif "link/" in line and current_interface: + parts = line.split() + if "link/ether" in parts: + idx = parts.index("link/ether") + if idx + 1 < len(parts): + interfaces[current_interface].mac_address = parts[idx + 1] + elif "link/loopback" in parts: + # Loopback interface + if len(parts) > 1: + interfaces[current_interface].mac_address = parts[1] + + # Check if this is an inet/inet6 address line + # Format: inet 10.228.152.67/22 brd 10.228.155.255 scope global noprefixroute enp129s0 + elif any(x in line for x in ["inet ", "inet6 "]) and current_interface: + parts = line.split() + + # Parse the IP address + family = None + address = None + prefix_len = None + scope = None + broadcast = None + + for i, part in enumerate(parts): + if part in ["inet", "inet6"]: + family = part + if i + 1 < len(parts): + addr_part = parts[i + 1] + if "/" in addr_part: + address, prefix = 
addr_part.split("/") + try: + prefix_len = int(prefix) + except ValueError: + pass + else: + address = addr_part + elif part == "scope" and i + 1 < len(parts): + scope = parts[i + 1] + elif part in ["brd", "broadcast"] and i + 1 < len(parts): + broadcast = parts[i + 1] + + if address and current_interface in interfaces: + ip_addr = IpAddress( + address=address, + prefix_len=prefix_len, + family=family, + scope=scope, + broadcast=broadcast, + label=current_interface, + ) + interfaces[current_interface].addresses.append(ip_addr) + + return list(interfaces.values()) + + def _parse_ip_route(self, output: str) -> List[Route]: + """Parse 'ip route show' output into Route objects. + + Args: + output: Raw output from 'ip route show' command + + Returns: + List of Route objects + """ + routes = [] + + for line in output.splitlines(): + line = line.strip() + if not line: + continue + + parts = line.split() + if not parts: + continue + + # First part is destination (can be "default" or a network) + destination = parts[0] + + route = Route(destination=destination) + + # Known keyword-value pairs + keyword_value_pairs = ["via", "dev", "proto", "scope", "metric", "src", "table"] + + # Parse route attributes + i = 1 + while i < len(parts): + if parts[i] in keyword_value_pairs and i + 1 < len(parts): + keyword = parts[i] + value = parts[i + 1] + + if keyword == "via": + route.gateway = value + elif keyword == "dev": + route.device = value + elif keyword == "proto": + route.protocol = value + elif keyword == "scope": + route.scope = value + elif keyword == "metric": + try: + route.metric = int(value) + except ValueError: + pass + elif keyword == "src": + route.source = value + elif keyword == "table": + route.table = value + i += 2 + else: + i += 1 + + routes.append(route) + + return routes + + def _parse_ip_rule(self, output: str) -> List[RoutingRule]: + """Parse 'ip rule show' output into RoutingRule objects. 
+ Example ip rule: 200: from 172.16.0.0/12 to 8.8.8.8 iif wlan0 oif eth0 fwmark 0x20 table vpn_table + + Args: + output: Raw output from 'ip rule show' command + + Returns: + List of RoutingRule objects + """ + rules = [] + + for line in output.splitlines(): + line = line.strip() + if not line: + continue + + parts = line.split() + if not parts: + continue + + # First part is priority followed by ":" + priority_str = parts[0].rstrip(":") + try: + priority = int(priority_str) + except ValueError: + continue + + rule = RoutingRule(priority=priority) + + # Parse rule attributes + i = 1 + while i < len(parts): + if parts[i] == "from" and i + 1 < len(parts): + if parts[i + 1] != "all": + rule.source = parts[i + 1] + i += 2 + elif parts[i] == "to" and i + 1 < len(parts): + if parts[i + 1] != "all": + rule.destination = parts[i + 1] + i += 2 + elif parts[i] in ["lookup", "table"] and i + 1 < len(parts): + rule.table = parts[i + 1] + if parts[i] == "lookup": + rule.action = "lookup" + i += 2 + elif parts[i] == "iif" and i + 1 < len(parts): + rule.iif = parts[i + 1] + i += 2 + elif parts[i] == "oif" and i + 1 < len(parts): + rule.oif = parts[i + 1] + i += 2 + elif parts[i] == "fwmark" and i + 1 < len(parts): + rule.fwmark = parts[i + 1] + i += 2 + elif parts[i] in ["unreachable", "prohibit", "blackhole"]: + rule.action = parts[i] + i += 1 + else: + i += 1 + + rules.append(rule) + + return rules + + def _parse_ip_neighbor(self, output: str) -> List[Neighbor]: + """Parse 'ip neighbor show' output into Neighbor objects. 
+ + Args: + output: Raw output from 'ip neighbor show' command + + Returns: + List of Neighbor objects + """ + neighbors = [] + + # Known keyword-value pairs (keyword takes next element as value) + keyword_value_pairs = ["dev", "lladdr", "nud", "vlan", "via"] + + for line in output.splitlines(): + line = line.strip() + if not line: + continue + + parts = line.split() + if not parts: + continue + + # First part is the IP address + ip_address = parts[0] + + neighbor = Neighbor(ip_address=ip_address) + + # Parse neighbor attributes + i = 1 + while i < len(parts): + current = parts[i] + + # Check for known keyword-value pairs + if current in keyword_value_pairs and i + 1 < len(parts): + if current == "dev": + neighbor.device = parts[i + 1] + elif current == "lladdr": + neighbor.mac_address = parts[i + 1] + # Other keyword-value pairs can be added here as needed + i += 2 + + # Check if it's a state (all uppercase, typically single word) + elif current.isupper() and current.isalpha(): + # States: REACHABLE, STALE, DELAY, PROBE, FAILED, INCOMPLETE, PERMANENT, NOARP + # Future states will also be captured + neighbor.state = current + i += 1 + + # Check if it looks like a MAC address (contains colons) + elif ":" in current and not current.startswith("http"): + # Already handled by lladdr, but in case it appears standalone + if not neighbor.mac_address: + neighbor.mac_address = current + i += 1 + + # Check if it looks like an IP address (has dots or is IPv6) + elif "." in current or ("::" in current): + # Skip IP addresses that might appear (already captured as first element) + i += 1 + + # Anything else that's a simple lowercase word is likely a flag + elif current.isalpha() and current.islower(): + # Flags: router, proxy, extern_learn, offload, managed, etc. 
+ # Captures both known and future flags + neighbor.flags.append(current) + i += 1 + + else: + # Unknown format, skip it + i += 1 + + neighbors.append(neighbor) + + return neighbors + + def _parse_ethtool(self, interface: str, output: str) -> EthtoolInfo: + """Parse 'ethtool ' output into EthtoolInfo object. + + Args: + interface: Name of the network interface + output: Raw output from 'ethtool ' command + + Returns: + EthtoolInfo object with parsed data + """ + ethtool_info = EthtoolInfo(interface=interface, raw_output=output) + + # Parse line by line + current_section = None + for line in output.splitlines(): + line_stripped = line.strip() + if not line_stripped: + continue + + # Detect sections (lines ending with colon and no tab prefix) + if line_stripped.endswith(":") and not line.startswith("\t"): + current_section = line_stripped.rstrip(":") + continue + + # Parse key-value pairs (lines with colon in the middle) + if ":" in line_stripped: + # Split on first colon + parts = line_stripped.split(":", 1) + if len(parts) == 2: + key = parts[0].strip() + value = parts[1].strip() + + # Store in settings dict + ethtool_info.settings[key] = value + + # Extract specific important fields + if key == "Speed": + ethtool_info.speed = value + elif key == "Duplex": + ethtool_info.duplex = value + elif key == "Port": + ethtool_info.port = value + elif key == "Auto-negotiation": + ethtool_info.auto_negotiation = value + elif key == "Link detected": + ethtool_info.link_detected = value + + # Parse supported/advertised link modes (typically indented list items) + elif current_section in ["Supported link modes", "Advertised link modes"]: + # These are typically list items, possibly with speeds like "10baseT/Half" + if line.startswith("\t") or line.startswith(" "): + mode = line_stripped + if current_section == "Supported link modes": + ethtool_info.supported_link_modes.append(mode) + elif current_section == "Advertised link modes": + 
ethtool_info.advertised_link_modes.append(mode) + + return ethtool_info + + def _collect_ethtool_info(self, interfaces: List[NetworkInterface]) -> Dict[str, EthtoolInfo]: + """Collect ethtool information for all network interfaces. + + Args: + interfaces: List of NetworkInterface objects to collect ethtool info for + + Returns: + Dictionary mapping interface name to EthtoolInfo + """ + ethtool_data = {} + + for iface in interfaces: + cmd = self.CMD_ETHTOOL_TEMPLATE.format(interface=iface.name) + res_ethtool = self._run_sut_cmd(cmd) + + if res_ethtool.exit_code == 0: + ethtool_info = self._parse_ethtool(iface.name, res_ethtool.stdout) + ethtool_data[iface.name] = ethtool_info + self._log_event( + category=EventCategory.NETWORK, + description=f"Collected ethtool info for interface: {iface.name}", + priority=EventPriority.INFO, + ) + else: + self._log_event( + category=EventCategory.NETWORK, + description=f"Error collecting ethtool info for interface: {iface.name}", + data={"command": res_ethtool.command, "exit_code": res_ethtool.exit_code}, + priority=EventPriority.WARNING, + ) + + return ethtool_data + + def collect_data( + self, + args=None, + ) -> Tuple[TaskResult, Optional[NetworkDataModel]]: + """Collect network configuration from the system. + + Returns: + Tuple[TaskResult, Optional[NetworkDataModel]]: tuple containing the task result + and an instance of NetworkDataModel or None if collection failed. 
+ """ + interfaces = [] + routes = [] + rules = [] + neighbors = [] + ethtool_data = {} + + # Collect interface/address information + res_addr = self._run_sut_cmd(self.CMD_ADDR) + if res_addr.exit_code == 0: + interfaces = self._parse_ip_addr(res_addr.stdout) + self._log_event( + category=EventCategory.NETWORK, + description=f"Collected {len(interfaces)} network interfaces", + priority=EventPriority.INFO, + ) + else: + self._log_event( + category=EventCategory.NETWORK, + description="Error collecting network interfaces", + data={"command": res_addr.command, "exit_code": res_addr.exit_code}, + priority=EventPriority.ERROR, + console_log=True, + ) + + # Collect ethtool information for interfaces + if interfaces: + ethtool_data = self._collect_ethtool_info(interfaces) + self._log_event( + category=EventCategory.NETWORK, + description=f"Collected ethtool info for {len(ethtool_data)} interfaces", + priority=EventPriority.INFO, + ) + + # Collect routing table + res_route = self._run_sut_cmd(self.CMD_ROUTE) + if res_route.exit_code == 0: + routes = self._parse_ip_route(res_route.stdout) + self._log_event( + category=EventCategory.NETWORK, + description=f"Collected {len(routes)} routes", + priority=EventPriority.INFO, + ) + else: + self._log_event( + category=EventCategory.NETWORK, + description="Error collecting routes", + data={"command": res_route.command, "exit_code": res_route.exit_code}, + priority=EventPriority.WARNING, + ) + + # Collect routing rules + res_rule = self._run_sut_cmd(self.CMD_RULE) + if res_rule.exit_code == 0: + rules = self._parse_ip_rule(res_rule.stdout) + self._log_event( + category=EventCategory.NETWORK, + description=f"Collected {len(rules)} routing rules", + priority=EventPriority.INFO, + ) + else: + self._log_event( + category=EventCategory.NETWORK, + description="Error collecting routing rules", + data={"command": res_rule.command, "exit_code": res_rule.exit_code}, + priority=EventPriority.WARNING, + ) + + # Collect neighbor table (ARP/NDP) + 
res_neighbor = self._run_sut_cmd(self.CMD_NEIGHBOR) + if res_neighbor.exit_code == 0: + neighbors = self._parse_ip_neighbor(res_neighbor.stdout) + self._log_event( + category=EventCategory.NETWORK, + description=f"Collected {len(neighbors)} neighbor entries", + priority=EventPriority.INFO, + ) + else: + self._log_event( + category=EventCategory.NETWORK, + description="Error collecting neighbor table", + data={"command": res_neighbor.command, "exit_code": res_neighbor.exit_code}, + priority=EventPriority.WARNING, + ) + + if interfaces or routes or rules or neighbors: + network_data = NetworkDataModel( + interfaces=interfaces, + routes=routes, + rules=rules, + neighbors=neighbors, + ethtool_info=ethtool_data, + ) + self.result.message = ( + f"Collected network data: {len(interfaces)} interfaces, " + f"{len(routes)} routes, {len(rules)} rules, {len(neighbors)} neighbors, " + f"{len(ethtool_data)} ethtool entries" + ) + self.result.status = ExecutionStatus.OK + return self.result, network_data + else: + self.result.message = "Failed to collect network data" + self.result.status = ExecutionStatus.ERROR + return self.result, None diff --git a/nodescraper/plugins/inband/network/network_plugin.py b/nodescraper/plugins/inband/network/network_plugin.py new file mode 100644 index 00000000..2735e705 --- /dev/null +++ b/nodescraper/plugins/inband/network/network_plugin.py @@ -0,0 +1,37 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from nodescraper.base import InBandDataPlugin + +from .network_collector import NetworkCollector +from .networkdata import NetworkDataModel + + +class NetworkPlugin(InBandDataPlugin[NetworkDataModel, None, None]): + """Plugin for collection of network configuration data""" + + DATA_MODEL = NetworkDataModel + + COLLECTOR = NetworkCollector diff --git a/nodescraper/plugins/inband/network/networkdata.py b/nodescraper/plugins/inband/network/networkdata.py new file mode 100644 index 00000000..5e94efc2 --- /dev/null +++ b/nodescraper/plugins/inband/network/networkdata.py @@ -0,0 +1,117 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from typing import Dict, List, Optional + +from pydantic import BaseModel, Field + +from nodescraper.models import DataModel + + +class IpAddress(BaseModel): + """Individual IP address on an interface""" + + address: str # "192.168.1.100" + prefix_len: Optional[int] = None # 24 + scope: Optional[str] = None # "global", "link", "host" + family: Optional[str] = None # "inet", "inet6" + label: Optional[str] = None # interface label/alias + broadcast: Optional[str] = None # broadcast address + + +class NetworkInterface(BaseModel): + """Network interface information""" + + name: str # "eth0", "lo", etc + index: Optional[int] = None # interface index + state: Optional[str] = None # "UP", "DOWN", "UNKNOWN" + mtu: Optional[int] = None # Maximum Transmission Unit + qdisc: Optional[str] = None # Queuing discipline + mac_address: Optional[str] = None # MAC/hardware address + flags: List[str] = Field(default_factory=list) # ["UP", "BROADCAST", "MULTICAST"] + addresses: List[IpAddress] = Field(default_factory=list) # IP addresses on this interface + + +class Route(BaseModel): + """Routing table entry""" + + destination: str # "default", "192.168.1.0/24", etc + gateway: Optional[str] = None # Gateway IP + device: Optional[str] = None # Network interface + protocol: Optional[str] = None # "kernel", "boot", "static", etc + scope: Optional[str] = None # "link", "global", "host" + metric: Optional[int] = None # Route metric/priority + source: Optional[str] = None # Preferred source address + table: Optional[str] = None # Routing table name/number + + +class RoutingRule(BaseModel): + """Routing policy rule""" + + priority: int # Rule priority + source: Optional[str] = None # Source address/network + destination: Optional[str] = None # Destination address/network + table: Optional[str] = None # Routing table to use + action: Optional[str] = None # "lookup", "unreachable", "prohibit", etc + iif: 
Optional[str] = None # Input interface + oif: Optional[str] = None # Output interface + fwmark: Optional[str] = None # Firewall mark + + +class Neighbor(BaseModel): + """ARP/Neighbor table entry""" + + ip_address: str # IP address of the neighbor + device: Optional[str] = None # Network interface + mac_address: Optional[str] = None # Link layer (MAC) address + state: Optional[str] = None # "REACHABLE", "STALE", "DELAY", "PROBE", "FAILED", "INCOMPLETE" + flags: List[str] = Field(default_factory=list) # Additional flags like "router", "proxy" + + +class EthtoolInfo(BaseModel): + """Ethtool information for a network interface""" + + interface: str # Interface name this info belongs to + raw_output: str # Raw ethtool command output + settings: Dict[str, str] = Field(default_factory=dict) # Parsed key-value settings + supported_link_modes: List[str] = Field(default_factory=list) # Supported link modes + advertised_link_modes: List[str] = Field(default_factory=list) # Advertised link modes + speed: Optional[str] = None # Link speed (e.g., "10000Mb/s") + duplex: Optional[str] = None # Duplex mode (e.g., "Full") + port: Optional[str] = None # Port type (e.g., "Twisted Pair") + auto_negotiation: Optional[str] = None # Auto-negotiation status (e.g., "on", "off") + link_detected: Optional[str] = None # Link detection status (e.g., "yes", "no") + + +class NetworkDataModel(DataModel): + """Complete network configuration data""" + + interfaces: List[NetworkInterface] = Field(default_factory=list) + routes: List[Route] = Field(default_factory=list) + rules: List[RoutingRule] = Field(default_factory=list) + neighbors: List[Neighbor] = Field(default_factory=list) + ethtool_info: Dict[str, EthtoolInfo] = Field( + default_factory=dict + ) # Interface name -> EthtoolInfo mapping diff --git a/nodescraper/plugins/inband/os/analyzer_args.py b/nodescraper/plugins/inband/os/analyzer_args.py index 366bb8d3..52fd1124 100644 --- a/nodescraper/plugins/inband/os/analyzer_args.py +++ 
b/nodescraper/plugins/inband/os/analyzer_args.py @@ -61,4 +61,4 @@ def build_from_model(cls, datamodel: OsDataModel) -> "OsAnalyzerArgs": Returns: OsAnalyzerArgs: instance of analyzer args class """ - return cls(exp_os=datamodel.os_name) + return cls(exp_os=datamodel.os_name, exact_match=True) diff --git a/nodescraper/plugins/inband/package/analyzer_args.py b/nodescraper/plugins/inband/package/analyzer_args.py index cbd7ebad..62a34c1f 100644 --- a/nodescraper/plugins/inband/package/analyzer_args.py +++ b/nodescraper/plugins/inband/package/analyzer_args.py @@ -34,7 +34,15 @@ class PackageAnalyzerArgs(AnalyzerArgs): exp_package_ver: Dict[str, Optional[str]] = Field(default_factory=dict) regex_match: bool = False + # rocm_regex is optional and should be specified in plugin_config.json if needed + rocm_regex: Optional[str] = None + enable_rocm_regex: bool = False @classmethod def build_from_model(cls, datamodel: PackageDataModel) -> "PackageAnalyzerArgs": - return cls(exp_package_ver=datamodel.version_info) + # Use custom rocm_regex from collection_args if enable_rocm_regex is true + rocm_regex = None + if datamodel.enable_rocm_regex and datamodel.rocm_regex: + rocm_regex = datamodel.rocm_regex + + return cls(exp_package_ver=datamodel.version_info, rocm_regex=rocm_regex) diff --git a/nodescraper/plugins/inband/package/package_analyzer.py b/nodescraper/plugins/inband/package/package_analyzer.py index 16215086..906b7a08 100644 --- a/nodescraper/plugins/inband/package/package_analyzer.py +++ b/nodescraper/plugins/inband/package/package_analyzer.py @@ -44,7 +44,7 @@ def regex_version_data( package_data: dict[str, str], key_search: re.Pattern[str], value_search: Optional[Pattern[str]], - ) -> bool: + ) -> tuple[bool, list[tuple[str, str, str]]]: """Searches the package values for the key and value search patterns Args: @@ -53,11 +53,14 @@ def regex_version_data( value_search (Optional[Pattern[str]]): a compiled regex pattern to search for the package version, if None then 
any version is accepted Returns: - bool: A boolean indicating if the value was found + tuple: (value_found, version_mismatches) where value_found is a bool and + version_mismatches is a list of (package_name, expected_pattern, found_version) tuples """ value_found = False + version_mismatches = [] for name, version in package_data.items(): + self.logger.debug("Package data: %s, %s", name, version) key_search_res = key_search.search(name) if key_search_res: value_found = True @@ -65,6 +68,7 @@ def regex_version_data( continue value_search_res = value_search.search(version) if not value_search_res: + version_mismatches.append((name, value_search.pattern, version)) self._log_event( EventCategory.APPLICATION, f"Package {key_search.pattern} Version Mismatch, Expected {value_search.pattern} but found {version}", @@ -76,26 +80,33 @@ def regex_version_data( "found_version": version, }, ) - return value_found + return value_found, version_mismatches def package_regex_search( - self, package_data: dict[str, str], exp_packge_data: dict[str, Optional[str]] + self, package_data: dict[str, str], exp_package_data: dict[str, Optional[str]] ): """Searches the package data for the expected package and version using regex Args: package_data (dict[str, str]): a dictionary of package names and versions - exp_packge_data (dict[str, Optional[str]]): a dictionary of expected package names and versions + exp_package_data (dict[str, Optional[str]]): a dictionary of expected package names and versions + + Returns: + tuple: (not_found_keys, regex_errors, version_mismatches) containing lists of errors """ not_found_keys = [] - for exp_key, exp_value in exp_packge_data.items(): + regex_errors = [] + version_mismatches = [] + + for exp_key, exp_value in exp_package_data.items(): try: if exp_value is not None: value_search = re.compile(exp_value) else: value_search = None key_search = re.compile(exp_key) - except re.error: + except re.error as e: + regex_errors.append((exp_key, exp_value, 
str(e))) self._log_event( EventCategory.RUNTIME, f"Regex Compile Error either {exp_key} {exp_value}", @@ -107,10 +118,13 @@ def package_regex_search( ) continue - key_found = self.regex_version_data(package_data, key_search, value_search) + key_found, mismatches = self.regex_version_data(package_data, key_search, value_search) + + # Collect version mismatches + version_mismatches.extend(mismatches) if not key_found: - not_found_keys.append(exp_key) + not_found_keys.append((exp_key, exp_value)) self._log_event( EventCategory.APPLICATION, f"Package {exp_key} not found in the package list", @@ -122,47 +136,50 @@ def package_regex_search( "found_version": None, }, ) - return not_found_keys + + return not_found_keys, regex_errors, version_mismatches def package_exact_match( - self, package_data: dict[str, str], exp_packge_data: dict[str, Optional[str]] + self, package_data: dict[str, str], exp_package_data: dict[str, Optional[str]] ): """Checks the package data for the expected package and version using exact match Args: package_data (dict[str, str]): a dictionary of package names and versions - exp_packge_data (dict[str, Optional[str]]): a dictionary of expected package names and versions + exp_package_data (dict[str, Optional[str]]): a dictionary of expected package names and versions """ not_found_match = [] not_found_version = [] - for exp_key, exp_value in exp_packge_data.items(): - self.logger.info(exp_key) + for exp_key, exp_value in exp_package_data.items(): + self.logger.info("Expected value: %s, %s", exp_key, exp_value) version = package_data.get(exp_key) - if exp_value is None: + self.logger.info("Found version: %s", version) + if version is None: + # package not found + not_found_version.append((exp_key, exp_value)) + self._log_event( + EventCategory.APPLICATION, + f"Package {exp_key} not found in the package list", + EventPriority.ERROR, + { + "expected_package": exp_key, + "found_package": None, + "expected_version": exp_value, + "found_version": None, + 
}, + ) + elif exp_value is None: # allow any version when expected version is None - if version is None: - # package not found - not_found_version.append((exp_key, version)) - self._log_event( - EventCategory.APPLICATION, - f"Package {exp_key} not found in the package list", - EventPriority.ERROR, - { - "expected_package": exp_key, - "found_package": None, - "expected_version": exp_value, - "found_version": None, - }, - ) + continue elif version != exp_value: not_found_match.append((exp_key, version)) self._log_event( EventCategory.APPLICATION, - f"Package {exp_key} Version Mismatch, Expected {exp_key} but found {version}", + f"Package {exp_key} Version Mismatch, Expected {exp_value} but found {version}", EventPriority.ERROR, { "expected_package": exp_key, - "found_package": exp_key if version else None, + "found_package": exp_key, "expected_version": exp_value, "found_version": version, }, @@ -187,10 +204,45 @@ def analyze_data( return self.result if args.regex_match: - not_found_keys = self.package_regex_search(data.version_info, args.exp_package_ver) - self.result.message = f"Packages not found: {not_found_keys}" - self.result.status = ExecutionStatus.ERROR + not_found_keys, regex_errors, version_mismatches = self.package_regex_search( + data.version_info, args.exp_package_ver + ) + + # Adding details for err message + error_parts = [] + if not_found_keys: + packages_detail = ", ".join( + [ + f"'{pkg}' (expected version: {ver if ver else 'any'})" + for pkg, ver in not_found_keys + ] + ) + error_parts.append(f"Packages not found: {packages_detail}") + + if regex_errors: + regex_detail = ", ".join( + [f"'{pkg}' pattern (version: {ver})" for pkg, ver, _ in regex_errors] + ) + error_parts.append(f"Regex compile errors: {regex_detail}") + + if version_mismatches: + version_detail = ", ".join( + [ + f"'{pkg}' (expected: {exp}, found: {found})" + for pkg, exp, found in version_mismatches + ] + ) + error_parts.append(f"Version mismatches: {version_detail}") + + 
total_errors = len(not_found_keys) + len(regex_errors) + len(version_mismatches) + if total_errors > 0: + self.result.message = f"{'; '.join(error_parts)}" + self.result.status = ExecutionStatus.ERROR + else: + self.result.message = "All packages found and versions matched" + self.result.status = ExecutionStatus.OK else: + self.logger.info("Expected packages: %s", list(args.exp_package_ver.keys())) not_found_match, not_found_version = self.package_exact_match( data.version_info, args.exp_package_ver ) diff --git a/nodescraper/plugins/inband/package/package_collector.py b/nodescraper/plugins/inband/package/package_collector.py index bc3a65b3..a6cf42fe 100644 --- a/nodescraper/plugins/inband/package/package_collector.py +++ b/nodescraper/plugins/inband/package/package_collector.py @@ -34,10 +34,11 @@ from nodescraper.models import TaskResult from nodescraper.utils import get_exception_details +from .analyzer_args import PackageAnalyzerArgs from .packagedata import PackageDataModel -class PackageCollector(InBandDataCollector[PackageDataModel, None]): +class PackageCollector(InBandDataCollector[PackageDataModel, PackageAnalyzerArgs]): """Collecting Package information from the system""" DATA_MODEL = PackageDataModel @@ -181,9 +182,34 @@ def _handle_command_failure(self, command_artifact: CommandArtifact): self.result.message = "Failed to run Package Manager command" self.result.status = ExecutionStatus.EXECUTION_FAILURE - def collect_data(self, args=None) -> tuple[TaskResult, Optional[PackageDataModel]]: + def _filter_rocm_packages(self, packages: dict[str, str], rocm_pattern: str) -> dict[str, str]: + """Filter ROCm-related packages from a package dictionary. + + This method searches package names for ROCm-related patterns and returns + only the matching packages. + + Args: + packages (dict[str, str]): Dictionary with package names as keys and versions as values. + rocm_pattern (str): Regex pattern to match ROCm-related package names. 
+ + Returns: + dict[str, str]: Filtered dictionary containing only ROCm-related packages. + """ + rocm_packages = {} + pattern = re.compile(rocm_pattern, re.IGNORECASE) + for package_name, version in packages.items(): + if pattern.search(package_name): + rocm_packages[package_name] = version + return rocm_packages + + def collect_data( + self, args: Optional[PackageAnalyzerArgs] = None + ) -> tuple[TaskResult, Optional[PackageDataModel]]: """Collect package information from the system. + Args: + args (Optional[PackageAnalyzerArgs]): Optional arguments containing ROCm regex pattern. + Returns: tuple[TaskResult, Optional[PackageDataModel]]: tuple containing the task result and a PackageDataModel instance with the collected package information, or None if there was an error. @@ -205,8 +231,36 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[PackageDataModel self.result.message = "Unsupported OS" self.result.status = ExecutionStatus.NOT_RAN return self.result, None + + # Filter and log ROCm packages if on Linux and rocm_regex is provided + if self.system_info.os_family == OSFamily.LINUX and packages: + # Get ROCm pattern from args if provided + rocm_pattern = args.rocm_regex if args else None + if rocm_pattern: + self.logger.info("Using rocm_pattern: %s", rocm_pattern) + rocm_packages = self._filter_rocm_packages(packages, rocm_pattern) + if rocm_packages: + self.result.message = ( + f"Found {len(rocm_packages)} ROCm-related packages installed" + ) + self.result.status = ExecutionStatus.OK + self._log_event( + category=EventCategory.OS, + description=f"Found {len(rocm_packages)} ROCm-related packages installed", + priority=EventPriority.INFO, + data={"rocm_packages": sorted(rocm_packages.keys())}, + ) + else: + self.logger.info("No rocm_regex provided, skipping ROCm package filtering") + + # Extract rocm_regex and enable_rocm_regex from args if provided + rocm_regex = args.rocm_regex if (args and args.rocm_regex) else "" + enable_rocm_regex = 
getattr(args, "enable_rocm_regex", False) if args else False + try: - package_model = PackageDataModel(version_info=packages) + package_model = PackageDataModel( + version_info=packages, rocm_regex=rocm_regex, enable_rocm_regex=enable_rocm_regex + ) except ValidationError as val_err: self._log_event( category=EventCategory.RUNTIME, diff --git a/nodescraper/plugins/inband/package/packagedata.py b/nodescraper/plugins/inband/package/packagedata.py index c1943307..ea95e1c5 100644 --- a/nodescraper/plugins/inband/package/packagedata.py +++ b/nodescraper/plugins/inband/package/packagedata.py @@ -32,6 +32,10 @@ class PackageDataModel(DataModel): Attributes: version_info (dict[str, str]): The version information for the package Key is the package name and value is the version of the package + rocm_regex (str): Regular expression pattern for ROCm package filtering + enable_rocm_regex (bool): Whether to use custom ROCm regex from collection_args """ version_info: dict[str, str] + rocm_regex: str = "" + enable_rocm_regex: bool = False diff --git a/nodescraper/plugins/inband/pcie/__init__.py b/nodescraper/plugins/inband/pcie/__init__.py new file mode 100644 index 00000000..baeb9851 --- /dev/null +++ b/nodescraper/plugins/inband/pcie/__init__.py @@ -0,0 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .analyzer_args import PcieAnalyzerArgs +from .pcie_plugin import PciePlugin + +__all__ = ["PciePlugin", "PcieAnalyzerArgs"] diff --git a/nodescraper/plugins/inband/pcie/analyzer_args.py b/nodescraper/plugins/inband/pcie/analyzer_args.py new file mode 100644 index 00000000..dc3490a4 --- /dev/null +++ b/nodescraper/plugins/inband/pcie/analyzer_args.py @@ -0,0 +1,63 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from typing import Dict, Optional, Union + +from nodescraper.models import AnalyzerArgs + + +class PcieAnalyzerArgs(AnalyzerArgs): + """Arguments for PCIe analyzer + + Attributes: + exp_speed: Expected PCIe speed (generation 1-5) + exp_width: Expected PCIe width (1-16 lanes) + exp_sriov_count: Expected SR-IOV VF count + exp_gpu_count_override: Override expected GPU count + exp_max_payload_size: Expected max payload size (int for all devices, dict for specific device IDs) + exp_max_rd_req_size: Expected max read request size (int for all devices, dict for specific device IDs) + exp_ten_bit_tag_req_en: Expected 10-bit tag request enable (int for all devices, dict for specific device IDs) + """ + + exp_speed: int = 5 + exp_width: int = 16 + exp_sriov_count: int = 0 + exp_gpu_count_override: Optional[int] = None + exp_max_payload_size: Optional[Union[Dict[int, int], int]] = None + exp_max_rd_req_size: Optional[Union[Dict[int, int], int]] = None + exp_ten_bit_tag_req_en: Optional[Union[Dict[int, int], int]] = None + + +def normalize_to_dict( + value: Optional[Union[Dict[int, int], int]], vendorid_ep: int +) -> Dict[int, int]: + """Normalize int or dict values to dict format using vendorid_ep as key for int values""" + if value is None: + return {} + if isinstance(value, int): + return {vendorid_ep: value} + if isinstance(value, dict): + return value + return {} diff --git a/nodescraper/plugins/inband/pcie/pcie_analyzer.py b/nodescraper/plugins/inband/pcie/pcie_analyzer.py new file mode 100755 index 00000000..7d9a7e58 --- /dev/null +++ b/nodescraper/plugins/inband/pcie/pcie_analyzer.py @@ -0,0 +1,1081 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from typing import Dict, List, Optional, Set, Type, TypeVar + +from pydantic import BaseModel, Field, ValidationError, field_validator + +from nodescraper.enums import EventCategory, EventPriority +from nodescraper.interfaces import DataAnalyzer +from nodescraper.models import TaskResult +from nodescraper.utils import get_exception_traceback + +from .analyzer_args import PcieAnalyzerArgs, normalize_to_dict +from .pcie_data import ( + BdfStr, + CorrErrMaskReg, + CorrErrStatReg, + ECap16Gt, + ECapAer, + ECapSecpci, + ParityMisMatchStat16GT, + PcieCapStructure, + PcieCfgSpace, + PcieDataModel, + PcieExp, + PcieRegister, + UncorrErrMaskReg, + UncorrErrSevReg, + UncorrErrStatReg, +) + +T_CAP = TypeVar("T_CAP", bound=PcieCapStructure) + + +class PcieAnalyzerInputModel(BaseModel): + """ + PCIeAnalyzerInputModel is a data model for validating and storing input parameters + related to PCIe (Peripheral Component Interconnect Express) analysis. + Attributes: + exp_speed (int): Expected PCIe speed, Speed is the PCIe Generation, constrained to values between 1 and 5 (inclusive). + exp_width (int): Expected PCIe width, constrained to values between 1 and 16 (inclusive). + exp_sriov_count (Optional[int]): Optional expected count of SR-IOV (Single Root I/O Virtualization) instances. + exp_gpu_count_override (Optional[int]): Optional override for the expected GPU count. 
+ """ + + exp_speed: int = Field(ge=1, le=5) + exp_width: int = Field(ge=1, le=16) + exp_sriov_count: Optional[int] = None + exp_gpu_count_override: Optional[int] = None + exp_max_payload_size: Dict[int, int] = Field(default_factory=dict) + exp_max_rd_req_size: Dict[int, int] = Field(default_factory=dict) + exp_ten_bit_tag_req_en: Dict[int, int] = Field(default_factory=dict) + + @field_validator("exp_max_rd_req_size", "exp_max_payload_size", mode="before") + @classmethod + def validate_exp_max_rd_req_size(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]: + """Validates the expected maximum read request size.""" + if v is None: + return {} + ret_dict = v.copy() + for key, value in v.items(): + if value >= 0 and value <= 5: + ret_dict[key] = 128 << value # Convert to actual size in bytes + if value not in {128, 256, 512, 1024, 2048, 4096}: + raise ValueError( + "Expected max read request size must be one of: " + "1, 2, 3, 4, 5, 128, 256, 512, 1024, 2048, or 4096." + ) + if key < 0 or key > 0xFFFF: + raise ValueError(" key must be a valid BDF (0-65535).") + return ret_dict + + @field_validator("exp_ten_bit_tag_req_en", mode="before") + @classmethod + def validate_exp_ten_bit_tag_req_en(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]: + """Validates the expected 10-bit tag request enable value.""" + if v is None: + return {} + for key, value in v.items(): + if key < 0 or key > 0xFFFF: + raise ValueError("Key must be a valid BDF (0-65535).") + if value not in {0, 1}: + raise ValueError("Expected 10-bit tag request enable must be 0 or 1.") + return v + + +class PcieAnalyzer(DataAnalyzer): + """Check PCIe Data for errors + + This calls checks the following: + - PCIe link status for each BDF + - This checks if the link speed and width are as expected + - AER uncorrectable errors + - Checks PCIe AER uncorrectable error registers UNCORR_ERR_STAT_REG and reports any errors + - AER correctable errors + - Checks the AERs correctable error registers CORR_ERR_STAT_REG and 
reports any errors + - PCIe device status errors + - Checks PCIe device status errors reported in fields `CORR_ERR_DET` `NON_FATAL_ERR_DET` `FATAL_ERR_DET` `UR_DET` + - PCIe status errors + - Checks PCIe status errors reported in fields `MSTR_DATA_PAR_ERR` `SIGNALED_TARGET_ABORT` `RCVD_TARGET_ABORT` + `RCVD_MSTR_ABORT` `SIGNALED_SYS_ERR` `DET_PARITY_ERR` + + """ + + DATA_MODEL = PcieDataModel + + GPU_BRIDGE_USP_ID = "0x1501" + GPU_BRIDGE_DSP_ID = "0x1500" + + def validate_reg(self, bdf: str, reg: PcieRegister, log_event: bool) -> bool: + """Ensures that the register has no error has has a value + + Parameters + ---------- + bdf : str + base:device:function string just used for logging + reg : PcieRegister + Register to validate + log_event : bool + Whether to log an event if the register is invalid + + Returns + ------- + bool + True when validate successfully, False otherwise + """ + if reg.val is None or reg.err is not None: + if log_event: + self._log_event( + category=EventCategory.IO, + description="No value assgined to register or register collection resulted in error", + priority=EventPriority.WARNING, + data={"value": reg.val, "error": reg.err, "bdf": bdf}, + ) + return False + return True + + def validate_cap( + self, + bdf: str, + name: str, + capability_structure: Optional[PcieCapStructure], + log_event: bool = True, + ) -> bool: + """Ensures that the capability structure has no error and exists + + Parameters + ---------- + bdf : str + base:device:function string just used for logging + capability_structure : PcieCapStructure + Capability structure to validate + + Returns + ------- + bool + True when validate successfully, False otherwise + """ + if capability_structure is None: + if log_event: + self._log_event( + category=EventCategory.IO, + description="No value assgined to capability a structure ", + data={ + "name": name, + "bdf": bdf, + }, + priority=EventPriority.WARNING, + ) + return False + null_regs = capability_structure.null_err_regs() + if 
null_regs: + if log_event: + self._log_event( + category=EventCategory.IO, + description="Capability structure has unset registers", + data={ + "name": name, + "bdf": bdf, + "capability_structure": capability_structure, + "null_regs": null_regs, + }, + priority=EventPriority.WARNING, + ) + return False + return True + + def validate_cap_dict( + self, + pcie_cfg_space: Dict[BdfStr, PcieCfgSpace], + cap_struct: Type[PcieCapStructure], + log_event: bool = True, + ) -> set[str]: + """Validates capability structures for all BDFs in the PCIe data + + Parameters + ---------- + pcie_data : PCIeData + The PCIe data containing configuration space for each BDF + cap_struct : Type[PcieCapStructure] + The capability structure type to validate against each BDF's configuration space + log_event : bool, optional + Whether to log an event if a BDF does not have the specified capability structure, by default True + + Returns + ------- + set[str] + A set of BDFs that have the specified capability structure + """ + bdf_without_cap_struct = set() + for bdf, cfg_space in pcie_cfg_space.items(): + cap_struct_data = cfg_space.get_struct(cap_struct) + if not self.validate_cap(bdf, cap_struct.__name__, cap_struct_data, False): + bdf_without_cap_struct.add(bdf) + if log_event and len(bdf_without_cap_struct) > 0: + self._log_event( + category=EventCategory.IO, + description=f"Capability Structure {cap_struct.__name__} not found in a Cfg Space", + priority=EventPriority.WARNING, + data={ + "bdf_without_pcie_exp": list(bdf_without_cap_struct), + "num_bdfs_with_invalid_capability_structure": len(bdf_without_cap_struct), + "total_bdfs": len(pcie_cfg_space), + }, + ) + return set(pcie_cfg_space.keys()) - bdf_without_cap_struct + + def get_valid_cap_dict( + self, + pcie_cfg_space: Dict[BdfStr, PcieCfgSpace], + cap_struct: Type[T_CAP], + log_event: bool = True, + ) -> dict[BdfStr, T_CAP]: + """Returns a dictionary of BDFs that have the specified capability structure + + Parameters + ---------- + 
pcie_data : PCIeData + The PCIe data containing configuration space for each BDF + cap_struct : Type[T_CAP] + The capability structure type to validate against each BDF's configuration space + log_event : bool, optional + Whether to log an event if a BDF does not have the specified capability structure, by default True + + Returns + ------- + dict[BdfStr, T_CAP] + A dictionary of BDFs that have the specified capability structure + """ + bdfs_with_cap = self.validate_cap_dict(pcie_cfg_space, cap_struct, log_event=log_event) + bdf_cap_struct_dict: Dict[BdfStr, T_CAP] = {} + for bdf, cfg_space in pcie_cfg_space.items(): + if bdf not in bdfs_with_cap: + continue + cap_struct_data = cfg_space.get_struct(cap_struct) + if cap_struct_data is None: + continue + bdf_cap_struct_dict[bdf] = cap_struct_data + + return bdf_cap_struct_dict + + def check_link_status( + self, + bdf_pcie_express_dict: Dict[str, PcieExp], + exp_speed: int = 5, + exp_width: int = 16, + ): + """Checks PCIe link status for each bdf in the bdf_list and compares with the expected rate/width + + Args: + all_bdf_cfg_space (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + exp_speed (int): expected link speed + exp_width (int): expected link width + + Returns: + None + """ + # Key: binary bit position, value: Gen + sv_gen_speed = { + 0b000000: 0, + 0b000001: 1, + 0b000010: 2, + 0b000100: 3, + 0b001000: 4, + 0b010000: 5, + } + for bdf, pcie_exp in bdf_pcie_express_dict.items(): + lnk_stat_reg = pcie_exp.lnk_stat_reg + lnk_cap_2_reg = pcie_exp.lnk_cap_2_reg + try: + if lnk_stat_reg.curr_lnk_speed.val == 0: + self._log_event( + category=EventCategory.IO, + description="Link speed vector is 0", + data={ + "bdf": bdf, + "curr_lnk_speed": lnk_stat_reg.curr_lnk_speed.val, + "supported_lnk_speed_vec": lnk_cap_2_reg.supported_lnk_speed_vec.val, + }, + priority=EventPriority.ERROR, + ) + continue + + curr_speed = lnk_stat_reg.curr_lnk_speed.get_val() + 
supported_vec = lnk_cap_2_reg.supported_lnk_speed_vec.get_val() + if curr_speed is None or supported_vec is None: + continue + sv_mask = 0b1 << (curr_speed - 1) + link_speed = sv_gen_speed[sv_mask & supported_vec] + + if link_speed != exp_speed: + self._log_event( + category=EventCategory.IO, + description="Unexpected link speed detected", + priority=EventPriority.ERROR, + data={ + "bdf": bdf, + "current_speed": link_speed, + "expected_speed": exp_speed, + }, + ) + if lnk_stat_reg.neg_lnk_width.get_val() != exp_width: + self._log_event( + category=EventCategory.IO, + description="Unexpected link width detected", + priority=EventPriority.ERROR, + data={ + "bdf": bdf, + "current_width": lnk_stat_reg.neg_lnk_width.get_val(), + "expected_width": exp_width, + }, + ) + except Exception as e: + self._log_event( + category=EventCategory.IO, + description="Exception occurred while checking link status", + priority=EventPriority.ERROR, + data={"exception": get_exception_traceback(e)}, + ) + + def check_uncorr_aer_errors( + self, + bdf_ecap_aer: Dict[BdfStr, ECapAer], + ): + """ + Checks the following AER uncorrectable error registers + - Uncorrectable Error Status Register + - Uncorrectable Error Mask Register + - Uncorrectable Error Severity Register + + Args: + bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + Returns: + None + """ + for bdf, ecap_aer in bdf_ecap_aer.items(): + stat_reg: UncorrErrStatReg = ecap_aer.uncorr_err_stat + mask_reg: UncorrErrMaskReg = ecap_aer.uncorr_err_mask + sev_reg: UncorrErrSevReg = ecap_aer.uncorr_err_sev + stat_fields = stat_reg.bit_fields + mask_fields = mask_reg.bit_fields + sev_fields = sev_reg.bit_fields + # sort fields by bit position using offset + sorted_stat_fields = sorted(stat_fields.values(), key=lambda x: x.bit_mask) + sorted_mask_fields = sorted(mask_fields.values(), key=lambda x: x.bit_mask) + sorted_sev_fields = sorted(sev_fields.values(), 
key=lambda x: x.bit_mask) + # Iterate through all the fields in the stat, mask, and sev registers + for stat_field, mask_field, sev_field in zip( + sorted_stat_fields, + sorted_mask_fields, + sorted_sev_fields, + ): + pcie_field_stat_value = stat_field.get_val() + pcie_field_mask_value = mask_field.get_val() + pcie_field_sev_value = sev_field.get_val() + err_descriptor: Dict[str, str] = { + "bdf": bdf, + "reg_name": stat_reg.__class__.__name__, + "field_desc": stat_field.desc, + "stat": ( + hex(pcie_field_stat_value) if pcie_field_stat_value is not None else "None" + ), + "mask": ( + hex(pcie_field_mask_value) if pcie_field_mask_value is not None else "None" + ), + "sev": ( + hex(pcie_field_sev_value) if pcie_field_sev_value is not None else "None" + ), + } + if pcie_field_stat_value != 0: + # Error detected + if pcie_field_sev_value != 1: + if pcie_field_mask_value == 1: + self._log_event( + category=EventCategory.IO, + description="Masked Fatal errors were detected", + priority=EventPriority.ERROR, + data=err_descriptor, + ) + else: + self._log_event( + category=EventCategory.IO, + description="Unmasked Fatal errors were detected", + priority=EventPriority.ERROR, + data=err_descriptor, + ) + else: + if pcie_field_mask_value == 1: + self._log_event( + category=EventCategory.IO, + description="Unmasked Non-Fatal errors were detected", + priority=EventPriority.WARNING, + data=err_descriptor, + ) + else: + self._log_event( + category=EventCategory.IO, + description="Unmasked Non-Fatal errors were detected", + priority=EventPriority.WARNING, + data=err_descriptor, + ) + + def check_corr_aer_errors( + self, + bdf_ecap_aer: Dict[BdfStr, ECapAer], + ): + """ + Checks the following AER correctable error registers + - Correctable Error Status Register + - Correctable Error Mask Register + + Args: + bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + Returns: + None + """ + for bdf, ecap_aer in 
bdf_ecap_aer.items(): + stat_reg: CorrErrStatReg = ecap_aer.corr_err_stat + mask_reg: CorrErrMaskReg = ecap_aer.corr_err_mask + stat_fields = stat_reg.bit_fields + mask_fields = mask_reg.bit_fields + sorted_stat_fields = sorted(stat_fields.values(), key=lambda x: x.bit_mask) + sorted_mask_fields = sorted(mask_fields.values(), key=lambda x: x.bit_mask) + + for stat_field, mask_field in zip( + sorted_stat_fields, + sorted_mask_fields, + ): + stat_val = stat_field.get_val() + if stat_val is not None and stat_val != 0: + err_dict = { + "bdf": bdf, + "reg_description": stat_reg.desc, + "field_description": stat_field.desc, + "bit_field_val": hex(stat_val), + } + if mask_field.get_val() == 1: + self._log_event( + category=EventCategory.IO, + description="Masked Correctable errors were detected", + priority=EventPriority.WARNING, + data=err_dict, + ) + else: + self._log_event( + category=EventCategory.IO, + description="Masked Correctable errors were detected", + priority=EventPriority.ERROR, + data=err_dict, + ) + + def check_pcie_device_status_errors(self, bdf_pcie_express_dict: Dict[str, PcieExp]): + """ + Checks PCIe baseline error reported in Device Status Register + Reference: 9.4.1 Baseline Error Reporting + + Args: + bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + Returns: + None + """ + for bdf, pcie_exp_cap in bdf_pcie_express_dict.items(): + err_list = [] + dev_stat_reg = pcie_exp_cap.dev_stat_reg + bit_field_list = [ + dev_stat_reg.corr_err_det, + dev_stat_reg.non_fatal_err_det, + dev_stat_reg.fatal_err_det, + dev_stat_reg.ur_det, + ] + err_list = [bit_field for bit_field in bit_field_list if bit_field.get_val() != 0] + + if len(err_list) > 0: + self._log_event( + category=EventCategory.IO, + description="Device Status errors were detected", + priority=EventPriority.WARNING, + data={ + "bdf": bdf, + "reg_description": dev_stat_reg.desc, + "field_desc_list": [err.desc for err in 
err_list], + "err_bitmask_list": [err.bit_mask for err in err_list], + "register_value": dev_stat_reg.val, + }, + ) + + def check_pcie_status_errors(self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]): + """ + Checks PCIe baseline error reported in Status Registe + Reference: 9.4.1 Baseline Error Reporting + + Args: + bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + Returns: + None + """ + for bdf, cfg_space in bdf_cfg_space_dict.items(): + err_list = [] + stat_reg = cfg_space.type_0_configuration.status + bit_field_list = [ + stat_reg.mstr_data_par_err, + stat_reg.signaled_target_abort, + stat_reg.rcvd_target_abort, + stat_reg.rcvd_mstr_abort, + stat_reg.signaled_sys_err, + stat_reg.det_parity_err, + ] + err_list = [bit_field for bit_field in bit_field_list if bit_field.get_val() != 0] + + if len(err_list) > 0: + self._log_event( + category=EventCategory.IO, + description="PCI Express Status register errors were detected", + priority=EventPriority.WARNING, + data={ + "bdf": bdf, + "reg_description": stat_reg.desc, + "field_desc_list": [err.desc for err in err_list], + "err_bitmask_list": [err.bit_mask for err in err_list], + "register_value": stat_reg.val, + }, + ) + + def check_pcie_dev_ctrl_reg( + self, + bdf_pcie_express_dict: Dict[str, PcieExp], + exp_max_payload_size: Optional[int], + exp_max_rd_req_size: Optional[int], + ): + """Checks 7.5.3.4 Device Control Register (Offset 08h) fields for expected value: + - Max Payload Size + - Max Read Request Size + + Args: + bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + exp_max_payload_size (Optional[int]): expected max payload size, when None it is not checked + exp_max_rd_req_size (Optional[int]): expected max read request size, when None it is not checked + Returns: + None + """ + encoding = { + 0b000: 128, + 0b001: 256, + 0b010: 512, + 0b011: 1024, + 
0b100: 2048, + 0b101: 4096, + } + for bdf, pcie_exp in bdf_pcie_express_dict.items(): + dev_ctrl_reg = pcie_exp.dev_ctrl_reg + mps_val = dev_ctrl_reg.mps.get_val() + if mps_val is None: + continue + max_payload_size = encoding[mps_val] + if exp_max_payload_size is not None and max_payload_size != exp_max_payload_size: + self._log_event( + category=EventCategory.IO, + description="Unexpected Max Payload Size detected", + priority=EventPriority.ERROR, + data={ + "bdf": bdf, + "current_max_payload_size": max_payload_size, + "expected_max_payload_size": exp_max_payload_size, + }, + ) + + max_rd_req_val = dev_ctrl_reg.max_rd_req_size.get_val() + if max_rd_req_val is None: + continue + max_rd_req_size = encoding[max_rd_req_val] + if max_rd_req_size is not None and max_rd_req_size != exp_max_rd_req_size: + self._log_event( + category=EventCategory.IO, + description="Unexpected Max Read Request Size detected", + priority=EventPriority.ERROR, + data={ + "bdf": bdf, + "current_max_rd_req_size": max_rd_req_size, + "expected_max_rd_req_size": exp_max_rd_req_size, + }, + ) + + def check_pcie_dev_ctrl_2_reg( + self, + bdf_pcie_express_dict: Dict[str, PcieExp], + exp_ten_bit_tag_req_en: Optional[int], + ): + """Checks 7.5.3.16 Device Control 2 Register (Offset 28h) fields for expected value: + - 10-bit Tag Request Enable + + Args: + bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + exp_ten_bit_tag_req_en (Optional[int]): expected 10-bit tag request enable, when None it is not checked + Returns: + None + """ + for bdf, pcie_exp in bdf_pcie_express_dict.items(): + dev_ctrl_2_reg = pcie_exp.dev_ctrl_2_reg + ten_bit_tag_req_en = dev_ctrl_2_reg.ten_bit_tag_req_en.get_val() + if exp_ten_bit_tag_req_en is not None and ten_bit_tag_req_en != exp_ten_bit_tag_req_en: + self._log_event( + category=EventCategory.IO, + description="Unexpected 10-bit Tag Request Enable detected", + priority=EventPriority.ERROR, + 
data={ + "bdf": bdf, + "current_ten_bit_tag_req_en": ten_bit_tag_req_en, + "expected_ten_bit_tag_req_en": exp_ten_bit_tag_req_en, + }, + ) + + def instantaneous_par_err_chk(self, bdf_cfg_space_dict: Dict[str, ECap16Gt]): + """Instantaneous parity error check for ECap16Gt registers, will + log an event if any lanes have parity errors. + + Parameters + ---------- + bdf_cfg_space_dict : Dict[str, ECap16Gt] + Dictionary of BDFs and their corresponding ECap16Gt capability structure + """ + for bdf, ecap_pl_16gt in bdf_cfg_space_dict.items(): + par_mismatch_stat: ParityMisMatchStat16GT = ecap_pl_16gt.parity_mismatch_stat + retimer_fst_par_mismatch_stat = ecap_pl_16gt.retimer_fst_parity_mismatch_stat + for parity_register in [ + par_mismatch_stat, + retimer_fst_par_mismatch_stat, + ]: + if parity_register.val is None: + continue + par_bad_lanes = [ + 1 if (parity_register.val >> bit) & 1 else 0 for bit in range(0, 32) + ] + number_of_bad_lanes = sum(par_bad_lanes) + if number_of_bad_lanes > 0: + self._log_event( + category=EventCategory.IO, + description="Lanes have parity errors", + priority=EventPriority.ERROR, + data={ + "bdf": bdf, + "reg_name": parity_register.__class__.__name__, + "reg_desc": parity_register.desc, + "register_value": parity_register.val, + "number_of_bad_lanes": number_of_bad_lanes, + }, + ) + + def lane_error_status_chk(self, ecap_sec_pci_dict: Dict[str, ECapSecpci]): + """Lane error status check for ECapSecpci registers, will log an event if any lanes have errors. 
+ + Parameters + ---------- + ecap_sec_pci_dict : Dict[str, ECapSecpci] + Dictionary of BDFs and their corresponding ECapSecpci capability structure + """ + for bdf, ecap_sec_pci in ecap_sec_pci_dict.items(): + lane_error_stat = ecap_sec_pci.lane_err_stat + lane_error_stat_val = lane_error_stat.val + if lane_error_stat_val != 0: + self._log_event( + category=EventCategory.IO, + description="Lane error detected", + priority=EventPriority.ERROR, + data={ + "bdf": bdf, + "reg_name": lane_error_stat.__class__.__name__, + "register_value": lane_error_stat_val, + }, + ) + + def device_consistancy_chk(self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]): + """Checks that the configurable fields in the PCIe devices are all consistent""" + # Build a dynamic map of device IDs to BDFs from the actual devices in the system + dev_id_bdf_map: Dict[int, List[BdfStr]] = {} + + for bdf, cfg_space in bdf_cfg_space_dict.items(): + # Collect Unique device Ids contained in this system + device_id = cfg_space.type_0_configuration.device_id.val + if device_id is None: + self._log_event( + category=EventCategory.IO, + description="No value assigned to device id, unable to check consistency due to missing data", + data={ + "bdf": bdf, + }, + priority=EventPriority.WARNING, + ) + continue + + # Dynamically add device IDs as we encounter them + if device_id not in dev_id_bdf_map: + dev_id_bdf_map[device_id] = [] + dev_id_bdf_map[device_id].append(bdf) + + # check the values are all equal for select registers + cap_struct_dict = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=False) + for collected_device_id, list_of_bdfs in dev_id_bdf_map.items(): + # check the values are all equal for select registers + mps = [] + mrs = [] + tbt = [] + log_event = False + for bdf in list_of_bdfs: + if bdf not in cap_struct_dict: + # Missing Capability structure for this BDF, skip it, log event at end + log_event = True + continue + pcie_exp = cap_struct_dict[bdf] + dev_ctrl_reg = 
pcie_exp.dev_ctrl_reg + mps.append(dev_ctrl_reg.mps.val) + mrs.append(dev_ctrl_reg.max_rd_req_size.val) + tbt.append(dev_ctrl_reg.ext_tag_field_en.val) + # check the values are all equal for select registers + if len(set(mps)) > 1 or len(set(mrs)) > 1 or len(set(tbt)) > 1 or log_event: + collected_device_id_str = hex(collected_device_id) + self._log_event( + category=EventCategory.IO, + description=f"PCIe device {collected_device_id_str} has inconsistent values", + priority=EventPriority.WARNING, + data={ + "dev_id": collected_device_id_str, + "bdf_list": list_of_bdfs, + "max_payload_size_list": mps, + "max_rd_req_size_list": mrs, + "ext_tag_field_en_list": tbt, + }, + ) + + def check_ecap_16gt_regs( + self, + bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace], + ): + """Acquires ECap16Gt capability structure and checks for instantaneous parity errors""" + CAP_STRUCTURE = ECap16Gt + bdf_ecap_16gt_dict = self.get_valid_cap_dict( + bdf_cfg_space_dict, CAP_STRUCTURE, log_event=True + ) + self.instantaneous_par_err_chk(bdf_cfg_space_dict=bdf_ecap_16gt_dict) + + def check_ecap_sec_pci_regs( + self, + bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace], + ): + """Acquires ECapSecpci capability structure and checks for lane errors""" + CAP_STRUCTURE = ECapSecpci + bdf_ecap_secondary_pci = self.get_valid_cap_dict( + bdf_cfg_space_dict, CAP_STRUCTURE, log_event=True + ) + self.lane_error_status_chk(ecap_sec_pci_dict=bdf_ecap_secondary_pci) + + def check_ecap_aer_errors( + self, + bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace], + ): + """Acquires ECapAer capability structure and checks for AER errors""" + CAP_STRUCTURE = ECapAer + bdf_ecap_aer_error = self.get_valid_cap_dict( + bdf_cfg_space_dict, CAP_STRUCTURE, log_event=True + ) + self.check_uncorr_aer_errors(bdf_ecap_aer=bdf_ecap_aer_error) + self.check_corr_aer_errors(bdf_ecap_aer=bdf_ecap_aer_error) + + def check_pcie_exp_capability_structure_errors( + self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace] + ): + """Checks the 
PCIe Express capability structure for errors""" + CAP_STRUCTURE = PcieExp + bdf_pcie_express_dict = self.get_valid_cap_dict( + bdf_cfg_space_dict, CAP_STRUCTURE, log_event=False + ) + self.check_pcie_device_status_errors(bdf_pcie_express_dict=bdf_pcie_express_dict) + + def check_pcie_exp_capability_structure_config( + self, + bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace], + exp_max_payload_size: Optional[int] = None, + exp_max_rd_req_size: Optional[int] = None, + exp_ten_bit_tag_req_en: Optional[int] = None, + ): + """Checks the PCIe Express capability structure for errors""" + CAP_STRUCTURE = PcieExp + + bdf_pcie_express_dict = self.get_valid_cap_dict( + bdf_cfg_space_dict, CAP_STRUCTURE, log_event=True + ) + + if exp_max_payload_size is not None or exp_max_rd_req_size is not None: + self.check_pcie_dev_ctrl_reg( + bdf_pcie_express_dict=bdf_pcie_express_dict, + exp_max_payload_size=exp_max_payload_size, + exp_max_rd_req_size=exp_max_rd_req_size, + ) + + if exp_ten_bit_tag_req_en is not None: + self.check_pcie_dev_ctrl_2_reg( + bdf_pcie_express_dict=bdf_pcie_express_dict, + exp_ten_bit_tag_req_en=exp_ten_bit_tag_req_en, + ) + + @staticmethod + def filter_pcie_data_by_device_id( + bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace], + device_ids: Set[int], + ) -> Dict[BdfStr, PcieCfgSpace]: + """Filters the PCIe data by device ID + + Parameters + ---------- + device_ids : set[int] + Set of device IDs to filter by + + Returns + ------- + Dict[BdfStr, PcieCfgSpace] + Dictionary of BDFs and their corresponding PCIe configuration space + """ + new_cfg_space_dict: Dict[BdfStr, PcieCfgSpace] = {} + for bdf, pcie_data in bdf_cfg_space_dict.items(): + dev_id = pcie_data.type_0_configuration.device_id.val + if dev_id in device_ids: + new_cfg_space_dict[bdf] = pcie_data + return new_cfg_space_dict + + def check_gpu_count( + self, + pcie_data: PcieDataModel, + expected_gpu_count: Optional[int] = None, + ): + """Check if GPU count from PCIe data matches expected count + + 
Parameters + ---------- + pcie_data : PcieDataModel + PCIe data model containing collected PCIe configuration space data + expected_gpu_count : Optional[int], optional + Expected GPU count, by default None (no check performed) + """ + if expected_gpu_count is None: + return + + gpu_count_from_pcie = 0 + for cfg_space in pcie_data.pcie_cfg_space.values(): + vendor_id = cfg_space.type_0_configuration.vendor_id.val + if vendor_id == self.system_info.vendorid_ep: + gpu_count_from_pcie += 1 + + if gpu_count_from_pcie != expected_gpu_count: + self._log_event( + category=EventCategory.IO, + description="GPU count mismatch", + priority=EventPriority.ERROR, + data={ + "gpu_count_from_pcie": gpu_count_from_pcie, + "expected_gpu_count": expected_gpu_count, + }, + ) + else: + self._log_event( + category=EventCategory.IO, + description="GPU count matches expected", + priority=EventPriority.INFO, + data={ + "gpu_count": gpu_count_from_pcie, + }, + ) + + def analyze_data( + self, data: PcieDataModel, args: Optional[PcieAnalyzerArgs] = None + ) -> TaskResult: + """Check PCIe data for errors by analyzing the PCIe register space and + checking the enumeration of the GPUs and optional SR-IOV VFs + + Parameters + ---------- + data : PcieDataModel + PCIe data model containing collected PCIe configuration space data + args : Optional[PcieAnalyzerArgs], optional + Analyzer arguments containing expected values for validation, by default None + + Returns + ------- + TaskResult + Result of the analysis + """ + if args is None: + args = PcieAnalyzerArgs() + + exp_speed = args.exp_speed + exp_width = args.exp_width + exp_sriov_count = args.exp_sriov_count + exp_gpu_count_override = args.exp_gpu_count_override + exp_max_payload_size = normalize_to_dict( + args.exp_max_payload_size, self.system_info.vendorid_ep + ) + exp_max_rd_req_size = normalize_to_dict( + args.exp_max_rd_req_size, self.system_info.vendorid_ep + ) + exp_ten_bit_tag_req_en = normalize_to_dict( + args.exp_ten_bit_tag_req_en, 
self.system_info.vendorid_ep + ) + try: + pcie_input_data = PcieAnalyzerInputModel( + exp_speed=exp_speed, + exp_width=exp_width, + exp_sriov_count=exp_sriov_count, + exp_gpu_count_override=exp_gpu_count_override, + exp_ten_bit_tag_req_en=exp_ten_bit_tag_req_en, + exp_max_payload_size=exp_max_payload_size, + exp_max_rd_req_size=exp_max_rd_req_size, + ) + except ValidationError as val_error: + self._log_event( + category=EventCategory.RUNTIME, + description="User input for PcieAnalyzerModel is invalid", + priority=EventPriority.ERROR, + data={ + "validation_error": get_exception_traceback(val_error), + "valid_input": { + "exp_speed": "int, 1-5", + "exp_width": "int, 1-16", + "exp_sriov_count": "Optional[int]", + "exp_gpu_count_override": "Optional[int]", + }, + "actual_input": { + "exp_speed": exp_speed, + "exp_width": exp_width, + "exp_sriov_count": exp_sriov_count, + "exp_gpu_count_override": exp_gpu_count_override, + }, + }, + ) + return self.result + + pcie_data: PcieDataModel = data + + if pcie_data.pcie_cfg_space == {} and pcie_data.vf_pcie_cfg_space == {}: + # If both of the PCIe Configuration spaces are + self._log_event( + category=EventCategory.IO, + description="No PCIe config space found", + priority=EventPriority.WARNING, + ) + return self.result + + # Check every link in the PCIe configuration space for the expected capability structure, + # but don't check VF since those will be 0 + bdf_pcie_express_dict = self.get_valid_cap_dict( + pcie_data.pcie_cfg_space, + PcieExp, + log_event=True, + ) + self.check_link_status( + bdf_pcie_express_dict=bdf_pcie_express_dict, + exp_speed=exp_speed, + exp_width=exp_width, + ) + + amd_device_ids = set() + for cfg_space in pcie_data.pcie_cfg_space.values(): + vendor_id = cfg_space.type_0_configuration.vendor_id.val + device_id = cfg_space.type_0_configuration.device_id.val + if vendor_id == self.system_info.vendorid_ep and device_id is not None: + amd_device_ids.add(device_id) + + # Filter PCIe data for AMD GPUs + 
oam_pcie_data = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.pcie_cfg_space, + device_ids=amd_device_ids, + ) + + amd_vf_device_ids = set() + if pcie_data.vf_pcie_cfg_space is not None: + for cfg_space in pcie_data.vf_pcie_cfg_space.values(): + vendor_id = cfg_space.type_0_configuration.vendor_id.val + device_id = cfg_space.type_0_configuration.device_id.val + if vendor_id == self.system_info.vendorid_ep and device_id is not None: + amd_vf_device_ids.add(device_id) + + oam_vf_pcie_data = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.vf_pcie_cfg_space, + device_ids=amd_vf_device_ids, + ) + else: + oam_vf_pcie_data = {} + + # Include bridge/retimer devices (0x1500, 0x1501) + us_ds_retimer = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.pcie_cfg_space, + device_ids={0x1500, 0x1501}, + ) + ubb_data = {**oam_pcie_data, **us_ds_retimer} + ubb_data_with_vf = {**ubb_data, **oam_vf_pcie_data} + # Type 0 Configuration Space Checks + self.check_pcie_status_errors(bdf_cfg_space_dict=ubb_data_with_vf) + # Check other capability structures + dev_ids = set( + list(pcie_input_data.exp_max_payload_size.keys()) + + list(pcie_input_data.exp_max_rd_req_size.keys()) + + list(pcie_input_data.exp_ten_bit_tag_req_en.keys()) + ) + for device_id_to_check in dev_ids: + cfg_space_filtered = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.pcie_cfg_space, + device_ids={device_id_to_check}, + ) + self.check_pcie_exp_capability_structure_config( + cfg_space_filtered, + pcie_input_data.exp_max_payload_size.get(device_id_to_check), + pcie_input_data.exp_max_rd_req_size.get(device_id_to_check), + pcie_input_data.exp_ten_bit_tag_req_en.get(device_id_to_check), + ) + + # run with vfs for AERs and PCIe EXP errors + self.check_pcie_exp_capability_structure_errors(bdf_cfg_space_dict=ubb_data_with_vf) + self.check_ecap_aer_errors(bdf_cfg_space_dict=ubb_data_with_vf) + self.check_ecap_16gt_regs(bdf_cfg_space_dict=ubb_data) + 
self.check_ecap_sec_pci_regs(bdf_cfg_space_dict=ubb_data) + + if amd_device_ids: + self.device_consistancy_chk( + bdf_cfg_space_dict=ubb_data, + ) + else: + self._log_event( + category=EventCategory.RUNTIME, + description="No AMD GPU devices found, skipping device consistency check", + priority=EventPriority.INFO, + ) + + self.check_gpu_count(pcie_data, exp_gpu_count_override) + + return self.result diff --git a/nodescraper/plugins/inband/pcie/pcie_collector.py b/nodescraper/plugins/inband/pcie/pcie_collector.py new file mode 100755 index 00000000..d4c2108a --- /dev/null +++ b/nodescraper/plugins/inband/pcie/pcie_collector.py @@ -0,0 +1,690 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import re +from enum import Enum +from typing import Dict, List, Optional, Set, Tuple, Union + +from pydantic import ValidationError + +from nodescraper.base import InBandDataCollector +from nodescraper.connection.inband import TextFileArtifact +from nodescraper.enums import ( + EventCategory, + EventPriority, + ExecutionStatus, + OSFamily, + SystemInteractionLevel, +) +from nodescraper.models import TaskResult +from nodescraper.utils import get_all_subclasses, get_exception_details + +from .pcie_data import ( + MAX_CAP_ID, + MAX_ECAP_ID, + CapabilityEnum, + ExtendedCapabilityEnum, + PcieCapStructure, + PcieCfgSpace, + PcieDataModel, + Type0Configuration, + Type1Configuration, +) + + +class PcieCollector(InBandDataCollector[PcieDataModel, None]): + """class for collection of PCIe data only supports Linux OS type. + + This class collects the PCIE config space using the lspci hex dump and then parses the hex dump to get the + PCIe configuration space for the GPUs in the system. If the system interaction level is set to STANDARD or higher, + then the entire pcie configuration space is collected for the GPUs in the system. If the system interaction level + is set to SURFACE then, only the first 64 bytes of the pcie configuration space is collected for the GPUs in the system. 
+ + This class will collect important PCIe data from the system running the commands + - `lspci -vvv` : Verbose collection of PCIe data + - `lspci -vvvt`: Verbose tree view of PCIe data + - `lspci -PP`: Path view of PCIe data for the GPUs + - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo: + - `lspci -xxxx`: Hex view of PCIe data for the GPUs + - otherwise the following commands will be run without sudo: + - `lspci -x`: Hex view of PCIe data for the GPUs + - `lspci -d :` : Count the number of GPUs in the system with this command + - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo: + - The sudo lspci -xxxx command is used to collect the PCIe configuration space for the GPUs in the system + - otherwise the following commands will be run without sudo: + - The lspci -x command is used to collect the PCIe configuration space for the GPUs in the system + + """ + + SUPPORTED_OS_FAMILY: Set[OSFamily] = {OSFamily.LINUX} + + DATA_MODEL = PcieDataModel + + CMD_LSPCI_VERBOSE = "lspci -vvv" + CMD_LSPCI_VERBOSE_TREE = "lspci -vvvt" + CMD_LSPCI_PATH = "lspci -PP" + CMD_LSPCI_HEX_SUDO = "lspci -xxxx" + CMD_LSPCI_HEX = "lspci -x" + CMD_LSPCI_AMD_DEVICES = "lspci -d {vendor_id}: -nn" + CMD_LSPCI_PATH_DEVICE = "lspci -PP -d {vendor_id}:{dev_id}" + + def _detect_amd_device_ids(self) -> dict[str, list[str]]: + """Detect AMD GPU device IDs from the system using lspci. 
+ + Returns: + dict[str, list[str]]: Dictionary with 'vendor_id', 'device_ids', and 'vf_device_ids' + """ + vendor_id_hex = format(self.system_info.vendorid_ep, "x") + result: dict[str, list[str]] = { + "vendor_id": [vendor_id_hex], + "device_ids": [], + "vf_device_ids": [], + } + + res = self._run_sut_cmd( + self.CMD_LSPCI_AMD_DEVICES.format(vendor_id=vendor_id_hex), + sudo=False, + log_artifact=False, + ) + if res.exit_code == 0 and res.stdout: + # Pattern: [vendor:device] + device_id_pattern = rf"\[{vendor_id_hex}:([0-9a-fA-F]{{4}})\]" + # Pattern to detect VF in description + vf_pattern = r"Virtual Function" + + for line in res.stdout.splitlines(): + matches = re.findall(device_id_pattern, line) + if matches: + device_id = matches[0].lower() + # Check if it's a VF + if re.search(vf_pattern, line, re.IGNORECASE): + if device_id not in result["vf_device_ids"]: + result["vf_device_ids"].append(device_id) + self.logger.info(f"Detected AMD VF device ID: {device_id}") + else: + if device_id not in result["device_ids"]: + result["device_ids"].append(device_id) + self.logger.info(f"Detected AMD device ID: {device_id}") + + self._log_event( + category=EventCategory.IO, + description="Detected AMD GPU device IDs from system", + data=result, + priority=EventPriority.INFO, + ) + + return result + + def show_lspci_verbose(self, sudo=True) -> Optional[str]: + """Show lspci with -vvv.""" + return self._run_os_cmd(self.CMD_LSPCI_VERBOSE, sudo=sudo) + + def show_lspci_verbose_tree(self, sudo=True) -> Optional[str]: + """Show lspci with -vvvt (verbose tree view).""" + return self._run_os_cmd(self.CMD_LSPCI_VERBOSE_TREE, sudo=sudo) + + def show_lspci_path(self, sudo=True) -> Optional[str]: + """Show lspci with -PP.""" + return self._run_os_cmd(self.CMD_LSPCI_PATH, sudo=sudo) + + def show_lspci_hex(self, bdf: Optional[str] = None, sudo=True) -> Optional[str]: + """Show lspci with -xxxx.""" + if sudo: + hex_arg = "-xxxx" + else: + # Sudo required for whole pcie configuration space 
+ hex_arg = "-x" + + if bdf: + return self._run_os_cmd(f"lspci {hex_arg} -s {bdf}", sudo=sudo) + return self._run_os_cmd(f"lspci {hex_arg}", sudo=sudo) + + def _run_os_cmd( + self, command: str, sudo: bool = True, ignore_error: bool = False + ) -> Optional[str]: + """Run os command. Run as sudo by default. + + Args: + command (str): command to run on the OS + sudo (bool): run as sudo or not + ignore_error (bool): ignore error or not + Returns: + stdout: str + """ + cmd_ret = self._run_sut_cmd(command, sudo=sudo) + if ignore_error: + return cmd_ret.stdout + elif cmd_ret.stderr != "" or cmd_ret.exit_code != 0: + return None + else: + return cmd_ret.stdout + + def _get_upstream_bdf_from_buspath( + self, + vendor_id: str, + dev_id: str, + upstream_steps_limit: Optional[int] = 0, + sudo=True, + ) -> Optional[Dict[str, List[str]]]: + """Get all the upstream BDFs for a vendor/device id. + + Parameters + ---------- + vendor_id : str + A pcie vendor id + dev_id : str + A pcie device id + upstream_steps_limit : Optional[int] + The limit on the number of upstream devices to collect, by default 0 + sudo : bool + Run the command as sudo or not, by default True + + Returns + ------- + Optional[List[str]] + A list of upstream BDFs or None on failure + """ + split_bdf_pos = 0 + + bus_path_all_gpus = self._run_os_cmd(f"lspci -PP -d {vendor_id}:{dev_id}", sudo=sudo) + if bus_path_all_gpus is None or bus_path_all_gpus == "": + self._log_event( + category=EventCategory.IO, + description="Failed to get bus path info for vendor/device ID.", + data={"vendor_id": vendor_id, "dev_id": dev_id}, + priority=EventPriority.INFO, + ) + return None + upstream_bdfs: Dict[str, List[str]] = {} + for bus_path in bus_path_all_gpus.splitlines(): + bus_path_list = (bus_path.split(" ")[split_bdf_pos]).split("/") + if upstream_steps_limit is not None and len(bus_path_list) < upstream_steps_limit + 1: + # We don't have enough upstream devices to collect + self._log_event( + category=EventCategory.RUNTIME, 
+ description="Not enough upstream devices found.", + data={ + "bus_path": bus_path, + "upstream_steps_limit": upstream_steps_limit, + "bus_path_list": bus_path_list, + }, + priority=EventPriority.WARNING, + ) + bdf_str = bus_path_list[-1] + upstream_bdfs[bdf_str] = [] + # Flip the bus_path_list to get GPU first and then upstream devices + bus_path_list.reverse() + # Upstream + 1 to always include GPU and # of upstream devices + if upstream_steps_limit is None: + upstream_bdfs[bdf_str] = bus_path_list + else: + for bdf in range(min(len(bus_path_list), upstream_steps_limit + 1)): + upstream_bdfs[bdf_str].append(bus_path_list[bdf]) + + return upstream_bdfs + + def _get_gpu_cfg_space( + self, + vendor_id: str, + device_id: str, + upstream_steps_from_gpu: Optional[int] = 0, + sudo=True, + ) -> dict[str, PcieCfgSpace]: + """ + - Generates a nested dictionary with the PCIe configuration space for the bdfs corresponding to the vendor/device ID + - Populates the dict by reading cfg space through 'setpci' commands + + Args: + vendor_id (str): vendor ID (hex format) + device_id (str): device ID (hex format) + upstream_steps_from_gpu (Optional[int]): The number of upstream devices to collect the PCIe cfg space for, by default 0 + Returns: + all_bdf_cfg_space_dict: nested dictionary containing PCIe cfg space for all bdfs corresponding to the vendor/device ID + """ + if (vendor_id is None) or (device_id is None): + self._log_event( + category=EventCategory.IO, + description="System info is invalid Vendor ID or Device ID is None.", + data={"vendor_id": vendor_id, "dev_id": device_id}, + priority=EventPriority.ERROR, + ) + return {} + + bdf_list = self._get_upstream_bdf_from_buspath( + vendor_id, + device_id, + upstream_steps_limit=upstream_steps_from_gpu, + sudo=sudo, + ) + if bdf_list is None: + return {} + + all_bdf_cfg_space_dict = {} + for gpu_bdf_list in bdf_list.values(): + for bdf in gpu_bdf_list: + new_base_dict = self.get_cfg_by_bdf(bdf, sudo=sudo) + 
all_bdf_cfg_space_dict[bdf] = new_base_dict + return all_bdf_cfg_space_dict + + def parse_hex_dump(self, hex_dump: str) -> list[int]: + """Parse the hex dump.""" + + hex_dump = hex_dump.strip() + byte_list = [] + for line in hex_dump.splitlines(): + parts = line.split(":") + if len(parts) != 2: + continue # Skip malformed lines + if len(parts[1]) != 48: + continue # Unexpected number of bytes + byte_str = parts[1] + tokens = byte_str.strip().split() + for token in tokens: + byte = int(token, 16) + byte_list.append(byte) + + return byte_list + + def read_register(self, width: int, offset: int, config_data: List[int]): + """Read a register from the hex dump, width is in bits and should be 8, 16, 32, or 64""" + register_value = 0 + for i in range(0, width >> 3): + register_value += config_data[offset + i] << (i * 8) + return register_value + + def extended_cap_finder( + self, + config_data: List[int], + cap_pointer: int, + cap_data: Optional[Dict[int, int]] = None, + ): + """Obtain capability structure by parsing the hex dump for capability pointers + + config_data : List[int] + A list of int's representing the hex dump from lspci -x or sudo lspci -xxxx + cap_pointer : int + The hex value of a Capability pointer or 0x100 for the first extended cap pointer + cap_data : Optional[dict[int, int]], optional + A dictionary of capability pointers, by default None + + returns + ------- + cap_data : Dict[int, int] + A list of capability pointers, key is the cap_id and value is the cap_pointer use ExtendedCapabilityEnum(cap_id) to get the Name + """ + if cap_data is None: + cap_data = {} + if cap_pointer >= len(config_data) or cap_pointer + 1 >= len(config_data): + # prevent an illegal access to the list + return cap_data + cap_id = config_data[cap_pointer] + (config_data[cap_pointer + 1] << 8) + if cap_id > MAX_ECAP_ID: + # Break if the cap_id is greater than the max extended cap id + self._log_event( + category=EventCategory.IO, + description=f"Invalid Capability ID detected {cap_id}",
priority=EventPriority.ERROR, + data={"cap_id": cap_id}, + ) + return {} + cap_data[cap_id] = cap_pointer + if cap_pointer + 3 >= len(config_data): + return cap_data + next_cap_pointer = (config_data[cap_pointer + 2] & 0xF0) >> 4 + next_cap_pointer += config_data[cap_pointer + 3] << 4 + if next_cap_pointer == 0: + return cap_data + else: + return self.extended_cap_finder(config_data, next_cap_pointer, cap_data) + + def cap_finder( + self, + config_data: List[int], + cap_pointer: int, + cap_data: Optional[Dict[int, int]] = None, + ): + """Obtain capability structure by parsing the hex dump for capability pointers + + Parameters + ---------- + config_data : List[int] + A list of int's representing the hex dump from lspci -xxxx + cap_pointer : int + The hex value of a Capability pointer or 0x34 for the first cap pointer + cap_data : Optional[Dict[int, int]], optional + A dictionary of capability pointers, by default None + + Returns + ------- + cap_data : Dict[int, int] + A list of capability pointers, key is the cap_id and value is the cap_pointer use CapabilityEnum(cap_id) to get the Name + """ + if cap_data is None: + cap_data = {} + + if cap_pointer == 0x34: + # Special case for the first cap pointer, this one doesn't have an associated cap_id so just move on + return self.cap_finder(config_data, config_data[0x34], cap_data) + if cap_pointer >= len(config_data) or cap_pointer + 1 >= len(config_data): + # prevent an illegal access to the list + return cap_data + cap_id = config_data[cap_pointer] + if cap_id > MAX_CAP_ID: + # Break if the cap_id is greater than the max cap id + self._log_event( + category=EventCategory.IO, + description=f"Invalid Capability ID detected {cap_id}", + priority=EventPriority.ERROR, + data={"cap_id": cap_id}, + ) + return {} + next_cap_pointer = config_data[cap_pointer + 1] + cap_data[cap_id] = cap_pointer + if next_cap_pointer == 0: + return cap_data + else: + return self.cap_finder(config_data, next_cap_pointer,
cap_data) + + def get_cap_struct(self, id: Enum) -> Optional[type[PcieCapStructure]]: + for cap_struct in get_all_subclasses(PcieCapStructure): + if cap_struct.cap_id == id: + return cap_struct + return None + + def get_pcie_common_cfg( + self, + type_x_configuration: Union[type[Type0Configuration], type[Type1Configuration]], + config_data: List[int], + ) -> Union[Type0Configuration, Type1Configuration]: + """Get the Base PCIe configuration space from the hex dump items + + Parameters + ---------- + type_x_configuration : Union[type[Type0Configuration], type[Type1Configuration]] + Either Type0Configuration or Type1Configuration + config_data : List[int] + Config data from lspci -xxxx + + Returns + ------- + Union[Type0Configuration, Type1Configuration] + The complete model that was input + """ + register_data: Dict[str, int] = {} + type_x_obj = type_x_configuration() + for register_name, register_in in type_x_obj.iter_regs(): + register = register_in.model_copy() + register_data[register_name] = self.read_register( + register.width, register.offset, config_data + ) + type_x_obj.set_regs(register_data) + return type_x_obj + + def get_cap_cfg( + self, + cap_data: Dict[int, int], + config_data: List[int], + ) -> Union[ + Dict[CapabilityEnum, PcieCapStructure], Dict[ExtendedCapabilityEnum, PcieCapStructure] + ]: + """Get the data from the capability structures + + Parameters + ---------- + cap_data : Dict[int,int] + A list of capability pointers, key is the cap_id and value is the cap_pointer + config_data : List[int] + A list of ints representing the hex dump from lspci -xxxx + + Returns + ------- + Union[Dict[CapabilityEnum, PcieCapStructure], Dict[ExtendedCapabilityEnum, PcieCapStructure]] + Either a dict of CapabilityEnum to PcieCapStructure or ExtendedCapabilityEnum to PcieCapStructure + + """ + cap_structure: Dict[Enum, PcieCapStructure] = {} + for cap_id, cap_addr in cap_data.items(): + if cap_id == 0: + continue + if cap_addr >= 0x100: + cap_enum: Enum = 
ExtendedCapabilityEnum(cap_id) + else: + cap_enum = CapabilityEnum(cap_id) + cap_cls = self.get_cap_struct(cap_enum) + if cap_cls is None: + continue + cap_obj = cap_cls() # type: ignore[call-arg] + reg_data = {} + for register_name, register in cap_obj.iter_regs(): + reg_data[register_name] = self.read_register( + register.width, register.offset + cap_addr, config_data + ) + cap_obj.set_regs(reg_data) + cap_obj.offset = cap_addr + cap_structure[cap_enum] = cap_obj + + return cap_structure # type: ignore[return-value] + + def get_cfg_by_bdf(self, bdf: str, sudo=True) -> PcieCfgSpace: + """Will fill out a PcieCfgSpace object with the PCIe configuration space for a given BDF""" + hex_data_raw = self.show_lspci_hex(bdf, sudo=sudo) + if hex_data_raw is None: + self._log_event( + category=EventCategory.IO, + description="Failed to get hex data for BDF.", + data={"bdf": bdf}, + priority=EventPriority.ERROR, + ) + return PcieCfgSpace() + hex_data: List[int] = self.parse_hex_dump(hex_data_raw) + if len(hex_data) < 64: + # Expect at least 64 bytes of data, covering the standard PCIe configuration header + self._log_event( + category=EventCategory.IO, + description="Hex data is not the expected length", + data={"bdf": bdf, "length": len(hex_data)}, + priority=EventPriority.ERROR, + ) + return PcieCfgSpace() + cap_data, ecap_data = self.discover_capability_structure(hex_data) + return self.get_pcie_cfg(hex_data, cap_data, ecap_data) + + def get_pcie_cfg( + self, + config_data: List[int], + cap_data: Dict[int, int], + ecap_data: Dict[int, int], + ) -> PcieCfgSpace: + """Gets the pcie config space from a list of ints + + Parameters + ---------- + config_data : List[int] + A list of ints representing the hex dump from lspci -xxxx + cap_data : Dict[int, int] + A list of capability pointers, key is the cap_id and value is the cap_pointer + ecap_data : Dict[int, int] + A list of extended capability pointers, key is the cap_id and value is the cap_pointer + + Returns + ------- + PcieCfgSpace + A PcieCfgSpace object with the PCIe configuration + """ + type0 =
self.get_pcie_common_cfg(Type0Configuration, config_data) + type1 = self.get_pcie_common_cfg(Type1Configuration, config_data) + cap = self.get_cap_cfg(cap_data, config_data) + ecap = self.get_cap_cfg(ecap_data, config_data) + return PcieCfgSpace( + type_0_configuration=type0, # type: ignore[arg-type] + type_1_configuration=type1, # type: ignore[arg-type] + capability_pointers=cap_data, # type: ignore[arg-type] + extended_capability_pointers=ecap_data, # type: ignore[arg-type] + cap_structure=cap, # type: ignore[arg-type] + ecap_structure=ecap, # type: ignore[arg-type] + ) + + def _log_pcie_artifacts( + self, + lspci_pp: Optional[str], + lspci_hex: Optional[str], + lspci_verbose_tree: Optional[str], + lspci_verbose: Optional[str], + ): + """Log the file artifacts for the PCIe data collector.""" + name_log_map = { + "lspci_hex.txt": lspci_hex, + "lspci_verbose_tree.txt": lspci_verbose_tree, + "lspci_verbose.txt": lspci_verbose, + "lspci_pp.txt": lspci_pp, + } + for name, data in name_log_map.items(): + if data is not None: + self.result.artifacts.append(TextFileArtifact(filename=name, contents=data)) + + def _get_pcie_data( + self, upstream_steps_to_collect: Optional[int] = None + ) -> Optional[PcieDataModel]: + """Will return all PCIe data in a PcieDataModel object. 
+ + Returns + ------- + Optional[PcieDataModel] + The data in a PcieDataModel object or None on failure + """ + minimum_system_interaction_level_required_for_sudo = SystemInteractionLevel.INTERACTIVE + + try: + if ( + isinstance(self.system_interaction_level, SystemInteractionLevel) + and self.system_interaction_level + >= minimum_system_interaction_level_required_for_sudo + ): + use_sudo = True + else: + use_sudo = False + + if upstream_steps_to_collect is None: + upstream_steps_to_collect = None + + # Detect AMD device IDs dynamically from the system + detected_devices = self._detect_amd_device_ids() + vendor_id = ( + detected_devices["vendor_id"][0] + if detected_devices["vendor_id"] + else format(self.system_info.vendorid_ep, "x") + ) + device_ids = detected_devices["device_ids"] + vf_device_ids = detected_devices["vf_device_ids"] + + pcie_cfg_dict: Dict[str, PcieCfgSpace] = {} + vf_pcie_cfg_data: Dict[str, PcieCfgSpace] = {} + + # Collect PCIe config space for each detected device ID + for dev_id in device_ids: + cfg_space = self._get_gpu_cfg_space( + vendor_id=vendor_id, + device_id=dev_id, + upstream_steps_from_gpu=upstream_steps_to_collect, + sudo=use_sudo, + ) + if cfg_space: + pcie_cfg_dict.update(cfg_space) + + # Collect VF PCIe config space for each detected VF device ID + for dev_id_vf in vf_device_ids: + vf_cfg_space = self._get_gpu_cfg_space( + vendor_id=vendor_id, + device_id=dev_id_vf, + upstream_steps_from_gpu=0, + sudo=use_sudo, + ) + if vf_cfg_space: + vf_pcie_cfg_data.update(vf_cfg_space) + + lspci_hex = self.show_lspci_hex(sudo=use_sudo) + lspci_verbose = self.show_lspci_verbose(sudo=use_sudo) + lspci_verbose_tree = self.show_lspci_verbose_tree(sudo=use_sudo) + lspci_path = self.show_lspci_path(sudo=use_sudo) + self._log_pcie_artifacts( + lspci_pp=lspci_path, + lspci_hex=lspci_hex, + lspci_verbose_tree=lspci_verbose_tree, + lspci_verbose=lspci_verbose, + ) + pcie_data = PcieDataModel( + pcie_cfg_space=pcie_cfg_dict, + 
vf_pcie_cfg_space=vf_pcie_cfg_data, + ) + except ValidationError as e: + self._log_event( + category=EventCategory.OS, + description="Failed to build model for PCIe data", + data=get_exception_details(e), + priority=EventPriority.ERROR, + ) + self.result.status = ExecutionStatus.ERROR + return None + return pcie_data + + def discover_capability_structure( + self, hex_dump: List[int] + ) -> Tuple[Dict[int, int], Dict[int, int]]: + """Obtain the capability structure by parsing the hex dump for capability pointers + + Parameters + ---------- + hex_dump : List[int] + A list of ints from lspci -xxxx + + Returns + ------- + dict[int, int] + A list of capability pointers, key is the cap_id and value is the cap_pointer + """ + cap = self.cap_finder(hex_dump, 0x34) + ecap = self.extended_cap_finder(hex_dump, 0x100) + return cap, ecap + + def collect_data( + self, args=None, upstream_steps_to_collect: Optional[int] = None, **kwargs + ) -> Tuple[TaskResult, Optional[PcieDataModel]]: + """Read PCIe data. + + Args: + args: Optional collector arguments (not used) + upstream_steps_to_collect: Number of upstream devices to collect + **kwargs: Additional keyword arguments + + Returns: + Tuple[TaskResult, Optional[PcieDataModel]]: tuple containing the result of the task and the PCIe data if available + """ + pcie_data = self._get_pcie_data(upstream_steps_to_collect) + if pcie_data: + self._log_event( + category=EventCategory.IO, + description="PCIe Data read from GPUs", + data={"bdf_count": len(pcie_data.pcie_cfg_space.keys())}, + priority=EventPriority.INFO, + ) + return self.result, pcie_data diff --git a/nodescraper/plugins/inband/pcie/pcie_data.py b/nodescraper/plugins/inband/pcie/pcie_data.py new file mode 100644 index 00000000..77ea0e1c --- /dev/null +++ b/nodescraper/plugins/inband/pcie/pcie_data.py @@ -0,0 +1,2017 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from enum import Enum +from typing import ( + Annotated, + Any, + ClassVar, + Dict, + Generator, + List, + Optional, + TypeVar, + Union, +) + +from pydantic import ( + AfterValidator, + BaseModel, + SerializeAsAny, + field_serializer, + field_validator, +) + +from nodescraper.models import DataModel +from nodescraper.utils import apply_bit_mask_int + +AnyCap = TypeVar("AnyCap") + + +def validate_bdf(bdf: str) -> str: + """Validate the bus-device-function string format""" + if not isinstance(bdf, str): + raise ValueError("BDF must be a string") + # Shall only contain hex digits, `.`, `:`, and `-` + if not all(c in "0123456789abcdefABCDEF.-:" for c in bdf): + raise ValueError("BDF must only contain hex digits, '.', ':', and '-'") + # TODO: Could add more specific validation for the format, e.g., 00:00.0 + return bdf + + +BdfStr = Annotated[str, AfterValidator(validate_bdf)] + + +def field_hex_val_serializer(self, value: Optional[int], _info) -> Optional[str]: + if value is None: + return None + return str(hex(value)) + + +def field_hex_val_validator(value: Optional[str]) -> Optional[int]: + if value is None: + return None + return int(value, 16) + + +class CapabilityEnum(Enum): + """This enum holds the capability IDs for PCI Configuration Space""" + + BASE_REGISTER = 0x00 # Null Capability + PM = 0x01 # PCI Power Management Interface + AGP = 0x02 # AGP + VPD = 0x03 # VPD + SLOTID = 0x04 # Slot Identification + MSI = 0x05 # MSI + COMPACT_PCI_HS = 0x06 # CompactPCI Hot Swap + PCIX = 0x07 # PCI-X + HYPERTRANS = 0x08 # HyperTransport + VENDOR = 0x09 # Vendor-specific + DEBUG_PORT = 0x0A # Debug Port + COMPACT_PCI_CENTRAL = 0x0B # CompactPCI Central Resource Control + PCI_HP = 0x0C # PCI Hot Plug + PCI_BRIDGE = 0x0D # PCI Bridge Subsstem ID + AGP_8X = 0x0E # AGP 8x y + SECURE_DEV = 0x0F # Secure Device + PCIE_EXP = 0x10 # PCI Express + MSIX = 0x11 # MSI-X + SATA = 0x12 # Serial ATA 
Data/Index + AF = 0x13 # Advanced Features + EA = 0x14 # Enhanced Allocation . + FPB = 0x15 # Flattening Portal Bridge (FPB) + + +MAX_CAP_ID = max(cap_id.value for cap_id in CapabilityEnum) + + +class ExtendedCapabilityEnum(Enum): + """This enum holds the extended capability IDs for PCI Configuration Space""" + + NULL = 0x0000 # Null Capability + AER = 0x0001 # Advanced Error Reporting Extended + VCEC = 0x0002 # Virtual Channel Extended Capability + DSN = 0x0003 # Device Serial Number Extended Capability + PWR_BUDGET = 0x0004 # Power Budgeting Extended Capability + LNK_DCLR = 0x0005 # Root Complex Link Declaration Extended Capability + LNK_CEC = 0x0006 # Root Complex Internal Link Control Extended Capability + RCECOLL = 0x0007 # Root Complex Event Collector Endpoint Association Extended Capability + MFVC = 0x0008 # Multi-Function Virtual Channel Extended Capability + VC2 = 0x0009 # Virtual Channel Extended Capability + RCRB = 0x000A # RCRB Header Extended Capability + VNDR = 0x000B # Vendor-specific Extended Capability + CAC = 0x000C # Configuration Access Correlation Extended Capability + ACS = 0x000D # ACS Extended Capability + ARI = 0x000E # ARI Extended Capability (ARI) + ATS = 0x000F # ATS Extended Capability + SRIOV = 0x0010 # SR-IOV Extended Capability + MRIOV = 0x0011 # MR-IOV Extended Capability (MR-IOV) Must not implement. + MULTCAST = 0x0012 # Multicast Extended Capability + PAGE_REQ = 0x0013 # Page Request Extended Capability (PRI) + AMD = 0x0014 # Reserved for AMD + RBAR = 0x0015 # Resizable BAR Extended Capability + DPA = 0x0016 # Dynamic Power Allocation Extended Capability (DPA) + TPH = 0x0017 # TPH Requester Extended Capability + LTR = ( + 0x0018 # LTR Extended Capability . LTR is controlled using Function 0 which is never a VF. + ) + SPCI = 0x0019 # Secondary PCI Express Extended Capability + PMUX = 0x001A # PMUX Extended Capability . PMUX is controlled using Function 0 which is never a VF. 
+ PASID = 0x001B # PASID Extended Capability + LN = 0x001C # LN Requester Extended Capability (LNR) + DPC = 0x001D # DPC Extended Capability. + L1PM = 0x001E # L1 PM Substates Extended Capability . L1 PM Substates is controlled using Function 0 which is never a VF. + PTM = 0x001F # Precision Time Management Extended Capability (PTM) + MPCIE = 0x0020 # PCI Express over M-PHY Extended Capability (M-PCIe) + FRS = 0x0021 # FRS Queueing Extended Capability + RTR = 0x0022 # Readiness Time Reporting Extended Capability + DVENDR = 0x0023 # Designated vendor-specific Extended Capability + VFBAR = 0x0024 # VF Resizable BAR Extended Capability + DLF = 0x0025 # Data Link Feature Extended Capability . + PL_16GT = 0x0026 # Physical Layer 16.0 GT/s Extended Capability + LM = 0x0027 # Lane Margining at the Receiver Extended Capability + HID = 0x0028 # Hierarchy ID Extended Capability + NPEM = 0x0029 # Native PCIe Enclosure Management Extended Capability (NPEM) + PL_32GT = 0x002A # Physical Layer 32.0 GT/s Extended Capability + ALT_PROTOCOL = 0x002B # Alternate Protocol Extended Capability + SFI = 0x002C # System Firmware Intermediary (SFI)Extended Capability + DOE = 0x2E # 0x2e Data Object Exchange + INT_DOE = 0x30 # 0x30 Integrity and Data Encryption + + +MAX_ECAP_ID = max(cap_id.value for cap_id in ExtendedCapabilityEnum) + + +class PcieBitField(BaseModel): + """Holds data about a bit field including bit_mask and description and a method to get its value""" + + bit_mask: int + desc: str + val: Optional[int] = None + + def set_val(self, reg_val: Optional[int]): + """This will apply the bitmask and shift the value to get the bit field value""" + if reg_val is None: + self.val = None + else: + self.val = apply_bit_mask_int(reg_val, self.bit_mask) + + def get_val(self) -> Optional[int]: + """Returns the value of the bit field""" + return self.val + + def apply_mask(self, reg_val) -> Optional[int]: + """This will apply the bitmask and shift the value to get the bit field value + Ex: 
reg_val = 0x1200, bit_mask = 0xFF00, then the value of the bit field is 0x1200 & 0xFF00 -> 0x1200 >> 8 -> 0x12 + """ + if reg_val is None: + return None + else: + return apply_bit_mask_int(reg_val, self.bit_mask) + + validate_val = field_validator("val", mode="before")(field_hex_val_validator) + serialize_val = field_serializer("val")(field_hex_val_serializer) + + +class PcieRegister(BaseModel): + """Holds data about a register including its position, width, value, bit fields and a method to get the value of a bit field + setpci_name is the name of the register in setpci output --dumpregs""" + + width: int + offset: int + val: Optional[int] = None + desc: str = "" + err: Optional[str] = None + + def iter_fields(self) -> Generator[tuple[str, PcieBitField], Any, None]: + """Iterator for bit fields in the register""" + for name, value in iter(self): + if isinstance(value, PcieBitField): + yield name, value + + @property + def bit_fields(self) -> dict[str, PcieBitField]: + """Get all the bit fields in the register""" + return {name: value for name, value in self.iter_fields()} + + # This will serialize the value of the register as hex + serialize_val = field_serializer("val")(field_hex_val_serializer) + + # This will validate the value of the register from hex to int + validate_val = field_validator("val", mode="before")(field_hex_val_validator) + + def __setattr__(self, name, value): + """When the value of the register is set, set all the bit fields in the register automatically + otherwise just set the value""" + if name == "val": + # set all .vals in all bitfields + for _, field in self.iter_fields(): + field.set_val(value) + super().__setattr__(name, value) + + +class PcieCapStructure(BaseModel): + """Holds the capability and extended capability info including the ID and description as well as + the registers that exists within that capability structure.""" + + cap_id: ClassVar[Enum] + desc: str + offset: int = 0 + extended: Optional[bool] = False + + def 
iter_regs(self) -> Generator[tuple[str, PcieRegister], Any, None]: + """Iterator for bit fields in the register""" + for name, value in iter(self): + if isinstance(value, PcieRegister): + yield name, value + + def set_regs(self, values: Dict[str, int]): + for name, value in iter(self): + if isinstance(value, PcieRegister): + value.val = values.get(name, None) + + def null_err_regs(self, filters: Optional[List[str]] = None): + """Set all registers to None, except those in the filters list""" + err_null = [] + for name, reg in self.iter_regs(): + if filters is not None: + if name in filters and (reg.val is None or reg.err is not None): + err_null.append(name) + elif filters is None: + if reg.val is None or reg.err is not None: + err_null.append(name) + return err_null + + +def cap_id_to_class( + cap_id: Union[CapabilityEnum, ExtendedCapabilityEnum], +) -> Optional[type[PcieCapStructure]]: + """Convert a generic PcieCapStructure to a Specific PcieCapStructure based on the cap_id + + Parameters + ---------- + cap_id : Union[CapabilityEnum, ExtendedCapabilityEnum] + A capability ID + + Returns + ------- + Optional[type[PcieCapStructure]] + A specific PcieCapStructure class or None if not found + """ + for cls in PcieCapStructure.__subclasses__(): + if cls.cap_id == cap_id: + return cls + return None + + +class CommandRegister(PcieRegister): + """Command Register in PCI Configuration Space""" + + offset: int = 0x04 + width: int = 16 + io_space_en: PcieBitField = PcieBitField(bit_mask=0x1, desc="I/O Space Enable") + mem_space_en: PcieBitField = PcieBitField(bit_mask=0x2, desc="Memory Space Enable") + bus_mstr_en: PcieBitField = PcieBitField(bit_mask=0x4, desc="Bus Master Enable") + spec_cyc_en: PcieBitField = PcieBitField(bit_mask=0x8, desc="Special Cycle Enable") + mem_wr_inval: PcieBitField = PcieBitField(bit_mask=0x10, desc="Memory Write and Invalidate") + vga_pal_snoop: PcieBitField = PcieBitField(bit_mask=0x20, desc="VGA Palette Snoop") + parity_err_res: PcieBitField 
= PcieBitField(bit_mask=0x40, desc="Parity Error Response") + idsel_step_wait_cyc_ctrl: PcieBitField = PcieBitField( + bit_mask=0x80, desc="IDSEL Stepping/Wait Cycle Control" + ) + serr_en: PcieBitField = PcieBitField(bit_mask=0x100, desc="SERR# Enable") + fast_b2b_trans_en: PcieBitField = PcieBitField( + bit_mask=0x200, desc="Fast Back-to-Back Transactions Enable" + ) + int_dis: PcieBitField = PcieBitField(bit_mask=0x400, desc="Interrupt Disable") + + +class StatusRegister(PcieRegister): + """Status Register in PCI Configuration Space""" + + offset: int = 0x06 + width: int = 16 + desc: str = "Status Register" + immed_readiness: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="Immediate Readiness") + int_stat: PcieBitField = PcieBitField(bit_mask=(1 << 3), desc="Interrupt Status") + cap_list: PcieBitField = PcieBitField(bit_mask=(1 << 4), desc="Capabilities List") + sixty_six_mhz_cap: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="66 MHz Capable") + fast_b2b_trans_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 7), desc="Fast Back-to-Back Transactions Capable" + ) + mstr_data_par_err: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="Master Data Parity Error" + ) + devsel_timing: PcieBitField = PcieBitField(bit_mask=(0b11 << 9), desc="DEVSEL Timing") + signaled_target_abort: PcieBitField = PcieBitField( + bit_mask=(1 << 11), desc="Signaled Target Abort" + ) + rcvd_target_abort: PcieBitField = PcieBitField(bit_mask=(1 << 12), desc="Received Target Abort") + rcvd_mstr_abort: PcieBitField = PcieBitField(bit_mask=(1 << 13), desc="Received Master Abort") + signaled_sys_err: PcieBitField = PcieBitField(bit_mask=(1 << 14), desc="Signaled System Error") + det_parity_err: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Detected Parity Error") + + +class Type01Common(PcieCapStructure): + """Common fields for Type 01""" + + cap_id: ClassVar[Enum] = CapabilityEnum.BASE_REGISTER + desc: str = "Type 0/1 Common Configuration Space" + vendor_id: 
PcieRegister = PcieRegister(width=16, offset=0x00) + device_id: PcieRegister = PcieRegister(width=16, offset=0x02) + command: CommandRegister = CommandRegister() + status: StatusRegister = StatusRegister() + revision_id: PcieRegister = PcieRegister(width=8, offset=0x08) + prog_if: PcieRegister = PcieRegister(width=8, offset=0x09) + subclass: PcieRegister = PcieRegister(width=8, offset=0x0A) + class_code: PcieRegister = PcieRegister(width=8, offset=0x0B) + cache_line_size: PcieRegister = PcieRegister(width=8, offset=0x0C) + latency_timer: PcieRegister = PcieRegister(width=8, offset=0x0D) + header_type: PcieRegister = PcieRegister(width=8, offset=0x0E) + bist: PcieRegister = PcieRegister(width=8, offset=0x0F) + + +class Type0Configuration(Type01Common): + """Type 0 Specific Common Configuration Space""" + + cap_id: ClassVar[Enum] = CapabilityEnum.BASE_REGISTER + desc: str = "Type 0 Specific Common Configuration Space" + base_address_0: PcieRegister = PcieRegister( + offset=0x10, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h) / 7.5.1.3.1 Type 1 Base Address Registers (Offset 10h-14h)", + ) + base_address_1: PcieRegister = PcieRegister( + offset=0x14, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h) / 7.5.1.3.1 Type 1 Base Address Registers (Offset 10h-14h)", + ) + base_address_2: PcieRegister = PcieRegister( + offset=0x18, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + base_address_3: PcieRegister = PcieRegister( + offset=0x1C, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + base_address_4: PcieRegister = PcieRegister( + offset=0x20, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + base_address_5: PcieRegister = PcieRegister( + offset=0x24, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + cardbus_cis: PcieRegister = PcieRegister( + offset=0x28, + width=32, + desc="7.5.1.2.2 Cardbus CIS Pointer 
Register (Offset 28h)", + ) + subsystem_vendor_id: PcieRegister = PcieRegister( + offset=0x2C, + width=16, + desc="7.5.1.2.3 Subsystem Vendor ID Register/Subsystem ID Register (Offset 2Ch/2Eh)", + ) + subsystem_id: PcieRegister = PcieRegister( + offset=0x2E, + width=16, + desc="7.5.1.2.3 Subsystem Vendor ID Register/Subsystem ID Register (Offset 2Ch/2Eh)", + ) + rom_address: PcieRegister = PcieRegister( + offset=0x30, + width=32, + desc="7.5.1.2.4 Expansion ROM Base Address Register (Offset 30h)", + ) + min_gnt: PcieRegister = PcieRegister( + offset=0x3E, + width=8, + desc="7.5.1.2.5 Min_Gnt Register/Max_Lat Register (Offset 3Eh/3Fh)", + ) + max_lat: PcieRegister = PcieRegister( + offset=0x3F, + width=8, + desc="7.5.1.2.5 Min_Gnt Register/Max_Lat Register (Offset 3Eh/3Fh)", + ) + + +class SecStatusRegister(PcieRegister): + """Sec Status reg for Type 1""" + + offset: int = 0x1E + width: int = 16 + desc: str = "Secondary Status Register" + sixty_six_mhz_cap: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="66 MHz Capable") + fast_b2b_trans_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 7), desc="Fast Back-to-Back Transactions Capable" + ) + mstr_data_par_err: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="Master Data Parity Error" + ) + devsel_timing: PcieBitField = PcieBitField(bit_mask=(0b11 << 9), desc="DEVSEL Timing") + signaled_target_abort: PcieBitField = PcieBitField( + bit_mask=(1 << 11), desc="Signaled Target Abort" + ) + rcvd_target_abort: PcieBitField = PcieBitField(bit_mask=(1 << 12), desc="Received Target Abort") + rcvd_mstr_abort: PcieBitField = PcieBitField(bit_mask=(1 << 13), desc="Received Master Abort") + rcvd_sys_err: PcieBitField = PcieBitField(bit_mask=(1 << 14), desc="Received System Error") + det_parity_err: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Detected Parity Error") + + +class BridgeControlRegister(PcieRegister): + """Bridge controller register Specific to Type 1""" + + offset: int = 0x3E + width: int = 16 + 
desc: str = "7.5.1.3.13 Bridge Control Register (Offset 3Eh)" + parity_err_res_en: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Parity Error Response Enable" + ) + serr_en: PcieBitField = PcieBitField(bit_mask=(1 << 1), desc="SERR# Enable") + isa_en: PcieBitField = PcieBitField(bit_mask=(1 << 2), desc="ISA Enable") + vga_en: PcieBitField = PcieBitField(bit_mask=(1 << 3), desc="VGA Enable") + vga_16_bit_dec: PcieBitField = PcieBitField(bit_mask=(1 << 4), desc="VGA 16-bit Decode") + mstr_abort_mode: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="Master Abort Mode") + sec_bus_rst: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="Secondary Bus Reset") + fast_b2b_trans_en: PcieBitField = PcieBitField( + bit_mask=(1 << 7), desc="Fast Back-to-Back Transactions Enable" + ) + primary_discard_timer: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="Primary Discard Timer" + ) + sec_discard_timer: PcieBitField = PcieBitField( + bit_mask=(1 << 9), desc="Secondary Discard Timer" + ) + discard_timer_stat: PcieBitField = PcieBitField(bit_mask=(1 << 10), desc="Discard Timer Status") + discard_timer_serr_en: PcieBitField = PcieBitField( + bit_mask=(1 << 11), desc="Discard Timer SERR# Enable" + ) + + +class Type1Configuration(Type01Common): + """Type 1 Specific Common Configuration Space""" + + cap_id: ClassVar[Enum] = CapabilityEnum.BASE_REGISTER + desc: str = "Type 1 Specific Common Configuration Space" + PRIMARY_BUS: PcieRegister = PcieRegister( + offset=0x18, width=8, desc="7.5.1.3.2 Primary Bus Number Register (Offset 18h)" + ) + SECONDARY_BUS: PcieRegister = PcieRegister( + offset=0x19, + width=8, + desc="7.5.1.3.3 Secondary Bus Number Register (Offset 19h)", + ) + SUBORDINATE_BUS: PcieRegister = PcieRegister( + offset=0x1A, + width=8, + desc="7.5.1.3.4 Subordinate Bus Number Register (Offset 1Ah)", + ) + SEC_LATENCY_TIMER: PcieRegister = PcieRegister( + offset=0x1B, width=8, desc="7.5.1.3.5 Secondary Latency Timer (Offset 1Bh)" + ) + IO_BASE: 
PcieRegister = PcieRegister( + offset=0x1C, + width=8, + desc="7.5.1.3.6 I/O Base/I/O Limit Registers(Offset 1Ch/1Dh)", + ) + IO_LIMIT: PcieRegister = PcieRegister( + offset=0x1D, + width=8, + desc="7.5.1.3.6 I/O Base/I/O Limit Registers(Offset 1Ch/1Dh)", + ) + MEMORY_BASE: PcieRegister = PcieRegister( + offset=0x20, + width=16, + desc="7.5.1.3.8 Memory Base Register/Memory Limit Register(Offset 20h/22h)", + ) + MEMORY_LIMIT: PcieRegister = PcieRegister( + offset=0x22, + width=16, + desc="7.5.1.3.8 Memory Base Register/Memory Limit Register(Offset 20h/22h)", + ) + PREF_MEMORY_BASE: PcieRegister = PcieRegister( + offset=0x24, + width=16, + desc="7.5.1.3.9 Prefetchable Memory Base/Prefetchable Memory Limit Registers (Offset 24h/26h)", + ) + PREF_MEMORY_LIMIT: PcieRegister = PcieRegister( + offset=0x26, + width=16, + desc="7.5.1.3.9 Prefetchable Memory Base/Prefetchable Memory Limit Registers (Offset 24h/26h)", + ) + PREF_BASE_UPPER32: PcieRegister = PcieRegister( + offset=0x28, + width=32, + desc="7.5.1.3.10 Prefetchable Base Upper 32 Bits/Prefetchable Limit Upper 32 Bits Registers (Offset 28h/2Ch)", + ) + PREF_LIMIT_UPPER32: PcieRegister = PcieRegister( + offset=0x2C, + width=32, + desc="7.5.1.3.10 Prefetchable Base Upper 32 Bits/Prefetchable Limit Upper 32 Bits Registers (Offset 28h/2Ch)", + ) + IO_BASE_UPPER16: PcieRegister = PcieRegister( + offset=0x30, + width=16, + desc="7.5.1.3.11 I/O Base Upper 16 Bits/I/O Limit Upper 16 Bits Registers (Offset 30h/32h)", + ) + IO_LIMIT_UPPER16: PcieRegister = PcieRegister( + offset=0x32, + width=16, + desc="7.5.1.3.11 I/O Base Upper 16 Bits/I/O Limit Upper 16 Bits Registers (Offset 30h/32h)", + ) + BRIDGE_ROM_ADDRESS: PcieRegister = PcieRegister( + offset=0x38, + width=32, + desc="7.5.1.3.12 Expansion ROM Base Address Register (Offset 38h)", + ) + + +class CapPm(PcieCapStructure): + """Capability Structure for Power Management""" + + cap_id: ClassVar[Enum] = CapabilityEnum.PM + desc: str = "PCI Power Management Interface (9.6 
SR-IOV Power Management)" + + +class CapAgp(PcieCapStructure): + """Capability Structure for Accelerated Graphics Port""" + + cap_id: ClassVar[Enum] = CapabilityEnum.AGP + desc: str = "" + + +class CapVpd(PcieCapStructure): + """Capability Structure for Virtual Product Data""" + + cap_id: ClassVar[Enum] = CapabilityEnum.VPD + desc: str = "VPD (9.3.6.1 VPD Capability)" + + +class CapSlotid(PcieCapStructure): + """Capability Structure for Slot Identification""" + + cap_id: ClassVar[Enum] = CapabilityEnum.SLOTID + desc: str = "Slot Identification" + + +class CapMsi(PcieCapStructure): + """Capability Structure for Message Signaled Interrupts""" + + cap_id: ClassVar[Enum] = CapabilityEnum.MSI + desc: str = "7.7.1 MSI Capability Structures" + + +class CapCompatHotSwp(PcieCapStructure): + """Cap for CompactPCI Hot Swap""" + + cap_id: ClassVar[Enum] = CapabilityEnum.COMPACT_PCI_HS + desc: str = "CompactPCI Hot Swap" + + +class CapPcix(PcieCapStructure): + """Cap for PCI Extensions""" + + cap_id: ClassVar[Enum] = CapabilityEnum.PCIX + desc: str = "PCI-X" + + +class CapHt(PcieCapStructure): + """HyperTransport Capability""" + + cap_id: ClassVar[Enum] = CapabilityEnum.HYPERTRANS + desc: str = "HyperTransport" + + +class CapVndr(PcieCapStructure): + """Vendor Specific Capability""" + + cap_id: ClassVar[Enum] = CapabilityEnum.VENDOR + desc: str = "7.9.4 Vendor-Specific Capability" + + +class CapDbg(PcieCapStructure): + """Capability for Debug Port""" + + cap_id: ClassVar[Enum] = CapabilityEnum.DEBUG_PORT + desc: str = "Debug Port" + + +class CapCompatPcieCentral(PcieCapStructure): + """Capability for CompactPCI Central Resource Control""" + + cap_id: ClassVar[Enum] = CapabilityEnum.COMPACT_PCI_CENTRAL + desc: str = "CompactPCI Central Resource Control" + + +class CapHotplug(PcieCapStructure): + """Capability for PCI Hot Plug""" + + cap_id: ClassVar[Enum] = CapabilityEnum.PCI_HP + desc: str = "PCI Hot Plug" + + +class CapPciBridge(PcieCapStructure): + """Capability for PCI 
Bridge Subsystem ID""" + + cap_id: ClassVar[Enum] = CapabilityEnum.PCI_BRIDGE + desc: str = "7.9.24 Subsystem ID and Subsystem Vendor ID Capability" + + + class CapEnhAgp(PcieCapStructure): + """Enhanced Accelerated Graphics Port (AGP) interface supporting 8x data rate.""" + + cap_id: ClassVar[Enum] = CapabilityEnum.AGP + desc: str = "AGP 8x" + + + class CapSecure(PcieCapStructure): + """Secure Device Capability""" + + cap_id: ClassVar[Enum] = CapabilityEnum.SECURE_DEV + desc: str = "Secure Device" + + + class PcieCapListReg(PcieRegister): + offset: int = 0x00 + width: int = 16 + cap_id_desc: PcieBitField = PcieBitField(bit_mask=0x00FF, desc="Capability ID") + nxt_cap_ptr: PcieBitField = PcieBitField(bit_mask=0xFF00, desc="Next Capability Pointer") + + + class DevCtrlRegister(PcieRegister): + offset: int = 0x08 + width: int = 16 + desc: str = "7.5.3.4 Device Control Register (Offset 08h)" + corr_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Correctable Error Enable" + ) + non_fatal_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Non-fatal Error Reporting Enable" + ) + fatal_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 2), desc="Fatal Error Reporting Enable" + ) + ur_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 3), desc="Unsupported Request Reporting Enable" + ) + en_relaxed_order: PcieBitField = PcieBitField(bit_mask=(1 << 4), desc="Enable Relaxed Ordering") + mps: PcieBitField = PcieBitField(bit_mask=(0x7 << 5), desc="Max_Payload_Size") + ext_tag_field_en: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="Extended Tag Field Enable" + ) + phantom_func_en: PcieBitField = PcieBitField(bit_mask=(1 << 9), desc="Phantom Functions Enable") + aux_pwr_pm_en: PcieBitField = PcieBitField(bit_mask=(1 << 10), desc="Aux Power PM Enable") + en_no_snoop: PcieBitField = PcieBitField(bit_mask=(1 << 11), desc="Enable No Snoop") + max_rd_req_size: PcieBitField = PcieBitField(bit_mask=(0x7 << 12), 
desc="Max_Read_Request_Size") + bridge_cfg_retry_en_init_func_lvl_rst: PcieBitField = PcieBitField( + bit_mask=(1 << 15), + desc="Bridge Configuration Retry Enable / Initiate Function Level Reset", + ) + + +class DevStatRegister(PcieRegister): + offset: int = 0x0A + width: int = 16 + desc: str = "Device Status Register" + corr_err_det: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="Correctable Error Detected") + non_fatal_err_det: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Non-Fatal Error Detected" + ) + fatal_err_det: PcieBitField = PcieBitField(bit_mask=(1 << 2), desc="Fatal Error Detected") + ur_det: PcieBitField = PcieBitField(bit_mask=(1 << 3), desc="Unsupported Request Detected") + aux_pwr_det: PcieBitField = PcieBitField(bit_mask=(1 << 4), desc="AUX Power Detected") + trans_pending: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="Transactions Pending") + emer_pwr_reduction_det: PcieBitField = PcieBitField( + bit_mask=(1 << 6), desc="Emergency Power Reduction Detected" + ) + + +class LinkCapRegister(PcieRegister): + offset: int = 0x0C + width: int = 32 + desc: str = "7.5.3.6 Link Capabilities Register (Offset 0Ch)" + max_lnk_speed: PcieBitField = PcieBitField(bit_mask=(0xF << 0), desc="Max Link Speed") + max_lnk_width: PcieBitField = PcieBitField(bit_mask=(0x3F << 4), desc="Maximum Link Width") + aspm_support: PcieBitField = PcieBitField(bit_mask=(0x3 << 10), desc="ASPM Support") + l0s_exit_lat: PcieBitField = PcieBitField(bit_mask=(0x7 << 12), desc="L0s Exit Latency") + l1_exit_lat: PcieBitField = PcieBitField(bit_mask=(0x7 << 15), desc="L1 Exit Latency") + clk_pwr_mgmt: PcieBitField = PcieBitField(bit_mask=(1 << 18), desc="Clock Power Management") + surprise_dn_err_report_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 19), desc="Surprise Down Error Reporting Capable" + ) + dll_lnk_active_report_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 20), desc="Data Link Layer Link Active Reporting Capable" + ) + lnk_bw_notif_cap: 
PcieBitField = PcieBitField( + bit_mask=(1 << 21), desc="Link Bandwidth Notification Capability" + ) + aspm_optionality_comp: PcieBitField = PcieBitField( + bit_mask=(1 << 22), desc="ASPM Optionality Compliance" + ) + port_num: PcieBitField = PcieBitField(bit_mask=(0xFF << 24), desc="Port Number") + + +class LinkStatRegister(PcieRegister): + """Link stat for Type 1""" + + offset: int = 0x12 + width: int = 16 + desc: str = "Link Status Register" + curr_lnk_speed: PcieBitField = PcieBitField(bit_mask=(0b1111 << 0), desc="Current Link Speed") + neg_lnk_width: PcieBitField = PcieBitField( + bit_mask=(0b111111 << 4), desc="Negotiated Link Width" + ) + lnk_training: PcieBitField = PcieBitField(bit_mask=(1 << 11), desc="Link Training") + slot_clk_cfg: PcieBitField = PcieBitField(bit_mask=(1 << 12), desc="Slot Clock Configuration") + dll_lnk_active: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Data Link Layer Link Active" + ) + lnk_bw_mgmt_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Link Bandwidth Management Status" + ) + lnk_auto_bw_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 15), desc="Link Autonomous Bandwidth Status" + ) + + +class DevCtrl2Register(PcieRegister): + offset: int = 0x28 + width: int = 16 + desc: str = "7.5.3.16 Device Control 2 Register (Offset 28h)" + completion_timeout_val: PcieBitField = PcieBitField( + bit_mask=(0xF << 0), desc="Completion Timeout Value" + ) + completion_timeout_dis: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Completion Timeout Disable" + ) + ari_forward_en: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="ARI Forwarding Enable") + atomic_op_req_en: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="AtomicOp Request Enable") + atomic_op_egress_blk: PcieBitField = PcieBitField( + bit_mask=(1 << 7), desc="AtomicOp Egress Blocking" + ) + ido_req_en: PcieBitField = PcieBitField(bit_mask=(1 << 8), desc="IDO Request Enable") + ido_completion_en: PcieBitField = 
PcieBitField(bit_mask=(1 << 9), desc="IDO Completion Enable") + ltr_mechanism_en: PcieBitField = PcieBitField(bit_mask=(1 << 10), desc="LTR Mechanism Enable") + emergency_pwr_reduction_en: PcieBitField = PcieBitField( + bit_mask=(1 << 11), desc="Emergency Power Reduction Enable" + ) + ten_bit_tag_req_en: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="10-bit Tag Request Enable" + ) + obff_en: PcieBitField = PcieBitField(bit_mask=(0x3 << 13), desc="OBFF Enable") + end_end_tlp_prefix_blk: PcieBitField = PcieBitField( + bit_mask=(1 << 15), desc="End-End TLP Prefix Blocking" + ) + + +class LinkCap2Register(PcieRegister): + """Link cap 2 for Type 1""" + + offset: int = 0x2C + width: int = 32 + desc: str = "7.5.3.18 Link Capabilities 2 Register (Offset 2Ch)" + supported_lnk_speed_vec: PcieBitField = PcieBitField( + bit_mask=(0b111111 << 1), desc="Supported Link Speeds Vector" + ) + xlnk_supported: PcieBitField = PcieBitField(bit_mask=(1 << 8), desc="Crosslink Supported") + lower_skp_os_gen_supported_speeds_vec: PcieBitField = PcieBitField( + bit_mask=(0b111111 << 9), desc="Lower SKP OS Generation Supported Speeds Vector" + ) + lower_skip_os_rec_supported_speeds_vec: PcieBitField = PcieBitField( + bit_mask=(0b111111 << 16), desc="Lower SKP OS Reception Supported Speeds Vector" + ) + retimer_prsnc_det_supported: PcieBitField = PcieBitField( + bit_mask=(1 << 23), desc="Retimer Presence Detect Supported" + ) + two_retimers_prsnc_det_supported: PcieBitField = PcieBitField( + bit_mask=(1 << 24), desc="Two Retimers Presence Detect Supported" + ) + drs_supported: PcieBitField = PcieBitField(bit_mask=(1 << 31), desc="DRS Supported") + + +class PcieExp(PcieCapStructure): + """PCIE Express Capability Structure 7.5.3 PCI Express Capability Structure + + This structure allows identification of a PCI Express device Function + and indicates support for new PCI Express features. 
+ """ + + cap_id: ClassVar[Enum] = CapabilityEnum.PCIE_EXP + desc: str = "7.5.3 PCI Express Capability Structure" + cap_list: PcieCapListReg = PcieCapListReg() + pcie_cap_reg: PcieRegister = PcieRegister( + offset=2, + width=16, + desc="7.5.3.2 PCI Express Capabilities Register (Offset 02h)", + ) + dev_cap_reg: PcieRegister = PcieRegister( + offset=0x4, width=32, desc="7.5.3.3 Device Capabilities Register (Offset 04h)" + ) + dev_ctrl_reg: DevCtrlRegister = DevCtrlRegister() + dev_stat_reg: DevStatRegister = DevStatRegister() + lnk_cap_reg: LinkCapRegister = LinkCapRegister() + lnk_ctrl_reg: PcieRegister = PcieRegister( + offset=0x10, width=16, desc="7.5.3.7 Link Control Register (Offset 10h)" + ) + lnk_stat_reg: LinkStatRegister = LinkStatRegister() + dev_ctrl_2_reg: DevCtrl2Register = DevCtrl2Register() + lnk_cap_2_reg: LinkCap2Register = LinkCap2Register() + + +class CapMSIX(PcieCapStructure): + """Capability Structure for MSI-X""" + + cap_id: ClassVar[Enum] = CapabilityEnum.MSIX + offset: int = 0x00 + desc: str = "7.7.2 MSI-X Capability and Table Structure" + + +class CapSATA(PcieCapStructure): + """Cap for Serial ATA Data/Index Configuration""" + + cap_id: ClassVar[Enum] = CapabilityEnum.SATA + offset: int = 0x00 + desc: str = "Serial ATA Data/Index Configuration" + + +class CapAF(PcieCapStructure): + """Capability for Advanced Features""" + + cap_id: ClassVar[Enum] = CapabilityEnum.AF + offset: int = 0x00 + desc: str = "7.9.22 Conventional PCI Advanced Features Capability (AF)" + + +class CapEA(PcieCapStructure): + """Capability for Enhanced Allocation""" + + cap_id: ClassVar[Enum] = CapabilityEnum.EA + offset: int = 0x00 + desc: str = "7.8.5 Enhanced Allocation Capability Structure (EA)" + + +class AerEcapHdr(PcieRegister): + """Capability for Advanced Error Reporting""" + + offset: int = 0x00 + width: int = 32 + desc: str = "7.8.4.1 Advanced Error Reporting Extended Capability Header (Offset 00h)" + pcie_eacp_id: PcieBitField = PcieBitField( + 
bit_mask=0x0000FFFF, desc="PCI Express Extended Capability ID" + ) + cap_ver: PcieBitField = PcieBitField(bit_mask=0x000F0000, desc="Capability Version") + nxt_cap_offset: PcieBitField = PcieBitField(bit_mask=0xFFF00000, desc="Next Capability Offset") + + +class UncorrErrStatReg(PcieRegister): + """AER register for Uncorrectable Error Status Register""" + + offset: int = 0x04 + width: int = 32 + desc: str = "Uncorrectable Error Status Register" + dlnk_protocol_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Data Link Protocol Error Status" + ) + surprise_dn_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Surprise Down Error Status" + ) + poisoned_tlp_rcvd: PcieBitField = PcieBitField(bit_mask=(1 << 12), desc="Poisoned TLP Received") + fc_proto_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Flow Control Protocol Error Status" + ) + cpl_timeout_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Completion Timeout Status" + ) + ca_stat: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Completer Abort Status") + unexp_cpl_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 16), desc="Unexpected Completion Status" + ) + rx_overflow_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 17), desc="Receiver Overflow Status" + ) + malformed_tlp_stat: PcieBitField = PcieBitField(bit_mask=(1 << 18), desc="Malformed TLP Status") + ecrc_err_stat: PcieBitField = PcieBitField(bit_mask=(1 << 19), desc="ECRC Error Status") + ur_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 20), desc="Unsupported Request Error Status" + ) + acs_violation_stat: PcieBitField = PcieBitField(bit_mask=(1 << 21), desc="ACS Violation Status") + uncorr_int_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 22), desc="Uncorrectable Internal Error Status" + ) + mc_blocked_tlp_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 23), desc="MC Blocked TLP Status" + ) + atomicop_egress_blk_stat: PcieBitField = PcieBitField( + 
bit_mask=(1 << 24), desc="AtomicOp Egress Blocked Status" + ) + tlp_prefix_blk_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 25), desc="TLP Prefix Blocked Error Status" + ) + poisoned_tlp_egress_blk_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 26), desc="Poisoned TLP Egress Blocked Status" + ) + + +class UncorrErrMaskReg(PcieRegister): + """AER register for Uncorrectable Error Mask Register""" + + offset: int = 0x08 + width: int = 32 + desc: str = "7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)" + dlnk_protocol_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Data Link Protocol Error Mask" + ) + surprise_dn_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Surprise Down Error Mask" + ) + poisoned_tlp_rcvd_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Poisoned TLP Received Mask" + ) + fc_proto_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Flow Control Protocol Error Mask" + ) + cpl_timeout_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Completion Timeout Mask" + ) + ca_mask: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Completer Abort Mask") + unexp_cpl_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 16), desc="Unexpected Completion Mask" + ) + rx_overflow_mask: PcieBitField = PcieBitField(bit_mask=(1 << 17), desc="Receiver Overflow Mask") + malformed_tlp_mask: PcieBitField = PcieBitField(bit_mask=(1 << 18), desc="Malformed TLP Mask") + ecrc_err_mask: PcieBitField = PcieBitField(bit_mask=(1 << 19), desc="ECRC Error Mask") + ur_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 20), desc="Unsupported Request Error Mask" + ) + acs_violation_mask: PcieBitField = PcieBitField(bit_mask=(1 << 21), desc="ACS Violation Mask") + uncorr_int_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 22), desc="Uncorrectable Internal Error Mask" + ) + mc_blocked_tlp_mask: PcieBitField = PcieBitField(bit_mask=(1 << 23), desc="MC Blocked TLP Mask") + 
atomicop_egress_blk_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 24), desc="AtomicOp Egress Blocked Mask" + ) + tlp_prefix_blk_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 25), desc="TLP Prefix Blocked Error Mask" + ) + poisoned_tlp_egress_blk_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 26), desc="Poisoned TLP Egress Blocked Mask" + ) + + +class UncorrErrSevReg(PcieRegister): + """AER register for Uncorrectable Error Severity Register""" + + offset: int = 0x0C + width: int = 32 + desc: str = "7.8.4.4 Uncorrectable Error Severity Register (Offset 0Ch)" + dlnk_protocol_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Data Link Protocol Error Severity" + ) + surprise_dn_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Surprise Down Error Severity" + ) + poisoned_tlp_rcvd_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Poisoned TLP Received Severity" + ) + fc_proto_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Flow Control Protocol Error Severity" + ) + cpl_timeout_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Completion Timeout Error Severity" + ) + ca_sev: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Completer Abort Error Severity") + unexp_cpl_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 16), desc="Unexpected Completion Error Severity" + ) + rx_overflow_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 17), desc="Receiver Overflow Severity" + ) + malformed_tlp_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 18), desc="Malformed TLP Severity" + ) + ecrc_err_sev: PcieBitField = PcieBitField(bit_mask=(1 << 19), desc="ECRC Error Severity") + ur_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 20), desc="Unsupported Request Error Severity" + ) + acs_violation_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 21), desc="ACS Violation Severity" + ) + uncorr_int_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 22), desc="Uncorrectable 
Internal Error Severity" + ) + mc_blocked_tlp_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 23), desc="MC Blocked TLP Severity" + ) + atomicop_egress_blk_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 24), desc="AtomicOp Egress Blocked Severity" + ) + tlp_prefix_blk_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 25), desc="TLP Prefix Blocked Error Severity" + ) + poisoned_tlp_egress_blk_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 26), desc="Poisoned TLP Egress Blocked Severity" + ) + + +class CorrErrStatReg(PcieRegister): + """AER register for Correctable Error Status Register""" + + offset: int = 0x10 + width: int = 32 + desc: str = "Correctable Error Status Register" + rx_err_stat: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="Receiver Error Status") + bad_tlp_stat: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="Bad TLP Status") + bad_dllp_stat: PcieBitField = PcieBitField(bit_mask=(1 << 7), desc="Bad DLLP Status") + replay_num_rollover_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="REPLAY_NUM Rollover Status" + ) + replay_timer_timeout_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Replay Timer Timeout Status" + ) + advisory_non_fatal_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Advisory Non-Fatal Error Status" + ) + corrected_int_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Corrected Internal Error Status" + ) + hdr_log_overflow_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 15), desc="Header Log Overflow Status" + ) + + +class CorrErrMaskReg(PcieRegister): + """AER register for Correctable Error Mask Register""" + + offset: int = 0x14 + width: int = 32 + desc: str = "7.8.4.6 Correctable Error Mask Register (Offset 14h)" + rx_err_mask: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="Receiver Error Mask") + bad_tlp_mask: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="Bad TLP Mask") + bad_dllp_mask: PcieBitField = 
PcieBitField(bit_mask=(1 << 7), desc="Bad DLLP Mask") + replay_num_rollover_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="REPLAY_NUM Rollover Mask" + ) + replay_timer_timeout_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Replay Timer Timeout Mask" + ) + advisory_non_fatal_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Advisory Non-Fatal Error Mask" + ) + corrected_int_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Corrected Internal Error Mask" + ) + hdr_log_overflow_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 15), desc="Header Log Overflow Mask" + ) + + +class AerCapCtrlReg(PcieRegister): + """AER register for Advanced Error Capabilities and Control Register""" + + offset: int = 0x18 + width: int = 32 + desc: str = "7.8.4.7 Advanced Error Capabilities and Control Register (Offset 18h)" + fst_err_ptr: PcieBitField = PcieBitField(bit_mask=(0x1F), desc="First Error Pointer") + ecrc_gen_cap: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="ECRC Generation Capable") + ecrc_gen_en: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="ECRC Generation Enable") + ecrc_chk_cap: PcieBitField = PcieBitField(bit_mask=(1 << 7), desc="ECRC Check Capable") + ecrc_chk_en: PcieBitField = PcieBitField(bit_mask=(1 << 8), desc="ECRC Check Enable") + multi_hdr_rec_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 9), desc="Multiple Header Recording Capable" + ) + multi_hdr_rec_en: PcieBitField = PcieBitField( + bit_mask=(1 << 10), desc="Multiple Header Recording Enable" + ) + tlp_prefix_log_prsnt: PcieBitField = PcieBitField( + bit_mask=(1 << 11), desc="TLP Prefix Log Present" + ) + cpl_timeout_prefix_hdr_log_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Completion Timeout Prefix/Header Log Capable" + ) + + +class RootErrCmdReg(PcieRegister): + """AER register for Root Error Command Register""" + + offset: int = 0x2C + width: int = 32 + desc: str = "7.8.4.9 Root Error Command Register 
(Offset 2Ch)" + corr_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Correctable Error Reporting Enable" + ) + non_fatal_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Non-Fatal Error Reporting Enable" + ) + fatal_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 2), desc="Fatal Error Reporting Enable" + ) + + +class RootErrStatReg(PcieRegister): + """AER register for Root Error Status Register""" + + offset: int = 0x30 + width: int = 32 + desc: str = "Root Error Status Register" + err_cor_rcvd: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="ERR_COR Received") + multi_err_cor_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Multiple ERR_COR Received" + ) + err_fatal_nonfatal_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 2), desc="ERR_FATAL/NONFATAL Received" + ) + multi_err_fatal_nonfatal_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 3), desc="Multiple ERR_FATAL/NONFATAL Received" + ) + fst_uncorr_fatal: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="First Uncorrectable Fatal" + ) + non_fatal_err_msg_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Non-Fatal Error Messages Received" + ) + fatal_err_msg_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 6), desc="Fatal Error Messages Received" + ) + err_cor_subclass: PcieBitField = PcieBitField(bit_mask=(0x3 << 7), desc="ERR_COR Subclass") + adv_err_int_msg_num: PcieBitField = PcieBitField( + bit_mask=(0x1F << 27), desc="Advanced Error Interrupt Message Number" + ) + + +class ErrSrcIdReg(PcieRegister): + """AER register for Error Source Identification Register""" + + offset: int = 0x34 + width: int = 32 + desc: str = "7.8.4.11 Error Source Identification Register (Offset 34h)" + err_cor_src_id: PcieBitField = PcieBitField( + bit_mask=0x0000FFFF, desc="ERR_COR Source Identification" + ) + err_fatal_nonfatal_src_id: PcieBitField = PcieBitField( + bit_mask=0xFFFF0000, desc="ERR_FATAL/NONFATAL Source 
Identification" + ) + + +class ECapAer(PcieCapStructure): + """Extended Capability for Advanced Error Reporting""" + + extended: Optional[bool] = True + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.AER + offset: int = 0x00 + desc: str = "7.8.4 Advanced Error Reporting Extended Capability" + aer_ecap: AerEcapHdr = AerEcapHdr() + uncorr_err_stat: UncorrErrStatReg = UncorrErrStatReg() + uncorr_err_mask: UncorrErrMaskReg = UncorrErrMaskReg() + uncorr_err_sev: UncorrErrSevReg = UncorrErrSevReg() + corr_err_stat: CorrErrStatReg = CorrErrStatReg() + corr_err_mask: CorrErrMaskReg = CorrErrMaskReg() + aer_cap_ctrl: AerCapCtrlReg = AerCapCtrlReg() + root_err_cmd: RootErrCmdReg = RootErrCmdReg() + root_err_stat: RootErrStatReg = RootErrStatReg() + err_src_id: ErrSrcIdReg = ErrSrcIdReg() + + +class ECapVc(PcieCapStructure): + """Extended Capability for Virtual Channel""" + + extended: Optional[bool] = True + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VCEC + offset: int = 0x00 + desc: str = "7.9.1 Virtual Channel Extended Capability" + + +class ECapDsn(PcieCapStructure): + """Extended Capability for Device Serial Number""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DSN + offset: int = 0x00 + desc: str = "7.9.3 Device Serial Number Extended Capability" + + +class ECapPb(PcieCapStructure): + """Extended Capability for Power Budgeting""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PWR_BUDGET + offset: int = 0x00 + desc: str = "7.8.1 Power Budgeting Extended Capability" + + +class ECapRclink(PcieCapStructure): + """Extended Capability for Root Complex Link Declaration""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LNK_DCLR + offset: int = 0x00 + desc: str = "7.9.8.1 Root Complex Link Declaration Extended Capability Header (Offset 00h)" + + +class ECapRcilink(PcieCapStructure): + """Extended Capability for Root Complex Internal Link Control""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LNK_CEC + offset: int = 0x00 + desc: str = "7.9.9 Root 
Complex Internal Link Control Extended Capability" + + +class ECapRcecoll(PcieCapStructure): + """Extended Capability for Root Complex Event Collector Endpoint Association""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RCECOLL + offset: int = 0x00 + desc: str = ( + "7.9.10 Root Complex Event Collector Endpoint Association Extended Capability (Dell)" + ) + + +class ECapMfvc(PcieCapStructure): + """Extended Capability for Multi-Function Virtual Channel""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MFVC + offset: int = 0x00 + desc: str = "7.9.2 Multi-Function Virtual Channel Extended Capability" + + +class ECapVc2(PcieCapStructure): + """Extended Capability for Virtual Channel 2""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VC2 + offset: int = 0x00 + desc: str = "7.9.1 Virtual Channel Extended Capability" + + +class ECapRcrb(PcieCapStructure): + """Extended Capability for RCRB Header""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RCRB + offset: int = 0x00 + desc: str = "7.9.7 RCRB Header Extended Capability" + + +class ECapVndr(PcieCapStructure): + """Extended Capability for Vendor-Specific""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VNDR + offset: int = 0x00 + desc: str = "7.9.5 Vendor-Specific Extended Capability" + + +class ECapCac(PcieCapStructure): + """Extended Capability for Configuration Access Correlation""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.CAC + offset: int = 0x00 + desc: str = "7.7. 
# NOTE(review): the next line is the tail of a truncated `desc` string that
# belongs to a capability class whose definition starts above this chunk;
# preserved verbatim rather than guessed at.
Configuration Access Correlation Extended Capability"


# ---------------------------------------------------------------------------
# Minimal PCIe extended-capability structures. Each declares only:
#   - cap_id: the Extended Capability ID used to locate the structure in
#     extended config space (ClassVar, shared by all instances)
#   - offset: base offset of the structure (0x00 here)
#   - desc:   human-readable description taken from PCIe Base Spec headings
# Structures that need register decoding add register fields (see ECapSecpci).
# ---------------------------------------------------------------------------


class ECapAcs(PcieCapStructure):
    """Extended Capability for ACS"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ACS
    offset: int = 0x00
    desc: str = "7.7.8 ACS Extended Capability"


class ECapAri(PcieCapStructure):
    """Extended Capability for ARI"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ARI
    offset: int = 0x00
    desc: str = "7.8.7 ARI Extended Capability"


class ECapAts(PcieCapStructure):
    """Extended Capability for ATS"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ATS
    offset: int = 0x00
    desc: str = "10.5.1 ATS Extended Capability"


class ECapSriov(PcieCapStructure):
    """Extended Capability for SR-IOV"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.SRIOV
    offset: int = 0x00
    desc: str = "9.3.3 SR-IOV Extended Capability"


class ECapMriov(PcieCapStructure):
    """Extended Capability for MR-IOV"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MRIOV
    offset: int = 0x00
    desc: str = "MR-IOV Extended Capability (MR-IOV)"


class ECapMcast(PcieCapStructure):
    """Extended Capability for Multicast"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MULTCAST
    offset: int = 0x00
    desc: str = "7.9.11 Multicast Extended Capability"


class ECapPri(PcieCapStructure):
    """Extended Capability for Page Request Interface"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PAGE_REQ
    offset: int = 0x00
    desc: str = "10.5.2 Page Request Extended Capability Structure"


class ECapAMD(PcieCapStructure):
    """Extended Capability for AMD"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.AMD
    offset: int = 0x00
    desc: str = "Reserved for AMD"


class ECapReba(PcieCapStructure):
    """Extended Capability for Resizable BAR"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RBAR
    offset: int = 0x00
    desc: str = "7.8.6 Resizable BAR Extended Capability"


class ECapDpa(PcieCapStructure):
    """Extended Capability for Dynamic Power Allocation"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DPA
    offset: int = 0x00
    desc: str = "7.9.12 Dynamic Power Allocation Extended Capability (DPA Capability)"


class ECapTph(PcieCapStructure):
    """Extended Capability for TPH"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.TPH
    offset: int = 0x00
    desc: str = "7.9.13.1 TPH Requester Extended Capability Header (Offset 00h)"


class ECapLtr(PcieCapStructure):
    """Extended Capability for LTR"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LTR
    offset: int = 0x00
    desc: str = "7.8.2 Latency Tolerance Reporting (LTR) Extended Capability"


class LaneErrorStatReg(PcieRegister):
    """Lane error status register"""

    desc: str = "Lane Error Status Register"
    # Register lives at offset 0x08 within the Secondary PCIe capability.
    offset: int = 0x08
    width: int = 32
    # One bit per lane; the full 32-bit mask exposes all lane error bits.
    lane0_err_stat: PcieBitField = PcieBitField(
        bit_mask=0xFFFFFFFF,
        desc="Lane Error Status Bits - Each bit indicates if the corresponding Lane detected a Lane-based error.",
    )


class ECapSecpci(PcieCapStructure):
    """Extended Capability for Secondary PCI Express"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.SPCI
    offset: int = 0x00
    desc: str = "7.7.3 Secondary PCI Express Extended Capability"
    lane_err_stat: LaneErrorStatReg = LaneErrorStatReg()


class ECapPmux(PcieCapStructure):
    """Extended Capability for PMUX"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PMUX
    offset: int = 0x00
    desc: str = "G.5 PMUX Extended Capability"


class ECapPasid(PcieCapStructure):
    """Extended Capability for PASID"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PASID
    offset: int = 0x00
    desc: str = "7.8.8 PASID Extended Capability Structure"


class ECapLnr(PcieCapStructure):
    """Extended Capability for LN Requester"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LN
    offset: int = 0x00
    desc: str = "7.9.14 LN Requester Extended Capability (LNR Capability)"


class ECapDpc(PcieCapStructure):
    """Extended Capability for DPC"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DPC
    offset: int = 0x00
    desc: str = "7.9.15 DPC Extended Capability"


class ECapL1pm(PcieCapStructure):
    """Extended Capability for L1 PM Substates"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.L1PM
    offset: int = 0x00
    desc: str = "7.8.3 L1 PM Substates Extended Capability"


class ECapPtm(PcieCapStructure):
    """Extended Capability for PTM"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PTM
    offset: int = 0x00
    desc: str = "7.9.16 Precision Time Management Extended Capability (PTM Capability)"


class ECapMpcie(PcieCapStructure):
    """Extended Capability for M-PCIe"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MPCIE
    offset: int = 0x00
    desc: str = "PCI Express over M-PHY Extended Capability (M-PCIe)"


class ECapFrs(PcieCapStructure):
    """Extended Capability for FRS Queueing"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.FRS
    offset: int = 0x00
    desc: str = "7.8.9 FRS Queueing Extended Capability"


class ECapRtr(PcieCapStructure):
    """Extended Capability for Readiness Time Reporting"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RTR
    offset: int = 0x00
    desc: str = "7.9.17 Readiness Time Reporting Extended Capability"


class ECapDvsec(PcieCapStructure):
    """Extended Capability for Designated Vendor-Specific"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DVENDR
    offset: int = 0x00
    desc: str = "7.9.6 Designated Vendor-Specific Extended Capability (DVSEC)"


class ECapVfRebar(PcieCapStructure):
    """Extended Capability for VF Resizable BAR"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VFBAR
    offset: int = 0x00
    desc: str = "9.3.7.5 VF Resizable BAR Extended Capability"


class ECapDlnk(PcieCapStructure):
    """Extended Capability for Downstream Link"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DLF
    offset: int = 0x00
    desc: str = "7.7.4 Data Link Feature Extended Capability"
class Phy16GtEcapHdr(PcieRegister):
    """Extended Capability header register for the 16.0 GT/s Physical Layer"""

    offset: int = 0x00
    width: int = 32
    desc: str = "7.7.5.1 Physical Layer 16.0 GT/s Extended Capability Header (Offset 00h)"
    pcie_ecap_id: PcieBitField = PcieBitField(
        bit_mask=0x0000FFFF, desc="PCI Express Extended Capability ID"
    )
    cap_ver: PcieBitField = PcieBitField(bit_mask=0x000F0000, desc="Capability Version")
    nxt_cap_offset: PcieBitField = PcieBitField(bit_mask=0xFFF00000, desc="Next Capability Offset")


class Phy16GtEcapStat(PcieRegister):
    """Register for 16.0 GT/s Physical Layer Status"""

    offset: int = 0x0C
    width: int = 32
    desc: str = "16.0 GT/s Status Register"
    eq_16gt_cpl: PcieBitField = PcieBitField(
        bit_mask=(1 << 0), desc="Equalization 16.0 GT/s Complete"
    )
    eq_16gt_ph1_success: PcieBitField = PcieBitField(
        bit_mask=(1 << 1), desc="Equalization 16.0 GT/s Phase 1 Successful"
    )
    eq_16gt_ph2_success: PcieBitField = PcieBitField(
        bit_mask=(1 << 2), desc="Equalization 16.0 GT/s Phase 2 Successful"
    )
    eq_16gt_ph3_success: PcieBitField = PcieBitField(
        bit_mask=(1 << 3), desc="Equalization 16.0 GT/s Phase 3 Successful"
    )
    lnk_eq_req_16gt: PcieBitField = PcieBitField(
        bit_mask=(1 << 4), desc="Link Equalization Request 16.0 GT/s"
    )


class ParityMisMatchStat16GT(PcieRegister):
    """Register for 16.0 GT/s Parity Mismatch Status"""

    # NOTE(review): `pos` mirrors the hex digits of `offset` (10 -> 0x10) in all
    # three mismatch-status registers; confirm its intended semantics upstream.
    pos: int = 10
    width: int = 32
    offset: int = 0x10
    desc: str = "16.0 GT/s Local Data Parity Mismatch Status Register"


class RetimerFstPartiyRetimerMismatchStat16gt(PcieRegister):
    """Register for 16.0 GT/s First Retimer Data Parity Mismatch Status"""

    pos: int = 14
    width: int = 32
    offset: int = 0x14
    desc: str = "16.0 GT/s First Retimer Data Parity Mismatch Status Register"


class RetimerSecPartiyRetimerMismatchStat16gt(PcieRegister):
    """Register for 16.0 GT/s Second Retimer Data Parity Mismatch Status"""

    pos: int = 18
    width: int = 32
    offset: int = 0x18
    desc: str = "16.0 GT/s Second Retimer Data Parity Mismatch Status Register"


class EqCtl16Gt0(PcieRegister):
    """Register for 16.0 GT/s Equalization Control 0

    One 8-bit instance per lane; the instance's `offset` selects the lane
    (0x20 + lane number), see ECap16Gt below.
    """

    offset: int
    width: int = 8
    desc: str = "7.7.5.9 16.0 GT/s Lane Equalization Control Register (Offsets 20h to 3Ch)"
    upstream_eq_ctl_16gt_0: PcieBitField = PcieBitField(
        bit_mask=0x000000FF, desc="Upstream Equalization Control 16.0 GT/s 0"
    )
    downstream_eq_ctl_16gt_0: PcieBitField = PcieBitField(
        bit_mask=0x0000FF00, desc="Downstream Equalization Control 16.0 GT/s 0"
    )


class ECap16Gt(PcieCapStructure):
    """Extended Capability for 16.0 GT/s Physical Layer"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PL_16GT
    offset: int = 0x00
    desc: str = "7.7.5 Physical Layer 16.0 GT/s Extended Capability"
    header: Phy16GtEcapHdr = Phy16GtEcapHdr()
    status: Phy16GtEcapStat = Phy16GtEcapStat()
    parity_mismatch_stat: ParityMisMatchStat16GT = ParityMisMatchStat16GT()
    retimer_fst_parity_mismatch_stat: RetimerFstPartiyRetimerMismatchStat16gt = (
        RetimerFstPartiyRetimerMismatchStat16gt()
    )
    retimer_sec_parity_mismatch_stat: RetimerSecPartiyRetimerMismatchStat16gt = (
        RetimerSecPartiyRetimerMismatchStat16gt()
    )
    # Per-lane equalization control: lane N lives at byte offset 0x20 + N.
    eq_ctl_16gt_0: EqCtl16Gt0 = EqCtl16Gt0(offset=0x20, desc="16GT/s Equalization Control 0")
    eq_ctl_16gt_1: EqCtl16Gt0 = EqCtl16Gt0(offset=0x21, desc="16GT/s Equalization Control 1")
    eq_ctl_16gt_2: EqCtl16Gt0 = EqCtl16Gt0(offset=0x22, desc="16GT/s Equalization Control 2")
    eq_ctl_16gt_3: EqCtl16Gt0 = EqCtl16Gt0(offset=0x23, desc="16GT/s Equalization Control 3")
    eq_ctl_16gt_4: EqCtl16Gt0 = EqCtl16Gt0(offset=0x24, desc="16GT/s Equalization Control 4")
    eq_ctl_16gt_5: EqCtl16Gt0 = EqCtl16Gt0(offset=0x25, desc="16GT/s Equalization Control 5")
    eq_ctl_16gt_6: EqCtl16Gt0 = EqCtl16Gt0(offset=0x26, desc="16GT/s Equalization Control 6")
    eq_ctl_16gt_7: EqCtl16Gt0 = EqCtl16Gt0(offset=0x27, desc="16GT/s Equalization Control 7")
    eq_ctl_16gt_8: EqCtl16Gt0 = EqCtl16Gt0(offset=0x28, desc="16GT/s Equalization Control 8")
    eq_ctl_16gt_9: EqCtl16Gt0 = EqCtl16Gt0(offset=0x29, desc="16GT/s Equalization Control 9")
    eq_ctl_16gt_10: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2A, desc="16GT/s Equalization Control 10")
    eq_ctl_16gt_11: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2B, desc="16GT/s Equalization Control 11")
    eq_ctl_16gt_12: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2C, desc="16GT/s Equalization Control 12")
    eq_ctl_16gt_13: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2D, desc="16GT/s Equalization Control 13")
    eq_ctl_16gt_14: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2E, desc="16GT/s Equalization Control 14")
    eq_ctl_16gt_15: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2F, desc="16GT/s Equalization Control 15")
    eq_ctl_16gt_16: EqCtl16Gt0 = EqCtl16Gt0(offset=0x30, desc="16GT/s Equalization Control 16")
    eq_ctl_16gt_17: EqCtl16Gt0 = EqCtl16Gt0(offset=0x31, desc="16GT/s Equalization Control 17")
    eq_ctl_16gt_18: EqCtl16Gt0 = EqCtl16Gt0(offset=0x32, desc="16GT/s Equalization Control 18")
    eq_ctl_16gt_19: EqCtl16Gt0 = EqCtl16Gt0(offset=0x33, desc="16GT/s Equalization Control 19")
    eq_ctl_16gt_20: EqCtl16Gt0 = EqCtl16Gt0(offset=0x34, desc="16GT/s Equalization Control 20")
    eq_ctl_16gt_21: EqCtl16Gt0 = EqCtl16Gt0(offset=0x35, desc="16GT/s Equalization Control 21")
    eq_ctl_16gt_22: EqCtl16Gt0 = EqCtl16Gt0(offset=0x36, desc="16GT/s Equalization Control 22")
    eq_ctl_16gt_23: EqCtl16Gt0 = EqCtl16Gt0(offset=0x37, desc="16GT/s Equalization Control 23")
    eq_ctl_16gt_24: EqCtl16Gt0 = EqCtl16Gt0(offset=0x38, desc="16GT/s Equalization Control 24")
    eq_ctl_16gt_25: EqCtl16Gt0 = EqCtl16Gt0(offset=0x39, desc="16GT/s Equalization Control 25")
    eq_ctl_16gt_26: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3A, desc="16GT/s Equalization Control 26")
    eq_ctl_16gt_27: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3B, desc="16GT/s Equalization Control 27")
    eq_ctl_16gt_28: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3C, desc="16GT/s Equalization Control 28")
    eq_ctl_16gt_29: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3D, desc="16GT/s Equalization Control 29")
    eq_ctl_16gt_30: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3E, desc="16GT/s Equalization Control 30")
    eq_ctl_16gt_31: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3F, desc="16GT/s Equalization Control 31")


class ECapLmr(PcieCapStructure):
    """Extended Capability for Lane Margining at the Receiver"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LM
    offset: int = 0x00
    desc: str = "7.7.7 Lane Margining at the Receiver Extended Capability"


class ECapHierId(PcieCapStructure):
    """Extended Capability for Hierarchy ID"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.HID
    offset: int = 0x00
    desc: str = "7.9.18 Hierarchy ID Extended Capability"


class ECapNpem(PcieCapStructure):
    """Extended Capability for Native PCIe Enclosure Management"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.NPEM
    offset: int = 0x00
    desc: str = (
        "7.9.20 Native PCIe Enclosure Management Extended Capability (NPEM Extended Capability)"
    )


class Phy32GtEcapHdr(PcieRegister):
    """Extended Capability header register for the 32.0 GT/s Physical Layer"""

    offset: int = 0x00
    width: int = 32
    desc: str = "7.7.6.1 Physical Layer 32.0 GT/s Extended Capability Header (Offset 00h)"
    pcie_ecap_id: PcieBitField = PcieBitField(
        bit_mask=0x0000FFFF, desc="PCI Express Extended Capability ID"
    )
    cap_ver: PcieBitField = PcieBitField(bit_mask=0x000F0000, desc="Capability Version")
    nxt_cap_offset: PcieBitField = PcieBitField(bit_mask=0xFFF00000, desc="Next Capability Offset")


class Phy32GtEcapCapReg(PcieRegister):
    """Register for 32.0 GT/s Capabilities"""

    offset: int = 0x04
    width: int = 32
    # Fixed: description was missing the closing parenthesis.
    desc: str = "7.7.6.2 32.0 GT/s Capabilities Register (Offset 04h)"
    eq_bypass_hi_rate: PcieBitField = PcieBitField(
        bit_mask=(1 << 0), desc="Equalization bypass to highest rate Supported"
    )
    # NOTE(review): field name has a typo ("equi"); kept for interface
    # compatibility — renaming would change serialized output keys.
    no_equi_needed: PcieBitField = PcieBitField(
        bit_mask=(1 << 1), desc="No Equalization Needed Supported - When Set"
    )
    modified_ts_usage_mode_0_supported: PcieBitField = PcieBitField(
        bit_mask=(1 << 8), desc="Modified TS Usage Mode 0 Supported"
    )
    modified_ts_usage_mode_1_supported: PcieBitField = PcieBitField(
        bit_mask=(1 << 9), desc="Modified TS Usage Mode 1 Supported"
    )
    modified_ts_usage_mode_2_supported: PcieBitField = PcieBitField(
        bit_mask=(1 << 10), desc="Modified TS Usage Mode 2 Supported"
    )
    modified_ts_reserved_usage_modes: PcieBitField = PcieBitField(
        bit_mask=(0x1F << 11), desc="Modified TS Reserved Usage Modes"
    )
class Phy32GtStatReg(PcieRegister):
    """Register for 32.0 GT/s Status"""

    offset: int = 0x0C
    width: int = 32
    desc: str = "32.0 GT/s Status Register"
    eq_32gt_cpl: PcieBitField = PcieBitField(
        bit_mask=(1 << 0), desc="Equalization 32.0 GT/s Complete"
    )
    eq_32gt_ph1_success: PcieBitField = PcieBitField(
        bit_mask=(1 << 1), desc="Equalization 32.0 GT/s Phase 1 Successful"
    )
    eq_32gt_ph2_success: PcieBitField = PcieBitField(
        bit_mask=(1 << 2), desc="Equalization 32.0 GT/s Phase 2 Successful"
    )
    eq_32gt_ph3_success: PcieBitField = PcieBitField(
        bit_mask=(1 << 3), desc="Equalization 32.0 GT/s Phase 3 Successful"
    )
    lnk_eq_req_32gt: PcieBitField = PcieBitField(
        bit_mask=(1 << 4), desc="Link Equalization Request 32.0 GT/s"
    )
    modified_ts_rcvd: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="Modified TS Received")
    rcvd_enhanced_link_behav_ctrl: PcieBitField = PcieBitField(
        bit_mask=(0x3 << 6), desc="Received Enhanced Link Behavior Control"
    )
    tx_precoding_on: PcieBitField = PcieBitField(bit_mask=(1 << 8), desc="Transmitter Precoding On")
    tx_precoding_req: PcieBitField = PcieBitField(
        bit_mask=(1 << 9), desc="Transmitter Precode Request"
    )
    no_eq_needed_rcvd: PcieBitField = PcieBitField(
        bit_mask=(1 << 10), desc="No Equalization Needed Received"
    )


class TransReceived32GTData1(PcieRegister):
    """Register for 32.0 GT/s Received Modified TS Data 1"""

    offset: int = 0x10
    width: int = 32
    desc: str = "7.7.6.5 Received Modified TS Data 1 Register (Offset 10h)"
    rcvd_mod_ts_usage_mode: PcieBitField = PcieBitField(
        bit_mask=(0x7 << 0), desc="Received Modified TS Usage Mode"
    )
    # Fixed: the spec defines Received Modified TS Information 1 as bits 15:3
    # (13 bits); the previous mask 0xFFF << 3 only covered bits 14:3.
    rcvd_mod_ts_info_1: PcieBitField = PcieBitField(
        bit_mask=(0x1FFF << 3), desc="Received Modified TS Information 1"
    )
    rcvd_mod_ts_vendor_id: PcieBitField = PcieBitField(
        bit_mask=(0xFFFF << 16), desc="Received Modified TS Vendor ID"
    )


# 23:0 Received Modified TS Information 2
# 25:24 Alternate Protocol Negotiation Status
class TransReceived32GTData2(PcieRegister):
    """Register for 32.0 GT/s Received Modified TS Data 2"""

    offset: int = 0x14
    width: int = 32
    desc: str = "7.7.6.6 Received Modified TS Data 2 Register (Offset 14h)"
    # Fixed: bits 23:0 per the spec (and the comment above); the previous mask
    # 0x7FF only covered bits 10:0.
    rcvd_mod_ts_info_2: PcieBitField = PcieBitField(
        bit_mask=(0xFFFFFF << 0), desc="Received Modified TS Information 2"
    )
    alt_proto_neg_status: PcieBitField = PcieBitField(
        bit_mask=(0x3 << 24), desc="Alternate Protocol Negotiation Status"
    )


class EqCtl32Gt0(PcieRegister):
    """Equalization Control for 32.0 GT/s

    One 8-bit instance per lane; the instance's `offset` selects the lane
    (0x20 + lane number), see ECap32Gts below.
    """

    offset: int
    width: int = 8
    desc: str = "7.7.6.9 32.0 GT/s Lane Equalization Control Register (Offset 20h to 3Ch)"
    upstream_eq_ctl_32gt_0: PcieBitField = PcieBitField(
        bit_mask=0x000000FF, desc="Upstream Equalization Control 32.0 GT/s 0"
    )
    downstream_eq_ctl_32gt_0: PcieBitField = PcieBitField(
        bit_mask=0x0000FF00, desc="Downstream Equalization Control 32.0 GT/s 0"
    )


class ECap32Gts(PcieCapStructure):
    """Extended Capability for 32.0 GT/s Physical Layer"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PL_32GT
    offset: int = 0x00
    desc: str = "7.7.6 Physical Layer 32.0 GT/s Extended Capability"
    header: Phy32GtEcapHdr = Phy32GtEcapHdr()
    cap_reg: Phy32GtEcapCapReg = Phy32GtEcapCapReg()
    status: Phy32GtStatReg = Phy32GtStatReg()
    recv_data_1: TransReceived32GTData1 = TransReceived32GTData1()
    recv_data_2: TransReceived32GTData2 = TransReceived32GTData2()
    # Transmitted Modified TS Data registers share the received-data layout.
    trans_data_1: TransReceived32GTData1 = TransReceived32GTData1(offset=0x18)
    trans_data_2: TransReceived32GTData2 = TransReceived32GTData2(offset=0x1C)
    # Per-lane equalization control: lane N lives at byte offset 0x20 + N.
    eq_ctl_32gt_0: EqCtl32Gt0 = EqCtl32Gt0(offset=0x20, desc="32GT/s Equalization Control 0")
    eq_ctl_32gt_1: EqCtl32Gt0 = EqCtl32Gt0(offset=0x21, desc="32GT/s Equalization Control 1")
    eq_ctl_32gt_2: EqCtl32Gt0 = EqCtl32Gt0(offset=0x22, desc="32GT/s Equalization Control 2")
    eq_ctl_32gt_3: EqCtl32Gt0 = EqCtl32Gt0(offset=0x23, desc="32GT/s Equalization Control 3")
    eq_ctl_32gt_4: EqCtl32Gt0 = EqCtl32Gt0(offset=0x24, desc="32GT/s Equalization Control 4")
    eq_ctl_32gt_5: EqCtl32Gt0 = EqCtl32Gt0(offset=0x25, desc="32GT/s Equalization Control 5")
    eq_ctl_32gt_6: EqCtl32Gt0 = EqCtl32Gt0(offset=0x26, desc="32GT/s Equalization Control 6")
    eq_ctl_32gt_7: EqCtl32Gt0 = EqCtl32Gt0(offset=0x27, desc="32GT/s Equalization Control 7")
    eq_ctl_32gt_8: EqCtl32Gt0 = EqCtl32Gt0(offset=0x28, desc="32GT/s Equalization Control 8")
    eq_ctl_32gt_9: EqCtl32Gt0 = EqCtl32Gt0(offset=0x29, desc="32GT/s Equalization Control 9")
    eq_ctl_32gt_10: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2A, desc="32GT/s Equalization Control 10")
    eq_ctl_32gt_11: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2B, desc="32GT/s Equalization Control 11")
    eq_ctl_32gt_12: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2C, desc="32GT/s Equalization Control 12")
    eq_ctl_32gt_13: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2D, desc="32GT/s Equalization Control 13")
    eq_ctl_32gt_14: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2E, desc="32GT/s Equalization Control 14")
    eq_ctl_32gt_15: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2F, desc="32GT/s Equalization Control 15")
    # NOTE(review): this attribute is misnamed — offset 0x30 is lane 16, so it
    # should be `eq_ctl_32gt_16`. The name is kept to avoid breaking consumers
    # of the serialized model; the description is corrected. A coordinated
    # rename is recommended.
    eq_ctl_32gt_32: EqCtl32Gt0 = EqCtl32Gt0(offset=0x30, desc="32GT/s Equalization Control 16")
    eq_ctl_32gt_17: EqCtl32Gt0 = EqCtl32Gt0(offset=0x31, desc="32GT/s Equalization Control 17")
    eq_ctl_32gt_18: EqCtl32Gt0 = EqCtl32Gt0(offset=0x32, desc="32GT/s Equalization Control 18")
    eq_ctl_32gt_19: EqCtl32Gt0 = EqCtl32Gt0(offset=0x33, desc="32GT/s Equalization Control 19")
    eq_ctl_32gt_20: EqCtl32Gt0 = EqCtl32Gt0(offset=0x34, desc="32GT/s Equalization Control 20")
    eq_ctl_32gt_21: EqCtl32Gt0 = EqCtl32Gt0(offset=0x35, desc="32GT/s Equalization Control 21")
    eq_ctl_32gt_22: EqCtl32Gt0 = EqCtl32Gt0(offset=0x36, desc="32GT/s Equalization Control 22")
    eq_ctl_32gt_23: EqCtl32Gt0 = EqCtl32Gt0(offset=0x37, desc="32GT/s Equalization Control 23")
    eq_ctl_32gt_24: EqCtl32Gt0 = EqCtl32Gt0(offset=0x38, desc="32GT/s Equalization Control 24")
    eq_ctl_32gt_25: EqCtl32Gt0 = EqCtl32Gt0(offset=0x39, desc="32GT/s Equalization Control 25")
    eq_ctl_32gt_26: EqCtl32Gt0 = EqCtl32Gt0(offset=0x3A, desc="32GT/s Equalization Control 26")
    eq_ctl_32gt_27: EqCtl32Gt0 = EqCtl32Gt0(offset=0x3B, desc="32GT/s Equalization Control 27")
    eq_ctl_32gt_28: EqCtl32Gt0 = EqCtl32Gt0(offset=0x3C, desc="32GT/s Equalization Control 28")
    eq_ctl_32gt_29: EqCtl32Gt0 = EqCtl32Gt0(offset=0x3D, desc="32GT/s Equalization Control 29")
    eq_ctl_32gt_30: EqCtl32Gt0 = EqCtl32Gt0(offset=0x3E, desc="32GT/s Equalization Control 30")
    eq_ctl_32gt_31: EqCtl32Gt0 = EqCtl32Gt0(offset=0x3F, desc="32GT/s Equalization Control 31")


class ECapAltProtocol(PcieCapStructure):
    """Extended Capability for Alternate Protocol"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ALT_PROTOCOL
    offset: int = 0x00
    desc: str = "7.9.21 Alternate Protocol Extended Capability"


class ECapSfi(PcieCapStructure):
    """Extended Capability for System Firmware Intermediary"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.SFI
    offset: int = 0x00
    desc: str = "7.9.23 System Firmware Intermediary (SFI) Extended Capability"


class ECapDoe(PcieCapStructure):
    """Extended Capability for DOE"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DOE
    offset: int = 0x00
    desc: str = "Cap DOE"


class ECapIntegrityDoe(PcieCapStructure):
    """Extended Capability for Integrity DOE"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.INT_DOE
    offset: int = 0x00
    desc: str = "Int Cap DOE"
str = "Int Cap DOE" + + +class PcieCfgSpace(BaseModel): + """Holds the base registers and capability structures of a PCIe device + + - type_0_configuration: Type 0 Configuration Space, this is both the shared registers and the type0 specific registers + - type_1_configuration: Type 1 Configuration Space, this is both the shared registers and the type1 specific registers + - capability_pointers: A dictionary of capability pointers to the offset of the capability structure + - extended_capability_pointers: A dictionary of extended capability pointers to the offset of the extended capability structure + - cap_structure: A dictionary of capability structures + - ecap_structure: A dictionary of extended capability structures + + """ + + type_0_configuration: Type0Configuration = Type0Configuration() + type_1_configuration: Type1Configuration = Type1Configuration() + capability_pointers: Dict[CapabilityEnum, int] = {} + extended_capability_pointers: Dict[ExtendedCapabilityEnum, int] = {} + # SerializeAsAny is used to allow for the structure to be any of the capability structures so all registers and fields are dumped + cap_structure: Dict[CapabilityEnum, SerializeAsAny[PcieCapStructure]] = {} + ecap_structure: Dict[ExtendedCapabilityEnum, SerializeAsAny[PcieCapStructure]] = {} + + def get_struct(self, struct: type[AnyCap]) -> Optional[AnyCap]: + """Get a structure from the cap_structure or ecap_structure based on the type + + Parameters + ---------- + struct : type[AnyCap] + The structure to get from the cap_structure or ecap_structure + + Returns + ------- + Optional[AnyCap] + The structure if it exists, otherwise None + """ + if struct == Type0Configuration: + return self.type_0_configuration # type: ignore[return-value] + if struct == Type1Configuration: + return self.type_1_configuration # type: ignore[return-value] + + if hasattr(struct, "cap_id"): + cap = self.cap_structure.get(struct.cap_id, None) # type: ignore[attr-defined] + if cap: + return cap # type: 
ignore[return-value] + ecap = self.ecap_structure.get(struct.cap_id, None) # type: ignore[attr-defined] + if ecap: + return ecap # type: ignore[return-value] + return None + + @field_validator("extended_capability_pointers", mode="before") + @classmethod + def str_to_enum_extended(cls, dict_in: Dict[str, int]) -> Dict[Enum, int]: + """Converts a dictionary with string keys to Enum keys + + Parameters + ---------- + dict_in : Dict[str, int] + The dictionary to convert + + Returns + ------- + dict[Enum, int] + The dictionary with Enum keys + """ + dict_out: Dict[Enum, int] = {} + for k, v in dict_in.items(): + if isinstance(k, str): + dict_out[ExtendedCapabilityEnum(int(k))] = v + return dict_out + + @field_validator("capability_pointers", mode="before") + @classmethod + def str_to_enum(cls, dict_in: Dict[str, int]) -> Dict[Enum, int]: + """Converts a dictionary with string keys to Enum keys + + Parameters + ---------- + dict_in : Dict[str, int] + The dictionary to convert + + Returns + ------- + dict[Enum, int] + The dictionary with Enum keys + """ + dict_out: Dict[Enum, int] = {} + for k, v in dict_in.items(): + if isinstance(k, str): + dict_out[CapabilityEnum(int(k))] = v + else: + dict_out[k] = v + return dict_out + + @field_validator("cap_structure", mode="before") + @classmethod + def validate_cap_structure( + cls, cap_in: Dict[Union[int, str, CapabilityEnum], SerializeAsAny[PcieCapStructure]] + ) -> Dict[CapabilityEnum, PcieCapStructure]: + """This adjust's a generic PcieCapStructure dict into a specific PcieCapStructure and therefore populating all registers and fields""" + return cls.conform_json_dict_to_cap_struct(cap_in, CapabilityEnum) # type: ignore[arg-type, return-value] + + @field_validator("ecap_structure", mode="before") + @classmethod + def validate_ecap_structure( + cls, + ecap_in: Dict[Union[int, str, ExtendedCapabilityEnum], SerializeAsAny[PcieCapStructure]], + ) -> Dict[ExtendedCapabilityEnum, PcieCapStructure]: + """This adjust's a generic 
PcieCapStructure dict into a specific PcieCapStructure and therefore populating all registers and fields""" + return cls.conform_json_dict_to_cap_struct(ecap_in, ExtendedCapabilityEnum) # type: ignore[arg-type, return-value] + + @classmethod + def conform_json_dict_to_cap_struct( + cls, + cap_structure_in: Dict[Union[str, int, Enum], PcieCapStructure], + enum_type: type[Enum], + ) -> Dict[Enum, PcieCapStructure]: + """This is needed for when the model is loaded from a json/dict. Since the type of PcieCapStructure + does not fully describe which cap structure it is and which registers it has, pydantic just assumes + it is the base class. To override this behaviour the cap_id is used to discover which structure it + really should be. This is only done if the value of the validated attribute is a dict + + Parameters + ---------- + cap_structure_in : Dict[Union[str, int, Enum], PcieCapStructure] + A capability structure to fix from json input + enum_type : type[Enum] + Which enum to use for values + + Returns + ------- + dict[Enum, PcieCapStructure] + A dict where the values are now the fully defined structure instead of the base class + """ + cap_out: Dict[Enum, PcieCapStructure] = {} + for k, v in cap_structure_in.items(): + if isinstance(v, dict): + if isinstance(k, str): + enum = enum_type(int(k)) + elif isinstance(k, enum_type): + enum = k + cls = cap_id_to_class(enum) + cap_out[enum] = cls(**v) + else: + cap_out[k] = v # type: ignore[index] + return cap_out + + +class PcieDataModel(DataModel): + """class for collection of PCIe data. + + Optionals are used to allow for the data to be missing, + This makes the data class more flexible for the analyzer + which consumes only the required data. If any more data is + required for the analyzer then they should not be set to + default. 
+ + - pcie_cfg_space: A dictionary of PCIe cfg space for the GPUs obtained with setpci command + - lspci_verbose: Verbose collection of PCIe data + - lspci_verbose_tree: Tree view of PCIe data + - lspci_path: Path view of PCIe data for the GPUs + - lspci_hex: Hex view of PCIe data for the GPUs + + """ + + pcie_cfg_space: Dict[BdfStr, PcieCfgSpace] + vf_pcie_cfg_space: Optional[Dict[BdfStr, PcieCfgSpace]] = None diff --git a/nodescraper/plugins/inband/pcie/pcie_plugin.py b/nodescraper/plugins/inband/pcie/pcie_plugin.py new file mode 100644 index 00000000..0e4f3eb0 --- /dev/null +++ b/nodescraper/plugins/inband/pcie/pcie_plugin.py @@ -0,0 +1,43 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from nodescraper.base import InBandDataPlugin + +from .analyzer_args import PcieAnalyzerArgs +from .pcie_analyzer import PcieAnalyzer +from .pcie_collector import PcieCollector +from .pcie_data import PcieDataModel + + +class PciePlugin(InBandDataPlugin[PcieDataModel, None, PcieAnalyzerArgs]): + """Plugin for collection and analysis of PCIe data""" + + DATA_MODEL = PcieDataModel + + COLLECTOR = PcieCollector + + ANALYZER = PcieAnalyzer + + ANALYZER_ARGS = PcieAnalyzerArgs diff --git a/nodescraper/plugins/inband/rocm/analyzer_args.py b/nodescraper/plugins/inband/rocm/analyzer_args.py index 40a11ebc..ff0751eb 100644 --- a/nodescraper/plugins/inband/rocm/analyzer_args.py +++ b/nodescraper/plugins/inband/rocm/analyzer_args.py @@ -25,13 +25,15 @@ ############################################################################### from typing import Union -from pydantic import BaseModel, Field, field_validator +from pydantic import Field, field_validator +from nodescraper.models.analyzerargs import AnalyzerArgs from nodescraper.plugins.inband.rocm.rocmdata import RocmDataModel -class RocmAnalyzerArgs(BaseModel): +class RocmAnalyzerArgs(AnalyzerArgs): exp_rocm: Union[str, list] = Field(default_factory=list) + exp_rocm_latest: str = Field(default="") @field_validator("exp_rocm", mode="before") @classmethod @@ -59,4 +61,6 @@ def build_from_model(cls, datamodel: RocmDataModel) -> "RocmAnalyzerArgs": Returns: RocmAnalyzerArgs: instance of analyzer args class """ - return cls(exp_rocm=datamodel.rocm_version) + return cls( + exp_rocm=datamodel.rocm_version, exp_rocm_latest=datamodel.rocm_latest_versioned_path + ) diff --git a/nodescraper/plugins/inband/rocm/rocm_analyzer.py b/nodescraper/plugins/inband/rocm/rocm_analyzer.py index 17762af7..1131d665 100644 --- a/nodescraper/plugins/inband/rocm/rocm_analyzer.py +++ b/nodescraper/plugins/inband/rocm/rocm_analyzer.py @@ -61,17 +61,40 @@ def 
analyze_data( if data.rocm_version == rocm_version: self.result.message = "ROCm version matches expected" self.result.status = ExecutionStatus.OK + break + else: + # No matching version found + self.result.message = ( + f"ROCm version mismatch! Expected: {args.exp_rocm}, actual: {data.rocm_version}" + ) + self.result.status = ExecutionStatus.ERROR + self._log_event( + category=EventCategory.SW_DRIVER, + description=f"{self.result.message}", + data={"expected": args.exp_rocm, "actual": data.rocm_version}, + priority=EventPriority.CRITICAL, + console_log=True, + ) + return self.result + + # validate rocm_latest if provided in args + if args.exp_rocm_latest: + if data.rocm_latest_versioned_path != args.exp_rocm_latest: + self.result.message = f"ROCm latest path mismatch! Expected: {args.exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}" + self.result.status = ExecutionStatus.ERROR + self._log_event( + category=EventCategory.SW_DRIVER, + description=f"{self.result.message}", + data={ + "expected": args.exp_rocm_latest, + "actual": data.rocm_latest_versioned_path, + }, + priority=EventPriority.CRITICAL, + console_log=True, + ) return self.result + else: + # Update message to include rocm_latest validation result + self.result.message = f"ROCm version matches expected. ROCm latest path validated: {data.rocm_latest_versioned_path}" - self.result.message = ( - f"ROCm version mismatch! 
Expected: {rocm_version}, actual: {args.exp_rocm}" - ) - self.result.status = ExecutionStatus.ERROR - self._log_event( - category=EventCategory.SW_DRIVER, - description=f"{self.result.message}", - data={"expected": args.exp_rocm, "actual": data.rocm_version}, - priority=EventPriority.CRITICAL, - console_log=True, - ) return self.result diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index 37470f68..f7692e45 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -26,8 +26,10 @@ from typing import Optional from nodescraper.base import InBandDataCollector +from nodescraper.connection.inband import TextFileArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult +from nodescraper.utils import strip_ansi_codes from .rocmdata import RocmDataModel @@ -42,6 +44,14 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]): "/opt/rocm/.info/version-rocm", "/opt/rocm/.info/version", ] + CMD_ROCMINFO = "{rocm_path}/bin/rocminfo" + CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1" + CMD_ROCM_DIRS = "ls -v -d /opt/rocm*" + CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*" + CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'" + CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'" + CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo" + CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/" def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: """Collect ROCm version data from the system. @@ -49,33 +59,134 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: Returns: tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available. 
""" - version_paths = [ - "/opt/rocm/.info/version-rocm", - "/opt/rocm/.info/version", - ] - rocm_data = None for path in self.CMD_VERSION_PATHS: res = self._run_sut_cmd(f"grep . {path}") if res.exit_code == 0: - rocm_data = RocmDataModel(rocm_version=res.stdout) - self._log_event( - category="ROCM_VERSION_READ", - description="ROCm version data collected", - data=rocm_data.model_dump(), - priority=EventPriority.INFO, - ) - self.result.message = f"ROCm: {rocm_data.model_dump()}" - self.result.status = ExecutionStatus.OK - break + try: + rocm_data = RocmDataModel(rocm_version=res.stdout) + self._log_event( + category="ROCM_VERSION_READ", + description="ROCm version data collected", + data=rocm_data.model_dump(include={"rocm_version"}), + priority=EventPriority.INFO, + ) + self.result.message = f"ROCm version: {rocm_data.rocm_version}" + self.result.status = ExecutionStatus.OK + break + except ValueError as e: + self._log_event( + category=EventCategory.OS, + description=f"Invalid ROCm version format: {res.stdout}", + data={"version": res.stdout, "error": str(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + self.result.message = f"Invalid ROCm version format: {res.stdout}" + self.result.status = ExecutionStatus.ERROR + return self.result, None else: self._log_event( category=EventCategory.OS, - description=f"Unable to read ROCm version from {version_paths}", + description=f"Unable to read ROCm version from {self.CMD_VERSION_PATHS}", data={"raw_output": res.stdout}, priority=EventPriority.ERROR, ) + # Collect additional ROCm data if version was found + if rocm_data: + # Collect latest versioned ROCm path (rocm-[3-7]*) + versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST) + if versioned_path_res.exit_code == 0: + rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip() + + # Collect all ROCm paths as list + all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS) + if all_paths_res.exit_code == 0: + rocm_data.rocm_all_paths = [ + 
path.strip() + for path in all_paths_res.stdout.strip().split("\n") + if path.strip() + ] + + # Determine ROCm path for commands that need it + rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm" + + # Collect rocminfo output as list of lines with ANSI codes stripped + rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path) + rocminfo_res = self._run_sut_cmd(rocminfo_cmd) + rocminfo_artifact_content = "" + if rocminfo_res.exit_code == 0: + # Split into lines and strip ANSI codes from each line + rocm_data.rocminfo = [ + strip_ansi_codes(line) for line in rocminfo_res.stdout.strip().split("\n") + ] + rocminfo_artifact_content += "=" * 80 + "\n" + rocminfo_artifact_content += "ROCMINFO OUTPUT\n" + rocminfo_artifact_content += "=" * 80 + "\n\n" + rocminfo_artifact_content += rocminfo_res.stdout + + # Collect ld.so.conf ROCm entries + ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF) + if ld_conf_res.exit_code == 0: + rocm_data.ld_conf_rocm = [ + line.strip() for line in ld_conf_res.stdout.strip().split("\n") if line.strip() + ] + + # Collect ROCm libraries from ldconfig + rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS) + if rocm_libs_res.exit_code == 0: + rocm_data.rocm_libs = [ + line.strip() + for line in rocm_libs_res.stdout.strip().split("\n") + if line.strip() + ] + + # Collect ROCm-related environment variables + env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS) + if env_vars_res.exit_code == 0: + rocm_data.env_vars = [ + line.strip() for line in env_vars_res.stdout.strip().split("\n") if line.strip() + ] + + # Collect clinfo output + clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path) + clinfo_res = self._run_sut_cmd(clinfo_cmd) + + # Always append clinfo section to artifact, even if empty or failed + if rocminfo_artifact_content: + rocminfo_artifact_content += "\n\n" + rocminfo_artifact_content += "=" * 80 + "\n" + rocminfo_artifact_content += "CLINFO OUTPUT\n" + rocminfo_artifact_content += "=" * 80 + "\n\n" + + if 
clinfo_res.exit_code == 0: + rocm_data.clinfo = [ + strip_ansi_codes(line) for line in clinfo_res.stdout.strip().split("\n") + ] + rocminfo_artifact_content += clinfo_res.stdout + else: + # Add error information if clinfo failed + rocminfo_artifact_content += f"Command: {clinfo_res.command}\n" + rocminfo_artifact_content += f"Exit Code: {clinfo_res.exit_code}\n" + if clinfo_res.stderr: + rocminfo_artifact_content += f"Error: {clinfo_res.stderr}\n" + if clinfo_res.stdout: + rocminfo_artifact_content += f"Output: {clinfo_res.stdout}\n" + + # Add combined rocminfo and clinfo output as a text file artifact + if rocminfo_artifact_content: + self.result.artifacts.append( + TextFileArtifact(filename="rocminfo.log", contents=rocminfo_artifact_content) + ) + + # Collect KFD process list + kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC) + if kfd_proc_res.exit_code == 0: + rocm_data.kfd_proc = [ + proc.strip() for proc in kfd_proc_res.stdout.strip().split("\n") if proc.strip() + ] + if not rocm_data: self._log_event( category=EventCategory.OS, diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index 2c5388e8..f0fb2618 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,6 +24,7 @@ # ############################################################################### import re +from typing import List from pydantic import field_validator @@ -32,6 +33,14 @@ class RocmDataModel(DataModel): rocm_version: str + rocminfo: List[str] = [] + rocm_latest_versioned_path: str = "" + rocm_all_paths: List[str] = [] + ld_conf_rocm: List[str] = [] + rocm_libs: List[str] = [] + env_vars: List[str] = [] + clinfo: List[str] = [] + kfd_proc: List[str] = [] @field_validator("rocm_version") @classmethod diff --git a/nodescraper/plugins/inband/storage/analyzer_args.py b/nodescraper/plugins/inband/storage/analyzer_args.py index 1f44d1d3..413c8ec0 100644 --- 
a/nodescraper/plugins/inband/storage/analyzer_args.py +++ b/nodescraper/plugins/inband/storage/analyzer_args.py @@ -25,10 +25,12 @@ ############################################################################### from typing import Optional -from pydantic import BaseModel, Field +from pydantic import Field +from nodescraper.models.analyzerargs import AnalyzerArgs -class StorageAnalyzerArgs(BaseModel): + +class StorageAnalyzerArgs(AnalyzerArgs): min_required_free_space_abs: Optional[str] = None min_required_free_space_prct: Optional[int] = None ignore_devices: Optional[list[str]] = Field(default_factory=list) diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 9b1fb88c..de3a0956 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -23,11 +23,12 @@ # SOFTWARE. # ############################################################################### +import inspect import os import re import traceback from enum import Enum -from typing import TypeVar +from typing import Any, List, Optional, Set, Type, TypeVar, Union, get_args, get_origin T = TypeVar("T") @@ -171,6 +172,50 @@ def bytes_to_human_readable(input_bytes: int) -> str: return f"{gb}GB" +def find_annotation_in_container( + annotation, target_type +) -> Union[tuple[Any, list[Any]], tuple[None, list[Any]]]: + """Recursively search for a target type in an annotation and return the target type and the containers + supported container types are generic types, Callable, Tuple, Union, Literal, Final, ClassVar + and Annotated. If the target type is not found then None is returned. + + Examples: + find_annotation_in_container(Union[int, str], int) -> int, [Union[int, str]] + find_annotation_in_container(Union[int, dict[str, list[MyClass]]], MyClass) -> MyClass, [list,dict,union] + find_annotation_in_container(Union[int, str], MyClass) -> None, [] + + Parameters + ---------- + annotation : type + A type annotation to search for the target type in. + target_type : type + The target type to search for. 
+ + Returns + ------- + Union[tuple[Any, list[Any]], tuple[None, list[Any]]] + The target type and the containers if found, otherwise None and an empty list. + """ + containers: list[Any] = [] + origin = get_origin(annotation) + args = get_args(annotation) + if len(args) == 0 and issubclass(annotation, target_type): + return annotation, containers + if isinstance(args, tuple): + for item in args: + item_args = get_args(item) + if len(item_args) > 0: + result, container = find_annotation_in_container(item, target_type) + containers += container + if result: + containers.append(origin) + return result, containers + if len(get_args(item)) == 0 and issubclass(item, target_type): + containers.append(origin) + return item, containers + return None, [] + + def shell_quote(s: str) -> str: """Single quote fix @@ -201,3 +246,167 @@ def nice_rotated_name(path: str, stem: str, prefix: str = "rotated_") -> str: middle = base[:-3] if base.endswith(".gz") else base return f"{prefix}{middle}.log" + + +def apply_bit_mask(in_hex: str, bit_mask_hex: str) -> Optional[str]: + """Extracts bit offset from bit mask, applies the bit mask and offset. + + Args: + in_hex (str): Hexadecimal input + bit_mask_hex (str): Hexadecimal bit mask + + Returns: + str: hexadecimal output after applying bit mask and offset + """ + if not is_hex(hex_in=in_hex) or not is_hex(hex_in=bit_mask_hex): + return None + in_dec = hex_to_int(in_hex) + bit_mask_dec = hex_to_int(bit_mask_hex) + bit_offset = get_bit_offset(bit_mask_hex) + if in_dec is None or bit_mask_dec is None or bit_offset is None: + return None + out_dec = (in_dec & bit_mask_dec) >> bit_offset + return hex(out_dec) + + +def apply_bit_mask_int(in_int: int, bit_mask_int: int) -> Optional[int]: + """Extracts bit offset from bit mask, applies the bit mask and offset. 
+ + Args: + in_int (int): integer input + bit_mask_int (int): integer bit mask + + Returns: + int: integer output after applying bit mask and offset + """ + out_int = (in_int & bit_mask_int) >> get_bit_offset_int(bit_mask_int) + return out_int + + +def get_bit_offset_int(bit_mask: int) -> int: + """Extracts the bit offset from bit mask. + For ex, bit_mask = 0x0010 (hex) -> 0b00010000 (bin) + Returns bit offset of 4 (bit position of the "1") + + Args: + bit_mask (int): hex bit mask + + Returns: + int: bit offset + """ + bit_pos = 0 + while bit_mask > 0: + if bit_mask % 2 == 1: + return bit_pos + bit_mask = bit_mask >> 1 + bit_pos += 1 + + return 0 + + +def get_bit_offset(bit_mask: str) -> Optional[int]: + """Extracts the bit offset from bit mask. + For ex, bit_mask = "0010" (hex) -> 0b00010000 (bin) + Returns bit offset of 4 (bit position of the "1") + + Args: + bit_mask (str): hex bit mask + + Returns: + int: bit offset + """ + bit_mask_int = hex_to_int(bit_mask) + bit_pos = 0 + if bit_mask_int is None: + return None + while bit_mask_int > 0: + if bit_mask_int % 2 == 1: + return bit_pos + bit_mask_int = bit_mask_int >> 1 + bit_pos += 1 + + return 0 + + +def get_all_subclasses(cls: Type[T]) -> Set[Type[T]]: + """Get an iterable with all subclasses of this class (not including this class) + Subclasses are presented in no particular order + + Returns: + An iterable of all subclasses of this class + """ + subclasses: Set[Type[T]] = set() + for subclass in cls.__subclasses__(): + subclasses = subclasses.union(get_all_subclasses(subclass)) + if not inspect.isabstract(subclass): + subclasses.add(subclass) + return subclasses + + +def get_subclass( + class_name: str, class_type: Type[T], sub_classes: Optional[List[Type[T]]] +) -> Optional[Type[T]]: + """get a subclass with a given name + + Args: + class_name (str): target sub class name + class_type (Type[T]): class type + sub_classes (Optional[List[Type[T]]]): list of sub classes to check + + Returns: + Optional[Type[T]]: 
sub class or None if no sub class with target name is found + """ + if not sub_classes: + sub_classes = list(get_all_subclasses(class_type)) + + for sub_class in sub_classes: + if sub_class.__name__ == class_name: + return sub_class + return None + + +def hex_to_int(hex_in: str) -> Optional[int]: + """Converts given hex string to int + + Args: + hex_in: hexadecimal string + + Returns: + int: hexadecimal converted to int + """ + try: + if not is_hex(hex_in): + return None + return int(hex_in, 16) + except TypeError: + return None + + +def is_hex(hex_in: str) -> bool: + """Returns True or False based on whether the input hexadecimal is indeed hexadecimal + + Args: + hex_in: hexadecimal string + + Returns: + bool: True/False whether the input hexadecimal is indeed hexadecimal + """ + if not hex_in: + return False + + hex_pattern = re.compile(r"^(0x)?[0-9a-fA-F]+$") + return bool(hex_pattern.fullmatch(hex_in)) + + +def strip_ansi_codes(text: str) -> str: + """ + Remove ANSI escape codes from text. + + Args: + text (str): The text string containing ANSI escape codes. + + Returns: + str: The text with ANSI escape codes removed. + """ + ansi_escape = re.compile(r"\x1b\[[0-9;]*m") + return ansi_escape.sub("", text) diff --git a/pyproject.toml b/pyproject.toml index cef078bb..8e673376 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "node-scraper" +name = "amd-node-scraper" version = "0.0.1" description = "A framework for automated error detection and data collection" authors = [] diff --git a/test/functional/__init__.py b/test/functional/__init__.py new file mode 100644 index 00000000..711ec35f --- /dev/null +++ b/test/functional/__init__.py @@ -0,0 +1,26 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +"""Functional tests for node-scraper.""" diff --git a/test/functional/conftest.py b/test/functional/conftest.py new file mode 100644 index 00000000..77ded955 --- /dev/null +++ b/test/functional/conftest.py @@ -0,0 +1,57 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +"""Shared fixtures for functional tests.""" + +import subprocess +import sys +from typing import List + +import pytest + + +@pytest.fixture +def run_cli_command(): + """Fixture that returns a function to run CLI commands.""" + + def _run_command(args: List[str], check: bool = False): + """Run a node-scraper CLI command. 
+ + Args: + args: List of command-line arguments + check: If True, raise CalledProcessError on non-zero exit + + Returns: + subprocess.CompletedProcess instance + """ + cmd = [sys.executable, "-m", "nodescraper.cli.cli"] + args + return subprocess.run( + cmd, + capture_output=True, + text=True, + check=check, + ) + + return _run_command diff --git a/test/functional/fixtures/bios_plugin_config.json b/test/functional/fixtures/bios_plugin_config.json new file mode 100644 index 00000000..10a0c76f --- /dev/null +++ b/test/functional/fixtures/bios_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "BiosPlugin": { + "analysis_args": { + "exp_bios_version": "3.5", + "regex_match": false + } + } + }, + "result_collators": {}, + "name": "BiosPlugin config", + "desc": "Config for testing BiosPlugin" +} diff --git a/test/functional/fixtures/cmdline_plugin_config.json b/test/functional/fixtures/cmdline_plugin_config.json new file mode 100644 index 00000000..2b6c5199 --- /dev/null +++ b/test/functional/fixtures/cmdline_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "CmdlinePlugin": { + "analysis_args": { + "required_cmdline": "selinux=0", + "banned_cmdline": [] + } + } + }, + "result_collators": {}, + "name": "CmdlinePlugin config", + "desc": "Config for testing CmdlinePlugin" +} diff --git a/test/functional/fixtures/dimm_plugin_config.json b/test/functional/fixtures/dimm_plugin_config.json new file mode 100644 index 00000000..e97bd039 --- /dev/null +++ b/test/functional/fixtures/dimm_plugin_config.json @@ -0,0 +1,9 @@ +{ + "global_args": {}, + "plugins": { + "DimmPlugin": {} + }, + "result_collators": {}, + "name": "DimmPlugin config", + "desc": "Config for testing DimmPlugin" +} diff --git a/test/functional/fixtures/dkms_plugin_config.json b/test/functional/fixtures/dkms_plugin_config.json new file mode 100644 index 00000000..6476fa93 --- /dev/null +++ b/test/functional/fixtures/dkms_plugin_config.json @@ -0,0 +1,15 @@ +{ 
+ "global_args": {}, + "plugins": { + "DkmsPlugin": { + "analysis_args": { + "dkms_status": "amdgpu/6.11", + "dkms_version": "dkms-3.1", + "regex_match": true + } + } + }, + "result_collators": {}, + "name": "DkmsPlugin config", + "desc": "Config for testing DkmsPlugin" +} diff --git a/test/functional/fixtures/dmesg_plugin_config.json b/test/functional/fixtures/dmesg_plugin_config.json new file mode 100644 index 00000000..6b40439a --- /dev/null +++ b/test/functional/fixtures/dmesg_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "DmesgPlugin": { + "analysis_args": { + "check_unknown_dmesg_errors": true, + "exclude_category": null + } + } + }, + "result_collators": {}, + "name": "DmesgPlugin config", + "desc": "Config for testing DmesgPlugin" +} diff --git a/test/functional/fixtures/journal_plugin_config.json b/test/functional/fixtures/journal_plugin_config.json new file mode 100644 index 00000000..2b379e23 --- /dev/null +++ b/test/functional/fixtures/journal_plugin_config.json @@ -0,0 +1,13 @@ +{ + "global_args": {}, + "plugins": { + "JournalPlugin": { + "collection_args": { + "boot": 2 + } + } + }, + "result_collators": {}, + "name": "JournalPlugin config", + "desc": "Config for testing JournalPlugin" +} diff --git a/test/functional/fixtures/kernel_module_plugin_config.json b/test/functional/fixtures/kernel_module_plugin_config.json new file mode 100644 index 00000000..52b5cc8c --- /dev/null +++ b/test/functional/fixtures/kernel_module_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "KernelModulePlugin": { + "analysis_args": { + "kernel_modules": {}, + "regex_filter": ["amd"] + } + } + }, + "result_collators": {}, + "name": "KernelModulePlugin config", + "desc": "Config for testing KernelModulePlugin" +} diff --git a/test/functional/fixtures/kernel_plugin_config.json b/test/functional/fixtures/kernel_plugin_config.json new file mode 100644 index 00000000..7c2cec92 --- /dev/null +++ 
b/test/functional/fixtures/kernel_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "KernelPlugin": { + "analysis_args": { + "exp_kernel": "5.11-generic", + "regex_match": false + } + } + }, + "result_collators": {}, + "name": "KernelPlugin config", + "desc": "Config for testing KernelPlugin" +} diff --git a/test/functional/fixtures/memory_plugin_config.json b/test/functional/fixtures/memory_plugin_config.json new file mode 100644 index 00000000..732e9847 --- /dev/null +++ b/test/functional/fixtures/memory_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "MemoryPlugin": { + "analysis_args": { + "ratio": 0.66, + "memory_threshold": "30Gi" + } + } + }, + "result_collators": {}, + "name": "MemoryPlugin config", + "desc": "Config for testing MemoryPlugin" +} diff --git a/test/functional/fixtures/network_plugin_config.json b/test/functional/fixtures/network_plugin_config.json new file mode 100644 index 00000000..aa4b6bc0 --- /dev/null +++ b/test/functional/fixtures/network_plugin_config.json @@ -0,0 +1,11 @@ +{ + "global_args": {}, + "plugins": { + "NetworkPlugin": { + "analysis_args": {} + } + }, + "result_collators": {}, + "name": "NetworkPlugin config", + "desc": "Config for testing NetworkPlugin" +} diff --git a/test/functional/fixtures/nvme_plugin_config.json b/test/functional/fixtures/nvme_plugin_config.json new file mode 100644 index 00000000..e7e2e77b --- /dev/null +++ b/test/functional/fixtures/nvme_plugin_config.json @@ -0,0 +1,9 @@ +{ + "global_args": {}, + "plugins": { + "NvmePlugin": {} + }, + "result_collators": {}, + "name": "NvmePlugin config", + "desc": "Config for testing NvmePlugin" +} diff --git a/test/functional/fixtures/os_plugin_config.json b/test/functional/fixtures/os_plugin_config.json new file mode 100644 index 00000000..583be54f --- /dev/null +++ b/test/functional/fixtures/os_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "OsPlugin": { + "analysis_args": { + 
"exp_os": "Ubuntu 22.04.2 LTS", + "exact_match": true + } + } + }, + "result_collators": {}, + "name": "OsPlugin config", + "desc": "Config for testing OsPlugin" +} diff --git a/test/functional/fixtures/package_plugin_config.json b/test/functional/fixtures/package_plugin_config.json new file mode 100644 index 00000000..538bd3a9 --- /dev/null +++ b/test/functional/fixtures/package_plugin_config.json @@ -0,0 +1,20 @@ +{ + "global_args": {}, + "plugins": { + "PackagePlugin": { + "collection_args": { + "rocm_regex": "rocm|hip|hsa|amdgpu", + "enable_rocm_regex": true + }, + "analysis_args": { + "exp_package_ver": { + "gcc": "11.4.0" + }, + "regex_match": false + } + } + }, + "result_collators": {}, + "name": "PackagePlugin config", + "desc": "Config for testing PackagePlugin" +} diff --git a/test/functional/fixtures/pcie_plugin_advanced_config.json b/test/functional/fixtures/pcie_plugin_advanced_config.json new file mode 100644 index 00000000..54812949 --- /dev/null +++ b/test/functional/fixtures/pcie_plugin_advanced_config.json @@ -0,0 +1,28 @@ +{ + "global_args": {}, + "plugins": { + "PciePlugin": { + "analysis_args": { + "exp_speed": 5, + "exp_width": 16, + "exp_sriov_count": 8, + "exp_gpu_count_override": 4, + "exp_max_payload_size": { + "29631": 256, + "29711": 512 + }, + "exp_max_rd_req_size": { + "29631": 512, + "29711": 1024 + }, + "exp_ten_bit_tag_req_en": { + "29631": 1, + "29711": 0 + } + } + } + }, + "result_collators": {}, + "name": "PciePlugin advanced config", + "desc": "Advanced config for testing PciePlugin with device-specific settings" +} diff --git a/test/functional/fixtures/pcie_plugin_config.json b/test/functional/fixtures/pcie_plugin_config.json new file mode 100644 index 00000000..cc78167e --- /dev/null +++ b/test/functional/fixtures/pcie_plugin_config.json @@ -0,0 +1,19 @@ +{ + "global_args": {}, + "plugins": { + "PciePlugin": { + "analysis_args": { + "exp_speed": 5, + "exp_width": 16, + "exp_sriov_count": 8, + "exp_gpu_count_override": 4, + 
"exp_max_payload_size": 256, + "exp_max_rd_req_size": 512, + "exp_ten_bit_tag_req_en": 1 + } + } + }, + "result_collators": {}, + "name": "PciePlugin config", + "desc": "Config for testing PciePlugin" +} diff --git a/test/functional/fixtures/process_plugin_config.json b/test/functional/fixtures/process_plugin_config.json new file mode 100644 index 00000000..752da786 --- /dev/null +++ b/test/functional/fixtures/process_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "ProcessPlugin": { + "analysis_args": { + "max_kfd_processes": 0, + "max_cpu_usage": 20.0 + } + } + }, + "result_collators": {}, + "name": "ProcessPlugin config", + "desc": "Config for testing ProcessPlugin" +} diff --git a/test/functional/fixtures/rocm_plugin_config.json b/test/functional/fixtures/rocm_plugin_config.json new file mode 100644 index 00000000..95665a6c --- /dev/null +++ b/test/functional/fixtures/rocm_plugin_config.json @@ -0,0 +1,13 @@ +{ + "global_args": {}, + "plugins": { + "RocmPlugin": { + "analysis_args": { + "exp_rocm": "7.0.0-38" + } + } + }, + "result_collators": {}, + "name": "RocmPlugin config", + "desc": "Config for testing RocmPlugin" +} diff --git a/test/functional/fixtures/storage_plugin_config.json b/test/functional/fixtures/storage_plugin_config.json new file mode 100644 index 00000000..16993ad1 --- /dev/null +++ b/test/functional/fixtures/storage_plugin_config.json @@ -0,0 +1,17 @@ +{ + "global_args": {}, + "plugins": { + "StoragePlugin": { + "analysis_args": { + "min_required_free_space_abs": null, + "min_required_free_space_prct": null, + "ignore_devices": [], + "check_devices": [], + "regex_match": false + } + } + }, + "result_collators": {}, + "name": "StoragePlugin config", + "desc": "Config for testing StoragePlugin" +} diff --git a/test/functional/fixtures/sysctl_plugin_config.json b/test/functional/fixtures/sysctl_plugin_config.json new file mode 100644 index 00000000..ee22b38f --- /dev/null +++ 
b/test/functional/fixtures/sysctl_plugin_config.json @@ -0,0 +1,23 @@ +{ + "global_args": {}, + "plugins": { + "SysctlPlugin": { + "analysis_args": { + "exp_vm_swappiness": null, + "exp_vm_numa_balancing": null, + "exp_vm_oom_kill_allocating_task": null, + "exp_vm_compaction_proactiveness": null, + "exp_vm_compact_unevictable_allowed": null, + "exp_vm_extfrag_threshold": null, + "exp_vm_zone_reclaim_mode": null, + "exp_vm_dirty_background_ratio": null, + "exp_vm_dirty_ratio": null, + "exp_vm_dirty_writeback_centisecs": null, + "exp_kernel_numa_balancing": null + } + } + }, + "result_collators": {}, + "name": "SysctlPlugin config", + "desc": "Config for testing SysctlPlugin" +} diff --git a/test/functional/fixtures/syslog_plugin_config.json b/test/functional/fixtures/syslog_plugin_config.json new file mode 100644 index 00000000..c67888c5 --- /dev/null +++ b/test/functional/fixtures/syslog_plugin_config.json @@ -0,0 +1,9 @@ +{ + "global_args": {}, + "plugins": { + "SyslogPlugin": {} + }, + "result_collators": {}, + "name": "SyslogPlugin config", + "desc": "Config for testing SyslogPlugin" +} diff --git a/test/functional/fixtures/uptime_plugin_config.json b/test/functional/fixtures/uptime_plugin_config.json new file mode 100644 index 00000000..c351525b --- /dev/null +++ b/test/functional/fixtures/uptime_plugin_config.json @@ -0,0 +1,9 @@ +{ + "global_args": {}, + "plugins": { + "UptimePlugin": {} + }, + "result_collators": {}, + "name": "UptimePlugin config", + "desc": "Config for testing UptimePlugin" +} diff --git a/test/functional/test_cli_describe.py b/test/functional/test_cli_describe.py new file mode 100644 index 00000000..52097a54 --- /dev/null +++ b/test/functional/test_cli_describe.py @@ -0,0 +1,55 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Functional tests for CLI describe command.""" + + +def test_describe_command_list_plugins(run_cli_command): + """Test that describe command can list all plugins.""" + result = run_cli_command(["describe", "plugin"]) + + assert result.returncode == 0 + assert len(result.stdout) > 0 + output = result.stdout.lower() + assert "available plugins" in output or "biosplugin" in output or "kernelplugin" in output + + +def test_describe_command_single_plugin(run_cli_command): + """Test that describe command can describe a single plugin.""" + result = run_cli_command(["describe", "plugin", "BiosPlugin"]) + + assert result.returncode == 0 + assert len(result.stdout) > 0 + output = result.stdout.lower() + assert "bios" in output + + +def test_describe_invalid_plugin(run_cli_command): + """Test that describe command handles invalid plugin gracefully.""" + result = run_cli_command(["describe", "plugin", "NonExistentPlugin"]) + + assert result.returncode != 0 + output = (result.stdout + result.stderr).lower() + assert "error" in output or "not found" in output or "invalid" in output diff --git a/test/functional/test_cli_help.py b/test/functional/test_cli_help.py new file mode 100644 index 00000000..f911eafd --- /dev/null +++ b/test/functional/test_cli_help.py @@ -0,0 +1,83 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+###############################################################################
+"""Functional tests for node-scraper CLI help commands."""
+
+import subprocess
+import sys
+
+
+def test_help_command():
+    """Test that 'python -m nodescraper.cli.cli -h' exits 0 and prints usage text."""
+    result = subprocess.run(
+        [sys.executable, "-m", "nodescraper.cli.cli", "-h"],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0
+    assert "usage:" in result.stdout.lower()
+    assert "node scraper" in result.stdout.lower()
+    assert "-h" in result.stdout or "--help" in result.stdout
+
+
+def test_help_command_long_form():
+    """Test that the --help long form prints the same usage information as -h."""
+    result = subprocess.run(
+        [sys.executable, "-m", "nodescraper.cli.cli", "--help"],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0
+    assert "usage:" in result.stdout.lower()
+    assert "node scraper" in result.stdout.lower()
+
+
+def test_no_arguments():
+    """Test that running with no arguments produces output mentioning plugins."""
+    result = subprocess.run(
+        [sys.executable, "-m", "nodescraper.cli.cli"],
+        capture_output=True,
+        text=True,
+        timeout=30,
+    )
+
+    assert len(result.stdout) > 0 or len(result.stderr) > 0
+    output = (result.stdout + result.stderr).lower()
+    assert "plugin" in output or "nodescraper" in output
+
+
+def test_help_shows_subcommands():
+    """Test that help output includes available subcommands."""
+    result = subprocess.run(
+        [sys.executable, "-m", "nodescraper.cli.cli", "-h"],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0
+    output = result.stdout.lower()
+    assert "run-plugins" in output or "commands:" in output or "positional arguments:" in output
diff --git a/test/functional/test_network_plugin.py b/test/functional/test_network_plugin.py
new file mode 100644
index 00000000..27776c8e
--- /dev/null
+++ b/test/functional/test_network_plugin.py
@@ -0,0 +1,106 @@
+############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+###############################################################################
+"""Functional tests for NetworkPlugin with --plugin-configs."""
+
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def fixtures_dir():
+    """Return path to fixtures directory."""
+    return Path(__file__).parent / "fixtures"
+
+
+@pytest.fixture
+def network_config_file(fixtures_dir):
+    """Return path to NetworkPlugin config file."""
+    return fixtures_dir / "network_plugin_config.json"
+
+
+def test_network_plugin_with_basic_config(run_cli_command, network_config_file, tmp_path):
+    """Test NetworkPlugin using basic config file; tolerates plugin-level failures (rc 0-2)."""
+    assert network_config_file.exists(), f"Config file not found: {network_config_file}"
+
+    log_path = str(tmp_path / "logs_network_basic")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", str(network_config_file)], check=False
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+    assert "networkplugin" in output.lower() or "network" in output.lower()
+
+
+def test_network_plugin_with_run_plugins_subcommand(run_cli_command, tmp_path):
+    """Test NetworkPlugin using run-plugins subcommand."""
+    log_path = str(tmp_path / "logs_network_subcommand")
+    result = run_cli_command(["--log-path", log_path, "run-plugins", "NetworkPlugin"], check=False)
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_network_plugin_with_passive_interaction(run_cli_command, network_config_file, tmp_path):
+    """Test NetworkPlugin with PASSIVE system interaction level."""
+    log_path = str(tmp_path / "logs_network_passive")
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--sys-interaction-level",
+            "PASSIVE",
+            "--plugin-configs",
+            str(network_config_file),
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_network_plugin_skip_sudo(run_cli_command, network_config_file, tmp_path):
+    """Test NetworkPlugin with --skip-sudo flag."""
+    log_path = str(tmp_path / "logs_network_no_sudo")
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--skip-sudo",
+            "--plugin-configs",
+            str(network_config_file),
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
diff --git a/test/functional/test_pcie_plugin.py b/test/functional/test_pcie_plugin.py
new file mode 100644
index 00000000..9d6c70c9
--- /dev/null
+++ b/test/functional/test_pcie_plugin.py
@@ -0,0 +1,148 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+"""Functional tests for PciePlugin with --plugin-configs."""
+
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def fixtures_dir():
+    """Return path to fixtures directory."""
+    return Path(__file__).parent / "fixtures"
+
+
+@pytest.fixture
+def pcie_config_file(fixtures_dir):
+    """Return path to PciePlugin config file."""
+    return fixtures_dir / "pcie_plugin_config.json"
+
+
+@pytest.fixture
+def pcie_advanced_config_file(fixtures_dir):
+    """Return path to PciePlugin advanced config file."""
+    return fixtures_dir / "pcie_plugin_advanced_config.json"
+
+
+def test_pcie_plugin_with_basic_config(run_cli_command, pcie_config_file, tmp_path):
+    """Test PciePlugin using basic config file; tolerates plugin-level failures (rc 0-2)."""
+    assert pcie_config_file.exists(), f"Config file not found: {pcie_config_file}"
+
+    log_path = str(tmp_path / "logs_pcie_basic")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", str(pcie_config_file)], check=False
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+    assert "pcieplugin" in output.lower() or "pcie" in output.lower()
+
+
+def test_pcie_plugin_with_advanced_config(run_cli_command, pcie_advanced_config_file, tmp_path):
+    """Test PciePlugin using advanced config with device-specific settings."""
+    assert pcie_advanced_config_file.exists(), f"Config file not found: {pcie_advanced_config_file}"
+
+    log_path = str(tmp_path / "logs_pcie_advanced")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", str(pcie_advanced_config_file)],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_pcie_plugin_with_run_plugins_subcommand(run_cli_command, tmp_path):
+    """Test PciePlugin using run-plugins subcommand."""
+    log_path = str(tmp_path / "logs_pcie_subcommand")
+    result = run_cli_command(["--log-path", log_path, "run-plugins", "PciePlugin"], check=False)
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_pcie_plugin_with_passive_interaction(run_cli_command, pcie_config_file, tmp_path):
+    """Test PciePlugin with PASSIVE system interaction level."""
+    log_path = str(tmp_path / "logs_pcie_passive")
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--sys-interaction-level",
+            "PASSIVE",
+            "--plugin-configs",
+            str(pcie_config_file),
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_pcie_plugin_skip_sudo(run_cli_command, pcie_config_file, tmp_path):
+    """Test PciePlugin with --skip-sudo flag."""
+    log_path = str(tmp_path / "logs_pcie_no_sudo")
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--skip-sudo",
+            "--plugin-configs",
+            str(pcie_config_file),
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_pcie_plugin_combined_configs(
+    run_cli_command, pcie_config_file, pcie_advanced_config_file, tmp_path
+):
+    """Test PciePlugin with multiple config files."""
+    log_path = str(tmp_path / "logs_pcie_combined")
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--plugin-configs",
+            str(pcie_config_file),
+            str(pcie_advanced_config_file),
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
diff --git a/test/functional/test_plugin_configs.py b/test/functional/test_plugin_configs.py
new file mode 100644
index 00000000..d42382ce
--- /dev/null
+++ b/test/functional/test_plugin_configs.py
@@ -0,0 +1,336 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+###############################################################################
+"""Functional tests for --plugin-configs CLI argument."""
+
+import json
+import os
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def fixtures_dir():
+    """Return path to fixtures directory."""
+    return Path(__file__).parent / "fixtures"
+
+
+@pytest.fixture
+def plugin_config_files(fixtures_dir):
+    """Return dict mapping plugin names to their config file paths."""
+    return {
+        "BiosPlugin": fixtures_dir / "bios_plugin_config.json",
+        "CmdlinePlugin": fixtures_dir / "cmdline_plugin_config.json",
+        "DimmPlugin": fixtures_dir / "dimm_plugin_config.json",
+        "DkmsPlugin": fixtures_dir / "dkms_plugin_config.json",
+        "DmesgPlugin": fixtures_dir / "dmesg_plugin_config.json",
+        "JournalPlugin": fixtures_dir / "journal_plugin_config.json",
+        "KernelPlugin": fixtures_dir / "kernel_plugin_config.json",
+        "KernelModulePlugin": fixtures_dir / "kernel_module_plugin_config.json",
+        "MemoryPlugin": fixtures_dir / "memory_plugin_config.json",
+        "NvmePlugin": fixtures_dir / "nvme_plugin_config.json",
+        "OsPlugin": fixtures_dir / "os_plugin_config.json",
+        "PackagePlugin": fixtures_dir / "package_plugin_config.json",
+        "ProcessPlugin": fixtures_dir / "process_plugin_config.json",
+        "RocmPlugin": fixtures_dir / "rocm_plugin_config.json",
+        "StoragePlugin": fixtures_dir / "storage_plugin_config.json",
+        "SysctlPlugin": fixtures_dir / "sysctl_plugin_config.json",
+        "SyslogPlugin": fixtures_dir / "syslog_plugin_config.json",
+        "UptimePlugin": fixtures_dir / "uptime_plugin_config.json",
+    }
+
+
+@pytest.fixture
+def sample_plugin_config(tmp_path):
+    """Create a sample plugin config JSON file and return its path as a string."""
+    config = {
+        "name": "TestConfig",
+        "desc": "A test configuration",
+        "global_args": {},
+        "plugins": {
+            "BiosPlugin": {},
+            "OsPlugin": {},
+        },
+        "result_collators": {},
+    }
+    config_file = tmp_path / "test_config.json"
+    config_file.write_text(json.dumps(config, indent=2))
+    return str(config_file)
+
+
+@pytest.fixture
+def invalid_plugin_config(tmp_path):
+    """Create an invalid JSON file."""
+    config_file = tmp_path / "invalid_config.json"
+    config_file.write_text("{ invalid json content")
+    return str(config_file)
+
+
+def test_plugin_config_with_builtin_config(run_cli_command, tmp_path):
+    """Test using a built-in config name."""
+    log_path = str(tmp_path / "logs_builtin")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", "NodeStatus"], check=False
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+@pytest.mark.parametrize(
+    "plugin_name",
+    [
+        "BiosPlugin",
+        "CmdlinePlugin",
+        "DimmPlugin",
+        "DkmsPlugin",
+        "DmesgPlugin",
+        "JournalPlugin",
+        "KernelPlugin",
+        "KernelModulePlugin",
+        "MemoryPlugin",
+        "NvmePlugin",
+        "OsPlugin",
+        "PackagePlugin",
+        "ProcessPlugin",
+        "RocmPlugin",
+        "StoragePlugin",
+        "SysctlPlugin",
+        "SyslogPlugin",
+        "UptimePlugin",
+    ],
+)
+def test_individual_plugin_with_config_file(
+    run_cli_command, plugin_name, plugin_config_files, tmp_path
+):
+    """Test each plugin using its dedicated config file."""
+    config_file = plugin_config_files[plugin_name]
+
+    assert config_file.exists(), f"Config file not found: {config_file}"
+
+    log_path = str(tmp_path / f"logs_{plugin_name.lower()}")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+    assert plugin_name.lower() in output.lower() or "plugin" in output.lower()
+
+
+def test_plugin_config_with_custom_json_file(run_cli_command, sample_plugin_config, tmp_path):
+    """Test using a custom JSON config file path."""
+    log_path = str(tmp_path / "logs_custom")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", sample_plugin_config], check=False
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_plugin_config_with_multiple_configs(run_cli_command, plugin_config_files, tmp_path):
+    """Test using multiple plugin configs."""
+    log_path = str(tmp_path / "logs_multiple")
+    bios_config = str(plugin_config_files["BiosPlugin"])
+    os_config = str(plugin_config_files["OsPlugin"])
+
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--plugin-configs",
+            bios_config,
+            os_config,
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_plugin_config_with_nonexistent_file(run_cli_command, tmp_path):
+    """Test that a nonexistent config file path fails gracefully."""
+    nonexistent_path = str(tmp_path / "nonexistent_config.json")
+    result = run_cli_command(["--plugin-configs", nonexistent_path], check=False)
+
+    assert result.returncode != 0
+    output = (result.stdout + result.stderr).lower()
+    assert "error" in output or "no plugin config found" in output
+
+
+def test_plugin_config_with_invalid_builtin_name(run_cli_command):
+    """Test that an invalid built-in config name fails gracefully."""
+    result = run_cli_command(["--plugin-configs", "NonExistentConfig"], check=False)
+
+    assert result.returncode != 0
+    output = (result.stdout + result.stderr).lower()
+    assert "error" in output or "no plugin config found" in output
+
+
+def test_plugin_config_with_invalid_json(run_cli_command, invalid_plugin_config):
+    """Test that an invalid JSON file fails gracefully."""
+    result = run_cli_command(["--plugin-configs", invalid_plugin_config], check=False)
+
+    assert result.returncode != 0
+    output = (result.stdout + result.stderr).lower()
+    assert "error" in output or "invalid" in output or "json" in output
+
+
+def test_plugin_config_empty_list(run_cli_command, tmp_path):
+    """Test --plugin-configs with no values; presumably falls back to default config -- TODO confirm."""
+    log_path = str(tmp_path / "logs_empty")
+    result = run_cli_command(["--log-path", log_path, "--plugin-configs"], check=False)
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_plugin_config_with_system_interaction_level(
+    run_cli_command, plugin_config_files, tmp_path
+):
+    """Test plugin config with different system interaction levels."""
+    log_path = str(tmp_path / "logs_passive")
+    config_file = str(plugin_config_files["UptimePlugin"])
+
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--sys-interaction-level",
+            "PASSIVE",
+            "--plugin-configs",
+            config_file,
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_plugin_config_combined_with_run_plugins(run_cli_command, plugin_config_files, tmp_path):
+    """Test that plugin config can be combined with run-plugins subcommand."""
+    log_path = str(tmp_path / "logs_combined")
+    config_file = str(plugin_config_files["MemoryPlugin"])
+
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--plugin-configs",
+            config_file,
+            "run-plugins",
+            "UptimePlugin",
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+    output = result.stdout + result.stderr
+    assert len(output) > 0
+
+
+def test_plugin_config_verify_log_output(run_cli_command, plugin_config_files, tmp_path):
+    """Test that a successful run creates a log directory under the requested --log-path."""
+    log_path = str(tmp_path / "logs_verify")
+    config_file = str(plugin_config_files["OsPlugin"])
+
+    result = run_cli_command(["--log-path", log_path, "--plugin-configs", config_file], check=False)
+
+    log_dirs = [d for d in os.listdir(tmp_path) if d.startswith("logs_verify")]
+    if result.returncode in [0, 1]:
+        assert len(log_dirs) > 0
+
+
+def test_all_plugin_config_files_exist(plugin_config_files):
+    """Verify all plugin config fixture files exist and reference their plugin."""
+    for plugin_name, config_file in plugin_config_files.items():
+        assert config_file.exists(), f"Missing config file for {plugin_name}: {config_file}"
+
+        with open(config_file) as f:
+            config = json.load(f)
+            assert "plugins" in config
+            assert plugin_name in config["plugins"]
+
+
+def test_dmesg_plugin_log_dmesg_data_false(run_cli_command, tmp_path):
+    """Test DmesgPlugin with log_dmesg_data=false doesn't write dmesg.log."""
+    config = {
+        "name": "DmesgNoLogConfig",
+        "desc": "DmesgPlugin config with log_dmesg_data disabled",
+        "global_args": {},
+        "plugins": {"DmesgPlugin": {"collection_args": {"log_dmesg_data": False}}},
+        "result_collators": {},
+    }
+    config_file = tmp_path / "dmesg_no_log_config.json"
+    config_file.write_text(json.dumps(config, indent=2))
+
+    log_path = str(tmp_path / "logs_dmesg_no_log")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False
+    )
+
+    assert result.returncode in [0, 1, 2]
+
+    dmesg_plugin_dir = Path(log_path) / "dmesg_plugin" / "dmesg_collector"
+    if dmesg_plugin_dir.exists():
+        dmesg_log_files = list(dmesg_plugin_dir.glob("dmesg*.log"))
+        assert (
+            len(dmesg_log_files) == 0
+        ), f"Found dmesg log files when log_dmesg_data=False: {dmesg_log_files}"
+
+
+def test_dmesg_plugin_log_dmesg_data_true(run_cli_command, tmp_path):
+    """Test DmesgPlugin with log_dmesg_data=true writes dmesg.log."""
+    config = {
+        "name": "DmesgWithLogConfig",
+        "desc": "DmesgPlugin config with log_dmesg_data enabled",
+        "global_args": {},
+        "plugins": {"DmesgPlugin": {"collection_args": {"log_dmesg_data": True}}},
+        "result_collators": {},
+    }
+    config_file = tmp_path / "dmesg_with_log_config.json"
+    config_file.write_text(json.dumps(config, indent=2))
+
+    log_path = str(tmp_path / "logs_dmesg_with_log")
+    result = run_cli_command(
+        ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False
+    )
+
+    if result.returncode in [0, 1]:
+        dmesg_plugin_dir = Path(log_path) / "dmesg_plugin" / "dmesg_collector"
+        if dmesg_plugin_dir.exists():
+            dmesg_log_files = list(dmesg_plugin_dir.glob("dmesg*.log"))
+            assert len(dmesg_log_files) > 0, "Expected dmesg.log file when log_dmesg_data=True"
diff --git a/test/functional/test_plugin_registry.py b/test/functional/test_plugin_registry.py
new file mode 100644
index 00000000..77d352f7
--- /dev/null
+++ b/test/functional/test_plugin_registry.py
@@ -0,0 +1,75 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+"""Functional tests for plugin registry and plugin loading."""
+
+import inspect
+
+from nodescraper.pluginregistry import PluginRegistry
+
+
+def test_plugin_registry_loads_plugins():
+    """Test that PluginRegistry loads the expected built-in plugins (Bios/Kernel/Os)."""
+    registry = PluginRegistry()
+
+    assert len(registry.plugins) > 0
+    plugin_names = [name.lower() for name in registry.plugins.keys()]
+    expected_plugins = ["biosplugin", "kernelplugin", "osplugin"]
+
+    for expected in expected_plugins:
+        assert expected in plugin_names
+
+
+def test_plugin_registry_has_connection_managers():
+    """Test that PluginRegistry loads connection managers."""
+    registry = PluginRegistry()
+
+    assert len(registry.connection_managers) > 0
+    conn_names = [name.lower() for name in registry.connection_managers.keys()]
+    assert "inbandconnectionmanager" in conn_names
+
+
+def test_plugin_registry_list_plugins():
+    """Test that PluginRegistry stores plugins as a dict of name -> plugin class."""
+    registry = PluginRegistry()
+    plugin_dict = registry.plugins
+
+    assert isinstance(plugin_dict, dict)
+    assert len(plugin_dict) > 0
+    assert all(isinstance(name, str) for name in plugin_dict.keys())
+    assert all(inspect.isclass(cls) for cls in plugin_dict.values())
+
+
+def test_plugin_registry_get_plugin():
+    """Test that a plugin looked up by name is a class exposing a 'run' attribute."""
+    registry = PluginRegistry()
+    plugin_names = list(registry.plugins.keys())
+    assert len(plugin_names) > 0
+
+    first_plugin_name = plugin_names[0]
+    plugin = registry.plugins[first_plugin_name]
+
+    assert plugin is not None
+    assert hasattr(plugin, "run")
diff --git a/test/functional/test_reference_config_workflow.py b/test/functional/test_reference_config_workflow.py
new file mode 100644
index 00000000..44362149
--- /dev/null
+++ b/test/functional/test_reference_config_workflow.py
@@ -0,0 +1,345 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+"""
+Functional tests for reference config generation and usage workflow.
+
+Tests the complete workflow:
+1. Generate reference config from system using --gen-reference-config
+2. Use the generated config with --plugin-configs
+"""
+import json
+from pathlib import Path
+
+import pytest
+
+from nodescraper.pluginregistry import PluginRegistry
+
+
+def find_reference_config(log_path):
+    """Find reference_config.json in the most recent timestamped log directory.
+
+    Args:
+        log_path: Base log path where logs are stored
+
+    Returns:
+        Path to reference_config.json or None if not found
+    """
+    log_path = Path(log_path)
+    if not log_path.exists():
+        return None
+
+    log_dirs = list(log_path.glob("scraper_logs_*"))
+    if not log_dirs:
+        return None
+
+    most_recent = max(log_dirs, key=lambda p: p.stat().st_mtime)
+
+    reference_config = most_recent / "reference_config.json"
+    if reference_config.exists():
+        return reference_config
+
+    return None
+
+
+@pytest.fixture(scope="module")
+def all_plugin_names():
+    """Get sorted list of all available plugin names from the registry."""
+    registry = PluginRegistry()
+    return sorted(registry.plugins.keys())
+
+
+def test_gen_reference_config_all_plugins(run_cli_command, tmp_path, all_plugin_names):
+    """Test generating reference config with all plugins via run-plugins subcommand.
+
+    Note: When running all plugins, some may fail but as long as at least one succeeds,
+    the reference config should be generated.
+    """
+    log_path = str(tmp_path / "logs_gen_ref_all")
+
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--gen-reference-config",
+            "run-plugins",
+        ]
+        + all_plugin_names,
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2, 120], (  # NOTE(review): confirm which failure mode yields 120
+        f"Unexpected return code: {result.returncode}\n"
+        f"stdout: {result.stdout[:500]}\nstderr: {result.stderr[:500]}"
+    )
+
+    reference_config_path = find_reference_config(log_path)
+
+    if reference_config_path is None:
+        pytest.skip(
+            "reference_config.json was not created - likely all plugins failed or timed out. "
+            "This can happen in test environments."
+        )
+
+    assert reference_config_path.exists()
+
+    with open(reference_config_path) as f:
+        config = json.load(f)
+        assert "plugins" in config
+        assert isinstance(config["plugins"], dict)
+        assert len(config["plugins"]) > 0
+
+
+def test_gen_reference_config_subset_plugins(run_cli_command, tmp_path):
+    """Test generating reference config with a subset of plugins."""
+    log_path = str(tmp_path / "logs_gen_ref_subset")
+    plugins = ["BiosPlugin", "OsPlugin", "KernelPlugin"]
+
+    result = run_cli_command(
+        ["--log-path", log_path, "--gen-reference-config", "run-plugins"] + plugins,
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+
+    reference_config_path = find_reference_config(log_path)
+    assert reference_config_path is not None, "reference_config.json was not created"
+    assert reference_config_path.exists()
+
+    with open(reference_config_path) as f:
+        config = json.load(f)
+        assert "plugins" in config
+
+
+def test_use_generated_reference_config(run_cli_command, tmp_path):
+    """Test using a generated reference config with --plugin-configs."""
+    gen_log_path = str(tmp_path / "logs_gen")
+    use_log_path = str(tmp_path / "logs_use")
+
+    plugins = ["BiosPlugin", "OsPlugin", "UptimePlugin"]
+
+    gen_result = run_cli_command(
+        ["--log-path", gen_log_path, "--gen-reference-config", "run-plugins"] + plugins,
+        check=False,
+    )
+
+    assert gen_result.returncode in [0, 1, 2]
+
+    reference_config_path = find_reference_config(gen_log_path)
+    assert reference_config_path is not None, "reference_config.json was not created"
+    assert reference_config_path.exists()
+
+    use_result = run_cli_command(
+        ["--log-path", use_log_path, "--plugin-configs", str(reference_config_path)],
+        check=False,
+    )
+
+    assert use_result.returncode in [0, 1, 2]
+    output = use_result.stdout + use_result.stderr
+    assert len(output) > 0
+
+
+def test_full_workflow_all_plugins(run_cli_command, tmp_path, all_plugin_names):
+    """
+    Test complete workflow: generate reference config from all plugins,
+    then use it with --plugin-configs.
+
+    Note: May skip if plugins fail to generate config in test environment.
+    """
+    gen_log_path = str(tmp_path / "logs_gen_workflow")
+    use_log_path = str(tmp_path / "logs_use_workflow")
+
+    gen_result = run_cli_command(
+        [
+            "--log-path",
+            gen_log_path,
+            "--gen-reference-config",
+            "run-plugins",
+        ]
+        + all_plugin_names,
+        check=False,
+    )
+
+    assert gen_result.returncode in [0, 1, 2, 120], (
+        f"Generation failed with return code {gen_result.returncode}\n"
+        f"stdout: {gen_result.stdout[:500]}\n"
+        f"stderr: {gen_result.stderr[:500]}"
+    )
+
+    reference_config_path = find_reference_config(gen_log_path)
+
+    if reference_config_path is None:
+        pytest.skip(
+            "reference_config.json was not generated - plugins may have failed in test environment"
+        )
+
+    assert reference_config_path.exists()
+
+    with open(reference_config_path) as f:
+        config = json.load(f)
+        assert "plugins" in config, "Config missing 'plugins' key"
+
+        for _plugin_name, plugin_config in config["plugins"].items():
+            if "analysis_args" in plugin_config:
+                assert isinstance(plugin_config["analysis_args"], dict)
+
+    use_result = run_cli_command(
+        ["--log-path", use_log_path, "--plugin-configs", str(reference_config_path)],
+        check=False,
+    )
+
+    assert use_result.returncode in [0, 1, 2], (
+        f"Using config failed with return code {use_result.returncode}\n"
+        f"stdout: {use_result.stdout}\n"
+        f"stderr: {use_result.stderr}"
+    )
+
+    output = use_result.stdout + use_result.stderr
+    assert len(output) > 0, "No output generated when using reference config"
+
+    use_log_dirs = list(Path(tmp_path).glob("logs_use_workflow*"))
+    assert len(use_log_dirs) > 0, "No log directory created when using config"
+
+
+def test_reference_config_with_analysis_args(run_cli_command, tmp_path):
+    """Test that generated reference config includes analysis_args where available."""
+    log_path = str(tmp_path / "logs_analysis_args")
+
+    plugins_with_build_from_model = [
+        "BiosPlugin",
+        "CmdlinePlugin",
+        "DeviceEnumerationPlugin",
+        "DkmsPlugin",
+        "KernelPlugin",
+        "KernelModulePlugin",
+        "MemoryPlugin",
+        "OsPlugin",
+        "PackagePlugin",
+        "ProcessPlugin",
+        "RocmPlugin",
+        "SysctlPlugin",
+    ]
+
+    result = run_cli_command(
+        ["--log-path", log_path, "--gen-reference-config", "run-plugins"]
+        + plugins_with_build_from_model,
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2, 120]
+
+    reference_config_path = find_reference_config(log_path)
+
+    if reference_config_path is None:
+        pytest.skip(
+            "reference_config.json was not created - plugins may have failed in test environment"
+        )
+
+    assert reference_config_path.exists()
+
+    with open(reference_config_path) as f:
+        config = json.load(f)
+        plugins_with_args = [
+            name for name, conf in config["plugins"].items() if "analysis_args" in conf
+        ]
+        assert len(plugins_with_args) > 0, "No plugins have analysis_args in generated config"
+
+
+def test_reference_config_structure(run_cli_command, tmp_path):
+    """Test that generated reference config has correct structure."""
+    log_path = str(tmp_path / "logs_structure")
+
+    result = run_cli_command(
+        ["--log-path", log_path, "--gen-reference-config", "run-plugins", "OsPlugin"],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+
+    reference_config_path = find_reference_config(log_path)
+    assert reference_config_path is not None, "reference_config.json was not created"
+    assert reference_config_path.exists()
+
+    with open(reference_config_path) as f:
+        config = json.load(f)
+
+        assert "plugins" in config
+        assert isinstance(config["plugins"], dict)
+
+        if "OsPlugin" in config["plugins"]:
+            os_config = config["plugins"]["OsPlugin"]
+            if "analysis_args" in os_config:
+                assert "exp_os" in os_config["analysis_args"]
+
+
+def test_gen_reference_config_without_run_plugins(run_cli_command, tmp_path):
+    """Test generating reference config without specifying plugins (uses default)."""
+    log_path = str(tmp_path / "logs_default")
+
+    result = run_cli_command(
+        ["--log-path", log_path, "--gen-reference-config"],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+
+    reference_config_path = find_reference_config(log_path)
+    assert reference_config_path is not None, "reference_config.json was not created"
+    assert reference_config_path.exists()
+
+    with open(reference_config_path) as f:
+        config = json.load(f)
+        assert "plugins" in config
+
+
+def test_reference_config_json_valid(run_cli_command, tmp_path):
+    """Test that generated reference config is valid, round-trippable JSON."""
+    log_path = str(tmp_path / "logs_valid_json")
+
+    result = run_cli_command(
+        [
+            "--log-path",
+            log_path,
+            "--gen-reference-config",
+            "run-plugins",
+            "BiosPlugin",
+            "OsPlugin",
+        ],
+        check=False,
+    )
+
+    assert result.returncode in [0, 1, 2]
+
+    reference_config_path = find_reference_config(log_path)
+    assert reference_config_path is not None, "reference_config.json was not created"
+    assert reference_config_path.exists()
+
+    with open(reference_config_path) as f:
+        config = json.load(f)
+        json_str = json.dumps(config, indent=2)
+        assert len(json_str) > 0
+
+        reparsed = json.loads(json_str)
+        assert reparsed == config
diff --git a/test/functional/test_run_plugins.py b/test/functional/test_run_plugins.py
new file mode 100644
index 00000000..0253784e
--- /dev/null
+++ b/test/functional/test_run_plugins.py
@@ -0,0 +1,116 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Functional tests for running individual plugins.""" + +import pytest + +from nodescraper.pluginregistry import PluginRegistry + + +@pytest.fixture(scope="module") +def all_plugins(): + """Get list of all available plugin names.""" + registry = PluginRegistry() + return sorted(registry.plugins.keys()) + + +def test_plugin_registry_has_plugins(all_plugins): + """Verify that plugins are available for testing.""" + assert len(all_plugins) > 0 + + +@pytest.mark.parametrize( + "plugin_name", + [ + "BiosPlugin", + "CmdlinePlugin", + "DimmPlugin", + "DkmsPlugin", + "DmesgPlugin", + "JournalPlugin", + "KernelPlugin", + "KernelModulePlugin", + "MemoryPlugin", + "NetworkPlugin", + "NvmePlugin", + "OsPlugin", + "PackagePlugin", + "ProcessPlugin", + "RocmPlugin", + "StoragePlugin", + "SysctlPlugin", + "SyslogPlugin", + "UptimePlugin", + ], +) +def test_run_individual_plugin(run_cli_command, plugin_name, tmp_path): + """Test running each plugin individually.""" + log_path = str(tmp_path / f"logs_{plugin_name}") + result = run_cli_command(["--log-path", log_path, "run-plugins", plugin_name], check=False) + + assert result.returncode in [0, 1, 2] + output = result.stdout + result.stderr + assert len(output) > 0 + assert plugin_name.lower() in output.lower() + + +def test_run_all_plugins_together(run_cli_command, all_plugins, tmp_path): + """Test running all plugins together.""" + plugins_to_run = all_plugins[:3] + log_path = str(tmp_path / "logs_multiple") + result = run_cli_command(["--log-path", log_path, "run-plugins"] + plugins_to_run, check=False) + + assert result.returncode in [0, 1, 2] + output = result.stdout + result.stderr + assert len(output) > 0 + + +def test_run_plugin_with_invalid_name(run_cli_command): + """Test that running a non-existent plugin logs a warning and falls back to default config.""" + result = run_cli_command(["run-plugins", "NonExistentPlugin"], check=False) + + 
# Invalid plugin is ignored and default config runs instead + # Exit code depends on whether default config plugins succeed + output = result.stdout + result.stderr + # Check that warning was logged for invalid plugin + assert "Invalid plugin name(s) ignored: NonExistentPlugin" in output + # Check that default config was used + assert "running default config" in output.lower() or "NodeStatus" in output + # Verify it didn't crash + assert "Data written to csv file" in output + + +def test_run_comma_separated_plugins_with_invalid(run_cli_command): + """Test that comma-separated plugins run valid ones and ignore invalid ones.""" + result = run_cli_command(["run-plugins", "AmdSmiPlugin,SomePlugin"], check=False) + + output = result.stdout + result.stderr + # Check that warning was logged for invalid plugin + assert "Invalid plugin name(s) ignored: SomePlugin" in output + # Check that AmdSmiPlugin actually ran + assert "Running plugin AmdSmiPlugin" in output + # Verify it didn't crash + assert "Data written to csv file" in output diff --git a/test/unit/framework/test_analyzerargs.py b/test/unit/framework/test_analyzerargs.py index 3642a2b6..b2c8e996 100644 --- a/test/unit/framework/test_analyzerargs.py +++ b/test/unit/framework/test_analyzerargs.py @@ -1,3 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + import pytest from nodescraper.models import AnalyzerArgs diff --git a/test/unit/framework/test_cli.py b/test/unit/framework/test_cli.py index 79aca013..cd266ed9 100644 --- a/test/unit/framework/test_cli.py +++ b/test/unit/framework/test_cli.py @@ -115,12 +115,12 @@ def test_system_info_builder(): ( ["--sys-name", "test-sys", "--sys-sku", "test-sku"], ["TestPlugin1", "TestPlugin2"], - (["--sys-name", "test-sys", "--sys-sku", "test-sku"], {}), + (["--sys-name", "test-sys", "--sys-sku", "test-sku"], {}, []), ), ( ["--sys-name", "test-sys", "--sys-sku", "test-sku", "run-plugins", "-h"], ["TestPlugin1", "TestPlugin2"], - (["--sys-name", "test-sys", "--sys-sku", "test-sku", "run-plugins", "-h"], {}), + (["--sys-name", "test-sys", "--sys-sku", "test-sku", "run-plugins", "-h"], {}, []), ), ( [ @@ -143,6 +143,7 @@ def test_system_info_builder(): "TestPlugin1": ["--plugin1_arg", "test-val1"], "TestPlugin2": ["--plugin2_arg", "test-val2"], }, + [], ), ), ], diff --git a/test/unit/framework/test_file_artifact.py b/test/unit/framework/test_file_artifact.py index b6741d20..991fbf48 100644 --- a/test/unit/framework/test_file_artifact.py +++ b/test/unit/framework/test_file_artifact.py @@ -1,3 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + from pathlib import Path from nodescraper.connection.inband.inband import ( diff --git a/test/unit/plugin/test_amdsmi_analyzer.py b/test/unit/plugin/test_amdsmi_analyzer.py new file mode 100644 index 00000000..af7ab0f9 --- /dev/null +++ b/test/unit/plugin/test_amdsmi_analyzer.py @@ -0,0 +1,803 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +import pytest + +from nodescraper.enums import EventPriority +from nodescraper.plugins.inband.amdsmi.amdsmi_analyzer import AmdSmiAnalyzer +from nodescraper.plugins.inband.amdsmi.amdsmidata import ( + AmdSmiDataModel, + AmdSmiStatic, + AmdSmiTstData, + AmdSmiVersion, + EccState, + Fw, + FwListItem, + Partition, + PartitionCompute, + PartitionMemory, + Processes, + ProcessInfo, + ProcessListItem, + ProcessMemoryUsage, + ProcessUsage, + StaticAsic, + StaticBoard, + StaticBus, + StaticDriver, + StaticLimit, + StaticNuma, + StaticRas, + StaticVram, + ValueUnit, + XgmiLinkMetrics, + XgmiMetrics, +) +from nodescraper.plugins.inband.amdsmi.analyzer_args import AmdSmiAnalyzerArgs + + +@pytest.fixture +def mock_value_unit(): + """Factory fixture to create mock ValueUnit objects.""" + + def _create(value, unit): + return ValueUnit(value=value, unit=unit) + + return _create + + +@pytest.fixture +def mock_static_asic(): + """Create a mock StaticAsic object.""" + return StaticAsic( + market_name="AMD Instinct MI123", + vendor_id="0x1234", + vendor_name="Advanced Micro Devices Inc", + subvendor_id="0x1234", + device_id="0x12a0", + subsystem_id="0x0c12", + rev_id="0x00", + asic_serial="", + oam_id=0, + num_compute_units=111, + target_graphics_version="gfx123", + ) + + +@pytest.fixture +def mock_static_bus(mock_value_unit): + """Create a mock StaticBus object.""" + return StaticBus( + bdf="0000:01:00.0", + max_pcie_width=mock_value_unit(16, "x"), + max_pcie_speed=mock_value_unit(32, "GT/s"), + pcie_interface_version="Gen5", + slot_type="OAM", + ) + + +@pytest.fixture +def mock_static_limit(mock_value_unit): + """Create a mock StaticLimit object.""" + return StaticLimit( + max_power=mock_value_unit(550.0, "W"), + min_power=mock_value_unit(0, "W"), + socket_power=mock_value_unit(0, "W"), + slowdown_edge_temperature=mock_value_unit(100, "C"), + slowdown_hotspot_temperature=mock_value_unit(110, 
"C"), + slowdown_vram_temperature=mock_value_unit(95, "C"), + shutdown_edge_temperature=mock_value_unit(105, "C"), + shutdown_hotspot_temperature=mock_value_unit(115, "C"), + shutdown_vram_temperature=mock_value_unit(100, "C"), + ) + + +@pytest.fixture +def mock_static_driver(): + """Create a mock StaticDriver object.""" + return StaticDriver(name="amdgpu", version="1.2.3") + + +@pytest.fixture +def mock_static_board(): + """Create a mock StaticBoard object.""" + return StaticBoard( + model_number="", + product_serial="", + fru_id="", + product_name="", + manufacturer_name="", + ) + + +@pytest.fixture +def mock_static_numa(): + """Create a mock StaticNuma object.""" + return StaticNuma(node=0, affinity=0) + + +@pytest.fixture +def mock_static_vram(mock_value_unit): + """Create a mock StaticVram object.""" + return StaticVram( + type="sometype", + vendor="Some vendor", + size=mock_value_unit(192, "GB"), + bit_width=mock_value_unit(8192, "bit"), + max_bandwidth=None, + ) + + +@pytest.fixture +def mock_analyzer(system_info): + """Create a mock AmdSmiAnalyzer instance.""" + return AmdSmiAnalyzer(system_info) + + +def create_static_gpu( + gpu_id: int = 0, + max_power: float = 550.0, + driver_version: str = "1.2.3", + vendor_id: str = "0x1234", + subvendor_id: str = "0x1234", + device_id: str = "0x12a0", + subsystem_id: str = "0x0c12", + market_name: str = "AMD Instinct MI123", +) -> AmdSmiStatic: + """Helper function to create a mock AmdSmiStatic object for testing.""" + return AmdSmiStatic( + gpu=gpu_id, + asic=StaticAsic( + market_name=market_name, + vendor_id=vendor_id, + vendor_name="Advanced Micro Devices Inc", + subvendor_id=subvendor_id, + device_id=device_id, + subsystem_id=subsystem_id, + rev_id="0x00", + asic_serial="", + oam_id=0, + num_compute_units=111, + target_graphics_version="gfx123", + ), + bus=StaticBus( + bdf="0000:01:00.0", + max_pcie_width=ValueUnit(value=16, unit="x"), + max_pcie_speed=ValueUnit(value=32, unit="GT/s"), + 
pcie_interface_version="Gen5", + slot_type="OAM", + ), + vbios=None, + limit=StaticLimit( + max_power=ValueUnit(value=max_power, unit="W"), + min_power=ValueUnit(value=0, unit="W"), + socket_power=ValueUnit(value=0, unit="W"), + slowdown_edge_temperature=ValueUnit(value=100, unit="C"), + slowdown_hotspot_temperature=ValueUnit(value=110, unit="C"), + slowdown_vram_temperature=ValueUnit(value=95, unit="C"), + shutdown_edge_temperature=ValueUnit(value=105, unit="C"), + shutdown_hotspot_temperature=ValueUnit(value=115, unit="C"), + shutdown_vram_temperature=ValueUnit(value=100, unit="C"), + ), + driver=StaticDriver(name="amdgpu", version=driver_version), + board=StaticBoard( + model_number="", + product_serial="", + fru_id="", + product_name="", + manufacturer_name="", + ), + ras=StaticRas( + eeprom_version="1.0", + parity_schema=EccState.ENABLED, + single_bit_schema=EccState.ENABLED, + double_bit_schema=EccState.ENABLED, + poison_schema=EccState.ENABLED, + ecc_block_state={}, + ), + soc_pstate=None, + xgmi_plpd=None, + process_isolation="NONE", + numa=StaticNuma(node=0, affinity=0), + vram=StaticVram( + type="sometype", + vendor="Some vendor", + size=ValueUnit(value=192, unit="GB"), + bit_width=ValueUnit(value=8192, unit="bit"), + max_bandwidth=None, + ), + cache_info=[], + partition=None, + clock=None, + ) + + +def test_check_expected_max_power_success(mock_analyzer): + """Test check_expected_max_power passes when all GPUs have correct max power.""" + analyzer = mock_analyzer + + static_data = [ + create_static_gpu(0, max_power=550.0), + create_static_gpu(1, max_power=550.0), + ] + + analyzer.check_expected_max_power(static_data, 550) + + assert len(analyzer.result.events) == 0 + + +def test_check_expected_max_power_mismatch(mock_analyzer): + """Test check_expected_max_power logs error when GPU max power doesn't match.""" + analyzer = mock_analyzer + + static_data = [ + create_static_gpu(0, max_power=550.0), + create_static_gpu(1, max_power=450.0), + ] + + 
analyzer.check_expected_max_power(static_data, 550) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].category == "PLATFORM" + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "Max power mismatch" in analyzer.result.events[0].description + + +def test_check_expected_max_power_missing(mock_analyzer): + """Test check_expected_max_power handles missing max_power gracefully.""" + analyzer = mock_analyzer + + gpu_no_limit = create_static_gpu(0, max_power=550.0) + gpu_no_limit.limit = None + + static_data = [gpu_no_limit] + + analyzer.check_expected_max_power(static_data, 550) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.WARNING + assert "has no max power limit set" in analyzer.result.events[0].description + + +def test_check_expected_driver_version_success(mock_analyzer): + """Test check_expected_driver_version passes when all GPUs have correct driver.""" + analyzer = mock_analyzer + + static_data = [ + create_static_gpu(0, driver_version="1.2.3"), + create_static_gpu(1, driver_version="1.2.3"), + ] + + analyzer.check_expected_driver_version(static_data, "1.2.3") + + assert len(analyzer.result.events) == 0 + + +def test_check_expected_driver_version_mismatch(mock_analyzer): + """Test check_expected_driver_version logs error when driver versions don't match.""" + analyzer = mock_analyzer + + static_data = [ + create_static_gpu(0, driver_version="1.2.3"), + create_static_gpu(1, driver_version="6.7.0"), + ] + + analyzer.check_expected_driver_version(static_data, "1.2.3") + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].category == "PLATFORM" + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "Driver Version Mismatch" in analyzer.result.events[0].description + + +def test_expected_gpu_processes_success(mock_analyzer): + """Test expected_gpu_processes passes when process count is below threshold.""" + 
analyzer = mock_analyzer + + processes_data = [ + Processes( + gpu=0, + process_list=[ + ProcessListItem( + process_info=ProcessInfo( + name="test_process", + pid=1234, + memory_usage=ProcessMemoryUsage(gtt_mem=None, cpu_mem=None, vram_mem=None), + mem_usage=None, + usage=ProcessUsage(gfx=None, enc=None), + ) + ), + ProcessListItem( + process_info=ProcessInfo( + name="test_process2", + pid=5678, + memory_usage=ProcessMemoryUsage(gtt_mem=None, cpu_mem=None, vram_mem=None), + mem_usage=None, + usage=ProcessUsage(gfx=None, enc=None), + ) + ), + ], + ), + ] + + analyzer.expected_gpu_processes(processes_data, 5) + + assert len(analyzer.result.events) == 0 + + +def test_expected_gpu_processes_exceeds(mock_analyzer): + """Test expected_gpu_processes logs error when process count exceeds threshold.""" + analyzer = mock_analyzer + + processes_data = [ + Processes( + gpu=0, + process_list=[ + ProcessListItem( + process_info=ProcessInfo( + name=f"process_{i}", + pid=i, + memory_usage=ProcessMemoryUsage(gtt_mem=None, cpu_mem=None, vram_mem=None), + mem_usage=None, + usage=ProcessUsage(gfx=None, enc=None), + ) + ) + for i in range(10) + ], + ), + ] + + analyzer.expected_gpu_processes(processes_data, 5) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "Number of processes exceeds max processes" in analyzer.result.events[0].description + + +def test_expected_gpu_processes_no_data(mock_analyzer): + """Test expected_gpu_processes handles missing process data.""" + analyzer = mock_analyzer + + analyzer.expected_gpu_processes(None, 5) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.WARNING + assert "No GPU processes data available" in analyzer.result.events[0].description + + +def test_static_consistancy_check_success(mock_analyzer): + """Test static_consistancy_check passes when all GPUs have consistent data.""" + analyzer = mock_analyzer + + static_data = 
[ + create_static_gpu(0), + create_static_gpu(1), + ] + + analyzer.static_consistancy_check(static_data) + + assert len(analyzer.result.events) == 0 + + +def test_static_consistancy_check_inconsistent(mock_analyzer): + """Test static_consistancy_check logs warning when GPU data is inconsistent.""" + analyzer = mock_analyzer + + static_data = [ + create_static_gpu(0, vendor_id="0x1234"), + create_static_gpu(1, vendor_id="0x1003"), + ] + + analyzer.static_consistancy_check(static_data) + + assert len(analyzer.result.events) >= 1 + assert analyzer.result.events[0].priority == EventPriority.WARNING + + +def test_check_static_data_success(mock_analyzer): + """Test check_static_data passes when all GPUs match expected configuration.""" + analyzer = mock_analyzer + + static_data = [ + create_static_gpu(0), + ] + + analyzer.check_static_data( + static_data, + vendor_id="0x1234", + subvendor_id="0x1234", + device_id=("0x12a0", "0x12a0"), + subsystem_id=("0x0c12", "0x0c12"), + sku_name="AMD Instinct MI123", + ) + + assert len(analyzer.result.events) == 0 + + +def test_check_static_data_mismatch(mock_analyzer): + """Test check_static_data logs error when GPU configuration doesn't match.""" + analyzer = mock_analyzer + + static_data = [ + create_static_gpu(0, device_id="0x74a1"), + ] + + analyzer.check_static_data( + static_data, + vendor_id="0x1234", + subvendor_id="0x1234", + device_id=("0x12a0", "0x12a0"), + subsystem_id=("0x0c12", "0x0c12"), + sku_name="AMD Instinct MI123", + ) + + assert len(analyzer.result.events) >= 1 + + +def test_check_pldm_version_success(mock_analyzer): + """Test check_pldm_version passes when PLDM version matches.""" + analyzer = mock_analyzer + + firmware_data = [ + Fw( + gpu=0, + fw_list=[ + FwListItem(fw_id="PLDM_BUNDLE", fw_version="1.2.3"), + ], + ), + ] + + analyzer.check_pldm_version(firmware_data, "1.2.3") + + assert len(analyzer.result.events) == 0 + + +def test_check_pldm_version_mismatch(mock_analyzer): + """Test check_pldm_version logs 
error when PLDM version doesn't match.""" + analyzer = mock_analyzer + + firmware_data = [ + Fw( + gpu=0, + fw_list=[ + FwListItem(fw_id="PLDM_BUNDLE", fw_version="1.2.3"), + ], + ), + ] + + analyzer.check_pldm_version(firmware_data, "1.2.4") + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.ERROR + + +def test_check_pldm_version_missing(mock_analyzer): + """Test check_pldm_version handles missing PLDM firmware.""" + analyzer = mock_analyzer + + firmware_data = [ + Fw( + gpu=0, + fw_list=[ + FwListItem(fw_id="OTHER_FW", fw_version="1.0.0"), + ], + ), + ] + + analyzer.check_pldm_version(firmware_data, "1.2.3") + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.ERROR + + +def test_check_expected_memory_partition_mode_success(mock_analyzer): + """Test check_expected_memory_partition_mode passes when partition modes match.""" + analyzer = mock_analyzer + + partition_data = Partition( + memory_partition=[ + PartitionMemory(gpu_id=0, partition_type="NPS1"), + PartitionMemory(gpu_id=1, partition_type="NPS1"), + ], + compute_partition=[ + PartitionCompute(gpu_id=0, partition_type="SPX"), + PartitionCompute(gpu_id=1, partition_type="SPX"), + ], + ) + + analyzer.check_expected_memory_partition_mode(partition_data, "NPS1", "SPX") + + assert len(analyzer.result.events) == 0 + + +def test_check_expected_memory_partition_mode_mismatch(mock_analyzer): + """Test check_expected_memory_partition_mode logs error when modes don't match.""" + analyzer = mock_analyzer + + partition_data = Partition( + memory_partition=[ + PartitionMemory(gpu_id=0, partition_type="NPS1"), + PartitionMemory(gpu_id=1, partition_type="NPS4"), + ], + compute_partition=[ + PartitionCompute(gpu_id=0, partition_type="SPX"), + PartitionCompute(gpu_id=1, partition_type="SPX"), + ], + ) + + analyzer.check_expected_memory_partition_mode(partition_data, "NPS1", "SPX") + + assert len(analyzer.result.events) >= 
0 + + +def test_check_expected_xgmi_link_speed_success(mock_analyzer): + """Test check_expected_xgmi_link_speed passes when XGMI speed matches.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=32.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + XgmiMetrics( + gpu=1, + bdf="0000:02:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=32.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 0 + + +def test_check_expected_xgmi_link_speed_mismatch(mock_analyzer): + """Test check_expected_xgmi_link_speed logs error when speed doesn't match.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=25.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].category == "IO" + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "XGMI link speed is not as expected" in analyzer.result.events[0].description + + +def test_check_expected_xgmi_link_speed_multiple_valid_speeds(mock_analyzer): + """Test check_expected_xgmi_link_speed with multiple valid speeds.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=36.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + XgmiMetrics( + gpu=1, + bdf="0000:02:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=38.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + 
), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[36.0, 38.0]) + + assert len(analyzer.result.events) == 0 + + +def test_check_expected_xgmi_link_speed_no_data(mock_analyzer): + """Test check_expected_xgmi_link_speed handles missing XGMI data.""" + analyzer = mock_analyzer + + analyzer.check_expected_xgmi_link_speed(None, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.WARNING + assert "XGMI link speed data is not available" in analyzer.result.events[0].description + + +def test_check_expected_xgmi_link_speed_missing_bit_rate(mock_analyzer): + """Test check_expected_xgmi_link_speed handles missing bit rate value.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=None, + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "XGMI link speed is not available" in analyzer.result.events[0].description + + +def test_check_amdsmitst_success(mock_analyzer): + """Test check_amdsmitst passes when no tests failed.""" + analyzer = mock_analyzer + + tst_data = AmdSmiTstData( + passed_tests=["test1", "test2", "test3"], + skipped_tests=[], + failed_tests=[], + failed_test_count=0, + ) + + analyzer.check_amdsmitst(tst_data) + + assert len(analyzer.result.events) == 0 + + +def test_check_amdsmitst_failures(mock_analyzer): + """Test check_amdsmitst logs error when tests failed.""" + analyzer = mock_analyzer + + tst_data = AmdSmiTstData( + passed_tests=["test1", "test2"], + skipped_tests=["test3"], + failed_tests=["test4", "test5"], + failed_test_count=2, + ) + + analyzer.check_amdsmitst(tst_data) + + assert len(analyzer.result.events) == 1 + assert 
analyzer.result.events[0].category == "APPLICATION" + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "2 failed tests running amdsmitst" in analyzer.result.events[0].description + assert analyzer.result.events[0].data["failed_test_count"] == 2 + assert analyzer.result.events[0].data["failed_tests"] == ["test4", "test5"] + + +def test_analyze_data_full_workflow(mock_analyzer): + """Test full analyze_data workflow with various checks.""" + analyzer = mock_analyzer + + data = AmdSmiDataModel( + version=AmdSmiVersion( + tool="amdsmi", + version="1.2.3", + amdsmi_library_version="1.2.3", + rocm_version="6.1.0", + ), + static=[ + create_static_gpu(0, max_power=550.0, driver_version="1.2.3"), + create_static_gpu(1, max_power=550.0, driver_version="1.2.3"), + ], + process=[ + Processes( + gpu=0, + process_list=[ + ProcessListItem( + process_info=ProcessInfo( + name="test", + pid=1234, + memory_usage=ProcessMemoryUsage( + gtt_mem=None, cpu_mem=None, vram_mem=None + ), + mem_usage=None, + usage=ProcessUsage(gfx=None, enc=None), + ) + ), + ], + ), + ], + firmware=[ + Fw(gpu=0, fw_list=[FwListItem(fw_id="PLDM_BUNDLE", fw_version="1.2.3")]), + ], + partition=None, + gpu_list=None, + xgmi_metric=[ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=32.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ], + amdsmitst_data=AmdSmiTstData( + passed_tests=["test1", "test2"], + skipped_tests=[], + failed_tests=[], + failed_test_count=0, + ), + ) + + args = AmdSmiAnalyzerArgs( + expected_max_power=550, + expected_driver_version="1.2.3", + expected_gpu_processes=10, + expected_xgmi_speed=[32.0], + ) + + result = analyzer.analyze_data(data, args) + + assert len(result.events) == 0 + + +def test_analyze_data_no_static_data(mock_analyzer): + """Test analyze_data when no static data is available.""" + analyzer = mock_analyzer + + data = AmdSmiDataModel( + version=None, + 
static=None, + process=None, + firmware=None, + partition=None, + gpu_list=None, + ) + + result = analyzer.analyze_data(data, None) + + assert len(result.events) >= 1 + assert any("No AMD SMI static data available" in event.description for event in result.events) diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py new file mode 100644 index 00000000..51e2bcab --- /dev/null +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -0,0 +1,485 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from typing import Any +from unittest.mock import MagicMock + +import pytest + +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.plugins.inband.amdsmi.amdsmi_collector import AmdSmiCollector + + +def make_cmd_result(stdout: str, stderr: str = "", exit_code: int = 0) -> MagicMock: + """Create a mock command result""" + result = MagicMock() + result.stdout = stdout + result.stderr = stderr + result.exit_code = exit_code + return result + + +def make_json_response(data: Any) -> str: + """Convert data to JSON string""" + return json.dumps(data) + + +@pytest.fixture +def mock_commands(monkeypatch): + """Mock all amd-smi commands with sample data""" + + def mock_run_sut_cmd(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + + if "version --json" in cmd: + return make_cmd_result( + make_json_response( + [{"tool": "amdsmi", "amdsmi_library_version": "1.2.3", "rocm_version": "6.1.0"}] + ) + ) + + if "list --json" in cmd: + return make_cmd_result( + make_json_response( + [ + { + "gpu": 0, + "bdf": "0000:0b:00.0", + "uuid": "GPU-UUID-123", + "kfd_id": 7, + "node_id": 3, + "partition_id": 0, + } + ] + ) + ) + + if "process --json" in cmd: + return make_cmd_result( + make_json_response( + [ + { + "gpu": 0, + "process_list": [ + { + "name": "python", + "pid": 4242, + "mem": 1024, + "engine_usage": {"gfx": 1000000, "enc": 0}, + "memory_usage": { + "gtt_mem": 0, + "cpu_mem": 4096, + "vram_mem": 2048, + }, + "cu_occupancy": 12, + }, + { + "name": "test", + "pid": 9999, + "mem": 0, + "engine_usage": {"gfx": 0, "enc": 0}, + "memory_usage": {"gtt_mem": 0, "cpu_mem": 0, "vram_mem": 0}, + "cu_occupancy": 0, + }, + ], + } + ] + ) + ) + + if "partition --json" in cmd: + json_output = ( + make_json_response( + [{"gpu": 0, "memory_partition": "NPS1", "compute_partition": "CPX_DISABLED"}] + ) + + "\n" + + 
make_json_response( + [{"gpu": 1, "memory_partition": "NPS1", "compute_partition": "CPX_DISABLED"}] + ) + + "\n" + + make_json_response( + [{"gpu_id": "N/A", "profile_index": "N/A", "partition_id": "0"}] + ) + + "\n\nLegend:\n * = Current mode" + ) + return make_cmd_result(json_output) + + if "firmware --json" in cmd: + return make_cmd_result( + make_json_response( + [ + { + "gpu": 0, + "fw_list": [ + {"fw_name": "SMU", "fw_version": "55.33"}, + {"fw_name": "VBIOS", "fw_version": "V1"}, + ], + } + ] + ) + ) + + if "static -g all --json" in cmd: + return make_cmd_result( + make_json_response( + { + "gpu_data": [ + { + "gpu": 0, + "asic": { + "market_name": "SomeGPU", + "vendor_id": "1002", + "vendor_name": "AMD", + "subvendor_id": "1ABC", + "device_id": "0x1234", + "subsystem_id": "0x5678", + "rev_id": "A1", + "asic_serial": "ASERIAL", + "oam_id": 0, + "num_compute_units": 224, + "target_graphics_version": "GFX940", + "vram_type": "HBM3", + "vram_vendor": "Micron", + "vram_bit_width": 4096, + }, + "board": { + "model_number": "Board-42", + "product_serial": "SN0001", + "fru_id": "FRU-1", + "product_name": "ExampleBoard", + "manufacturer_name": "ACME", + }, + "bus": { + "bdf": "0000:0b:00.0", + "max_pcie_width": 16, + "max_pcie_speed": 16.0, + "pcie_interface_version": "PCIe 5.0", + "slot_type": "PCIe", + }, + "vbios": { + "vbios_name": "vbiosA", + "vbios_build_date": "2024-01-01", + "vbios_part_number": "PN123", + "vbios_version": "V1", + }, + "driver": {"driver_name": "amdgpu", "driver_version": "6.1.0"}, + "numa": {"node": 3, "affinity": 0}, + "vram": { + "vram_type": "HBM3", + "vram_vendor": "Micron", + "vram_bit_width": 4096, + "vram_size_mb": 65536, + }, + "cache": { + "cache": [ + { + "cache_level": 1, + "max_num_cu_shared": 8, + "num_cache_instance": 32, + "cache_size": 262144, + "cache_properties": "PropertyA, PropertyB; PropertyC", + } + ] + }, + "clock": {"frequency": [500, 1500, 2000], "current": 1}, + "soc_pstate": {}, + "xgmi_plpd": {}, + } + ] + } + ) 
+ ) + + return make_cmd_result("", f"Unknown command: {cmd}", 1) + + return mock_run_sut_cmd + + +@pytest.fixture +def collector(mock_commands, conn_mock, system_info, monkeypatch): + """Create a collector with mocked commands""" + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_commands) + return c + + +def test_check_amdsmi_installed(collector): + """Test that _check_amdsmi_installed works""" + assert collector._check_amdsmi_installed() is True + + +def test_check_amdsmi_not_installed(conn_mock, system_info, monkeypatch): + """Test when amd-smi is not installed""" + + def mock_which_fail(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("", "no amd-smi in /usr/bin", 1) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_which_fail) + + result, data = c.collect_data() + assert data is None + assert result.status.name == "NOT_RAN" + + +def test_collect_data(collector): + """Test full data collection""" + result, data = collector.collect_data() + assert data is not None + assert data.version is not None + assert data.version.tool == "amdsmi" + assert data.version.version == "1.2.3" + assert data.version.rocm_version == "6.1.0" + + # gpu_list + assert data.gpu_list is not None and len(data.gpu_list) == 1 + assert data.gpu_list[0].bdf == "0000:0b:00.0" + assert data.gpu_list[0].uuid == "GPU-UUID-123" + assert data.gpu_list[0].kfd_id == 7 + assert data.gpu_list[0].node_id == 3 + + # processes + assert data.process is not None and len(data.process) == 1 + assert len(data.process[0].process_list) == 2 + + assert data.partition is not None + assert len(data.partition.memory_partition) >= 1 + assert data.partition.memory_partition[0].partition_type == 
"NPS1" + + # firmware + assert data.firmware is not None and len(data.firmware) == 1 + assert len(data.firmware[0].fw_list) == 2 + + # static + assert data.static is not None and len(data.static) == 1 + s = data.static[0] + assert s.bus is not None and s.bus.max_pcie_speed is not None + assert float(s.bus.max_pcie_speed.value) == pytest.approx(16.0) + assert s.bus.pcie_interface_version == "PCIe 5.0" + + +def test_get_gpu_list(collector): + """Test GPU list parsing""" + gpu_list = collector.get_gpu_list() + assert gpu_list is not None and len(gpu_list) == 1 + assert gpu_list[0].gpu == 0 + assert gpu_list[0].bdf == "0000:0b:00.0" + assert gpu_list[0].uuid == "GPU-UUID-123" + + +def test_get_process(collector): + """Test process list parsing""" + procs = collector.get_process() + assert procs is not None and len(procs) == 1 + assert procs[0].gpu == 0 + assert len(procs[0].process_list) == 2 + + p0 = procs[0].process_list[0].process_info + assert p0.name == "python" + assert p0.pid == 4242 + assert p0.mem_usage is not None and p0.mem_usage.unit == "B" + assert p0.usage.gfx is not None and p0.usage.gfx.unit == "ns" + + p1 = procs[0].process_list[1].process_info + assert p1.name == "test" + assert p1.pid == 9999 + + +def test_get_partition(collector): + """Test partition parsing with multi-JSON output""" + p = collector.get_partition() + assert p is not None + assert len(p.memory_partition) >= 1 + assert p.memory_partition[0].partition_type == "NPS1" + + +def test_get_firmware(collector): + """Test firmware parsing""" + fw = collector.get_firmware() + assert fw is not None and len(fw) == 1 + assert fw[0].gpu == 0 + assert len(fw[0].fw_list) == 2 + assert fw[0].fw_list[0].fw_id == "SMU" + assert fw[0].fw_list[0].fw_version == "55.33" + + +def test_get_static(collector): + """Test static data parsing""" + stat = collector.get_static() + assert stat is not None and len(stat) == 1 + s = stat[0] + + # ASIC + assert s.asic.market_name == "SomeGPU" + assert s.asic.vendor_name 
== "AMD" + assert s.asic.num_compute_units == 224 + + # Board + assert s.board.amdsmi_model_number == "Board-42" + assert s.board.manufacturer_name == "ACME" + + # Bus/PCIe + assert s.bus.bdf == "0000:0b:00.0" + assert s.bus.max_pcie_width is not None + assert s.bus.max_pcie_speed is not None + + # VRAM + assert s.vram.type == "HBM3" + assert s.vram.vendor == "Micron" + + # Cache + assert s.cache_info is not None and len(s.cache_info) == 1 + cache = s.cache_info[0] + assert cache.cache_level.value == 1 + assert cache.cache_properties + + if s.clock is not None: + assert isinstance(s.clock, dict) + if "clk" in s.clock and s.clock["clk"] is not None: + assert s.clock["clk"].frequency_levels is not None + + +def test_cache_properties_parsing(collector): + """Test cache properties string parsing""" + stat = collector.get_static() + item = stat[0].cache_info[0] + assert isinstance(item.cache.value, str) and item.cache.value.startswith("Label_") + assert item.cache_properties + assert {"PropertyA", "PropertyB", "PropertyC"}.issubset(set(item.cache_properties)) + + +def test_json_parse_error(conn_mock, system_info, monkeypatch): + """Test handling of malformed JSON""" + + def mock_bad_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "version --json" in cmd: + return make_cmd_result("{ invalid json }") + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_bad_json) + + result, data = c.collect_data() + assert data is not None + assert data.version is None + assert len(result.events) > 0 + + +def test_command_error(conn_mock, system_info, monkeypatch): + """Test handling of command execution errors""" + + def mock_cmd_error(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + return make_cmd_result("", "Command 
failed", 1) + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_cmd_error) + + result, data = c.collect_data() + assert data is not None + assert data.version is None + assert data.gpu_list == [] + assert len(result.events) > 0 + + +def test_multi_json_parsing(conn_mock, system_info, monkeypatch): + """Test parsing of multiple JSON objects with trailing text""" + + def mock_multi_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "test --json" in cmd: + multi_json = ( + '[{"data": 1}]\n' + '[{"data": 2}]\n' + '[{"data": 3}]\n' + "\n\nLegend:\n * = Current mode\n" + ) + return make_cmd_result(multi_json) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_multi_json) + + result = c._run_amd_smi_dict("test") + + assert result is not None + assert isinstance(result, list) + assert len(result) == 3 + assert result[0] == [{"data": 1}] + assert result[1] == [{"data": 2}] + assert result[2] == [{"data": 3}] + + +def test_single_json_parsing(conn_mock, system_info, monkeypatch): + """Test that single JSON parsing still works""" + + def mock_single_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "version --json" in cmd: + return make_cmd_result(make_json_response([{"tool": "amdsmi", "version": "1.0"}])) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_single_json) + + result = c._run_amd_smi_dict("version") + + assert result is not None + assert isinstance(result, list) + assert len(result) == 1 + assert 
result[0]["tool"] == "amdsmi" diff --git a/test/unit/plugin/test_analyzer_args_build_from_model.py b/test/unit/plugin/test_analyzer_args_build_from_model.py new file mode 100644 index 00000000..e6eb7485 --- /dev/null +++ b/test/unit/plugin/test_analyzer_args_build_from_model.py @@ -0,0 +1,220 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +""" +Test suite for all analyzer_args build_from_model methods. +Ensures that build_from_model includes all required fields. 
+""" + +from nodescraper.plugins.inband.bios.analyzer_args import BiosAnalyzerArgs +from nodescraper.plugins.inband.bios.biosdata import BiosDataModel +from nodescraper.plugins.inband.cmdline.analyzer_args import CmdlineAnalyzerArgs +from nodescraper.plugins.inband.cmdline.cmdlinedata import CmdlineDataModel +from nodescraper.plugins.inband.device_enumeration.analyzer_args import ( + DeviceEnumerationAnalyzerArgs, +) +from nodescraper.plugins.inband.device_enumeration.deviceenumdata import ( + DeviceEnumerationDataModel, +) +from nodescraper.plugins.inband.dkms.analyzer_args import DkmsAnalyzerArgs +from nodescraper.plugins.inband.dkms.dkmsdata import DkmsDataModel +from nodescraper.plugins.inband.kernel.analyzer_args import KernelAnalyzerArgs +from nodescraper.plugins.inband.kernel.kerneldata import KernelDataModel +from nodescraper.plugins.inband.kernel_module.analyzer_args import ( + KernelModuleAnalyzerArgs, +) +from nodescraper.plugins.inband.kernel_module.kernel_module_data import ( + KernelModuleDataModel, +) +from nodescraper.plugins.inband.memory.analyzer_args import MemoryAnalyzerArgs +from nodescraper.plugins.inband.memory.memorydata import MemoryDataModel +from nodescraper.plugins.inband.os.analyzer_args import OsAnalyzerArgs +from nodescraper.plugins.inband.os.osdata import OsDataModel +from nodescraper.plugins.inband.package.analyzer_args import PackageAnalyzerArgs +from nodescraper.plugins.inband.package.packagedata import PackageDataModel +from nodescraper.plugins.inband.process.analyzer_args import ProcessAnalyzerArgs +from nodescraper.plugins.inband.process.processdata import ProcessDataModel +from nodescraper.plugins.inband.rocm.analyzer_args import RocmAnalyzerArgs +from nodescraper.plugins.inband.rocm.rocmdata import RocmDataModel +from nodescraper.plugins.inband.sysctl.analyzer_args import SysctlAnalyzerArgs +from nodescraper.plugins.inband.sysctl.sysctldata import SysctlDataModel + + +def test_package_analyzer_args_build_from_model(): + 
"""Test PackageAnalyzerArgs.build_from_model includes all fields""" + datamodel = PackageDataModel(version_info={"package1": "1.0.0", "package2": "2.0.0"}) + args = PackageAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, PackageAnalyzerArgs) + assert args.exp_package_ver == {"package1": "1.0.0", "package2": "2.0.0"} + + +def test_device_enumeration_analyzer_args_build_from_model(): + """Test DeviceEnumerationAnalyzerArgs.build_from_model includes all fields""" + datamodel = DeviceEnumerationDataModel(cpu_count=2, gpu_count=8, vf_count=0) + args = DeviceEnumerationAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, DeviceEnumerationAnalyzerArgs) + assert args.cpu_count == [2] + assert args.gpu_count == [8] + assert args.vf_count == [0] + + +def test_device_enumeration_analyzer_args_build_from_model_with_none(): + """Test DeviceEnumerationAnalyzerArgs.build_from_model with None values""" + datamodel = DeviceEnumerationDataModel(cpu_count=None, gpu_count=4, vf_count=None) + args = DeviceEnumerationAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, DeviceEnumerationAnalyzerArgs) + assert args.cpu_count is None + assert args.gpu_count == [4] + assert args.vf_count is None + + +def test_kernel_analyzer_args_build_from_model(): + """Test KernelAnalyzerArgs.build_from_model includes all fields""" + datamodel = KernelDataModel( + kernel_info="Linux hostname 5.15.0-56-generic #62-Ubuntu SMP x86_64 GNU/Linux", + kernel_version="5.15.0-56-generic", + ) + args = KernelAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, KernelAnalyzerArgs) + assert args.exp_kernel == ["5.15.0-56-generic"] + + +def test_rocm_analyzer_args_build_from_model(): + """Test RocmAnalyzerArgs.build_from_model includes all fields""" + datamodel = RocmDataModel(rocm_version="5.4.0", rocm_latest_versioned_path="/opt/rocm-5.4.0") + args = RocmAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, RocmAnalyzerArgs) + assert 
args.exp_rocm == ["5.4.0"] + assert args.exp_rocm_latest == "/opt/rocm-5.4.0" + + +def test_os_analyzer_args_build_from_model(): + """Test OsAnalyzerArgs.build_from_model includes all fields""" + datamodel = OsDataModel(os_name="Ubuntu 22.04", os_version="22.04") + args = OsAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, OsAnalyzerArgs) + assert args.exp_os == ["Ubuntu 22.04"] + + +def test_bios_analyzer_args_build_from_model(): + """Test BiosAnalyzerArgs.build_from_model includes all fields""" + datamodel = BiosDataModel(bios_version="1.2.3") + args = BiosAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, BiosAnalyzerArgs) + assert args.exp_bios_version == ["1.2.3"] + + +def test_cmdline_analyzer_args_build_from_model(): + """Test CmdlineAnalyzerArgs.build_from_model includes all fields""" + datamodel = CmdlineDataModel(cmdline="iommu=pt intel_iommu=on") + args = CmdlineAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, CmdlineAnalyzerArgs) + assert args.required_cmdline == ["iommu=pt intel_iommu=on"] + + +def test_dkms_analyzer_args_build_from_model(): + """Test DkmsAnalyzerArgs.build_from_model includes all fields""" + datamodel = DkmsDataModel(status="installed", version="6.8.5-6.8.5") + args = DkmsAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, DkmsAnalyzerArgs) + assert args.dkms_status == ["installed"] + assert args.dkms_version == ["6.8.5-6.8.5"] + + +def test_sysctl_analyzer_args_build_from_model(): + """Test SysctlAnalyzerArgs.build_from_model includes all fields""" + datamodel = SysctlDataModel( + vm_swappiness=60, + vm_numa_balancing=1, + vm_oom_kill_allocating_task=0, + vm_compaction_proactiveness=20, + vm_compact_unevictable_allowed=1, + vm_extfrag_threshold=500, + vm_zone_reclaim_mode=0, + vm_dirty_background_ratio=10, + vm_dirty_ratio=20, + vm_dirty_writeback_centisecs=500, + kernel_numa_balancing=1, + ) + args = SysctlAnalyzerArgs.build_from_model(datamodel) + + assert 
isinstance(args, SysctlAnalyzerArgs) + assert args.exp_vm_swappiness == 60 + assert args.exp_vm_numa_balancing == 1 + assert args.exp_vm_oom_kill_allocating_task == 0 + assert args.exp_vm_compaction_proactiveness == 20 + assert args.exp_vm_compact_unevictable_allowed == 1 + assert args.exp_vm_extfrag_threshold == 500 + assert args.exp_vm_zone_reclaim_mode == 0 + assert args.exp_vm_dirty_background_ratio == 10 + assert args.exp_vm_dirty_ratio == 20 + assert args.exp_vm_dirty_writeback_centisecs == 500 + assert args.exp_kernel_numa_balancing == 1 + + +def test_process_analyzer_args_build_from_model(): + """Test ProcessAnalyzerArgs.build_from_model includes all fields""" + datamodel = ProcessDataModel(kfd_process=5, cpu_usage=15.5) + args = ProcessAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, ProcessAnalyzerArgs) + assert args.max_kfd_processes == 5 + assert args.max_cpu_usage == 15.5 + + +def test_kernel_module_analyzer_args_build_from_model(): + """Test KernelModuleAnalyzerArgs.build_from_model includes all fields""" + datamodel = KernelModuleDataModel( + kernel_modules={ + "amdgpu": {"size": 1024, "used": 0}, + "amd_iommu": {"size": 512, "used": 1}, + "other_module": {"size": 256, "used": 0}, + } + ) + args = KernelModuleAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, KernelModuleAnalyzerArgs) + assert "amdgpu" in args.kernel_modules + assert "amd_iommu" in args.kernel_modules + assert "other_module" not in args.kernel_modules + assert args.regex_filter == [] + + +def test_memory_analyzer_args_build_from_model(): + """Test MemoryAnalyzerArgs.build_from_model includes all fields""" + datamodel = MemoryDataModel(mem_free="128Gi", mem_total="256Gi") + args = MemoryAnalyzerArgs.build_from_model(datamodel) + + assert isinstance(args, MemoryAnalyzerArgs) + assert args.memory_threshold == "256Gi" diff --git a/test/unit/plugin/test_device_enumeration_analyzer.py b/test/unit/plugin/test_device_enumeration_analyzer.py new file mode 
100644 index 00000000..c58c9ea0 --- /dev/null +++ b/test/unit/plugin/test_device_enumeration_analyzer.py @@ -0,0 +1,127 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import pytest + +from nodescraper.enums.eventcategory import EventCategory +from nodescraper.enums.eventpriority import EventPriority +from nodescraper.enums.executionstatus import ExecutionStatus +from nodescraper.models.systeminfo import OSFamily +from nodescraper.plugins.inband.device_enumeration.analyzer_args import ( + DeviceEnumerationAnalyzerArgs, +) +from nodescraper.plugins.inband.device_enumeration.device_enumeration_analyzer import ( + DeviceEnumerationAnalyzer, +) +from nodescraper.plugins.inband.device_enumeration.deviceenumdata import ( + DeviceEnumerationDataModel, +) + + +@pytest.fixture +def device_enumeration_analyzer(system_info): + return DeviceEnumerationAnalyzer(system_info=system_info) + + +@pytest.fixture +def device_enumeration_data(): + return DeviceEnumerationDataModel(cpu_count=4, gpu_count=4, vf_count=8) + + +def test_analyze_passing_linux(system_info, device_enumeration_analyzer, device_enumeration_data): + """Test a normal passing case with matching config""" + system_info.os_family = OSFamily.LINUX + + args = DeviceEnumerationAnalyzerArgs(cpu_count=4, gpu_count=4, vf_count=8) + + result = device_enumeration_analyzer.analyze_data(data=device_enumeration_data, args=args) + + assert result.status == ExecutionStatus.OK + assert len(result.events) == 0 + + +def test_analyze_passing_windows(system_info, device_enumeration_analyzer, device_enumeration_data): + """Test a normal passing case on Windows""" + system_info.os_family = OSFamily.WINDOWS + + args = DeviceEnumerationAnalyzerArgs(gpu_count=4, vf_count=8) + + result = device_enumeration_analyzer.analyze_data(data=device_enumeration_data, args=args) + + assert result.status == ExecutionStatus.OK + assert len(result.events) == 0 + + +def test_analyze_no_args(device_enumeration_analyzer, device_enumeration_data): + """Test with no analyzer args provided - should skip analysis""" + + result = 
device_enumeration_analyzer.analyze_data(data=device_enumeration_data, args=None) + + assert result.status == ExecutionStatus.NOT_RAN + assert "Expected Device Enumeration data not provided, skipping analysis." in result.message + assert len(result.events) == 0 + + +def test_analyze_unexpected_counts(device_enumeration_analyzer, device_enumeration_data): + """Test with config specifying different device counts""" + + args = DeviceEnumerationAnalyzerArgs(cpu_count=1, gpu_count=10) + + result = device_enumeration_analyzer.analyze_data(data=device_enumeration_data, args=args) + + assert result.status == ExecutionStatus.ERROR + assert "but got" in result.message + + for event in result.events: + assert event.priority == EventPriority.CRITICAL + assert event.category == EventCategory.PLATFORM.value + + +def test_analyze_mismatched_cpu_count(device_enumeration_analyzer): + """Test with invalid device enumeration on SUT""" + + data = DeviceEnumerationDataModel(cpu_count=5, gpu_count=4, vf_count=8) + args = DeviceEnumerationAnalyzerArgs(cpu_count=4, gpu_count=4) + + result = device_enumeration_analyzer.analyze_data(data=data, args=args) + + assert result.status == ExecutionStatus.ERROR + assert "but got" in result.message + + for event in result.events: + assert event.priority == EventPriority.CRITICAL + assert event.category == EventCategory.PLATFORM.value + + +def test_analyze_list_of_accepted_counts(device_enumeration_analyzer): + """Test with a list of acceptable counts""" + + data = DeviceEnumerationDataModel(cpu_count=4, gpu_count=4, vf_count=8) + args = DeviceEnumerationAnalyzerArgs(cpu_count=[2, 4, 8], gpu_count=[4, 8]) + + result = device_enumeration_analyzer.analyze_data(data=data, args=args) + + assert result.status == ExecutionStatus.OK + assert len(result.events) == 0 diff --git a/test/unit/plugin/test_device_enumeration_collector.py b/test/unit/plugin/test_device_enumeration_collector.py new file mode 100644 index 00000000..795611a6 --- /dev/null +++ 
b/test/unit/plugin/test_device_enumeration_collector.py @@ -0,0 +1,164 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from unittest.mock import MagicMock + +import pytest + +from nodescraper.enums.executionstatus import ExecutionStatus +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.models.systeminfo import OSFamily +from nodescraper.plugins.inband.device_enumeration.device_enumeration_collector import ( + DeviceEnumerationCollector, +) +from nodescraper.plugins.inband.device_enumeration.deviceenumdata import ( + DeviceEnumerationDataModel, +) + + +@pytest.fixture +def device_enumeration_collector(system_info, conn_mock): + return DeviceEnumerationCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + + +def test_collect_linux(system_info, device_enumeration_collector): + """Test linux typical output""" + system_info.os_family = OSFamily.LINUX + + lscpu_output = "Architecture: x86_64\nCPU(s): 64\nSocket(s): 2" + lshw_output = "*-cpu\n product: AMD EPYC 1234 64-Core Processor" + + device_enumeration_collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock( + exit_code=0, + stdout=lscpu_output, + stderr="", + command="lscpu", + ), + MagicMock( + exit_code=0, + stdout="8", + stderr="", + command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | wc -l", + ), + MagicMock( + exit_code=0, + stdout="0", + stderr="", + command="lspci -d 1002: | grep -i 'Virtual Function' | wc -l", + ), + MagicMock( + exit_code=0, + stdout=lshw_output, + stderr="", + command="lshw", + ), + ] + ) + + result, data = device_enumeration_collector.collect_data() + assert result.status == ExecutionStatus.OK + assert data == DeviceEnumerationDataModel( + cpu_count=2, gpu_count=8, vf_count=0, lscpu_output=lscpu_output, lshw_output=lshw_output + ) + assert ( + len([a for a in result.artifacts if hasattr(a, "filename") and a.filename == "lshw.txt"]) + == 1 + ) + + +def test_collect_windows(system_info, 
device_enumeration_collector): + """Test windows typical output""" + system_info.os_family = OSFamily.WINDOWS + + device_enumeration_collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock( + exit_code=0, + stdout="2", + stderr="", + command='powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"', + ), + MagicMock( + exit_code=0, + stdout="8", + stderr="", + command='powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"', + ), + MagicMock( + exit_code=0, + stdout="8", + stderr="", + command='powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count"', + ), + ] + ) + + result, data = device_enumeration_collector.collect_data() + assert result.status == ExecutionStatus.OK + assert data == DeviceEnumerationDataModel(cpu_count=2, gpu_count=8, vf_count=8) + + +def test_collect_error(system_info, device_enumeration_collector): + """Test with bad exit code""" + system_info.os_family = OSFamily.LINUX + + device_enumeration_collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock( + exit_code=1, + stdout="", + stderr="command failed", + command="lscpu", + ), + MagicMock( + exit_code=1, + stdout="some output", + stderr="command failed", + command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | wc -l", + ), + MagicMock( + exit_code=1, + stdout="some output", + stderr="command failed", + command="lspci -d 1002: | grep -i 'Virtual Function' | wc -l", + ), + MagicMock( + exit_code=1, + stdout="", + stderr="command failed", + command="lshw", + ), + ] + ) + + result, data = device_enumeration_collector.collect_data() + assert result.status == ExecutionStatus.EXECUTION_FAILURE + assert data is None diff --git a/test/unit/plugin/test_dimms_collector.py b/test/unit/plugin/test_dimms_collector.py index ac7aa98d..eeaa15ff 100644 --- a/test/unit/plugin/test_dimms_collector.py +++ b/test/unit/plugin/test_dimms_collector.py @@ -68,10 +68,16 @@ def test_run_linux(collector, 
system_info): system_info.os_family = OSFamily.LINUX collector._run_sut_cmd = MagicMock( - return_value=MagicMock( - exit_code=0, - stdout="Size: 64 GB\nSize: 64 GB\nSize: 128 GB\n", - ) + side_effect=[ + MagicMock( + exit_code=0, + stdout="Full dmidecode output...", + ), + MagicMock( + exit_code=0, + stdout="Size: 64 GB\nSize: 64 GB\nSize: 128 GB\n", + ), + ] ) result, data = collector.collect_data() @@ -84,15 +90,23 @@ def test_run_linux_error(collector, system_info): system_info.os_family = OSFamily.LINUX collector._run_sut_cmd = MagicMock( - return_value=MagicMock( - exit_code=1, - stderr="Error occurred", - ) + side_effect=[ + MagicMock( + exit_code=1, + stderr="Error occurred", + command="dmidecode", + ), + MagicMock( + exit_code=1, + stderr="Error occurred", + command="sh -c 'dmidecode -t 17 | ...'", + ), + ] ) result, data = collector.collect_data() assert result.status == ExecutionStatus.ERROR assert data is None - assert result.events[0].category == EventCategory.OS.value - assert result.events[0].description == "Error checking dimms" + assert result.events[1].category == EventCategory.OS.value + assert result.events[1].description == "Error checking dimms" diff --git a/test/unit/plugin/test_dmesg_collector.py b/test/unit/plugin/test_dmesg_collector.py index d3c4553a..4202c0f9 100644 --- a/test/unit/plugin/test_dmesg_collector.py +++ b/test/unit/plugin/test_dmesg_collector.py @@ -32,6 +32,7 @@ from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.interfaces.task import SystemCompatibilityError from nodescraper.models.systeminfo import OSFamily +from nodescraper.plugins.inband.dmesg.collector_args import DmesgCollectorArgs from nodescraper.plugins.inband.dmesg.dmesg_collector import DmesgCollector from nodescraper.plugins.inband.dmesg.dmesgdata import DmesgData @@ -276,3 +277,27 @@ def run_map(cmd, **kwargs): assert isinstance(data, DmesgData) assert data.dmesg_content == "DMESG OUTPUT\n" + + +def 
test_collect_data_with_args(conn_mock, system_info): + """Test collect_data accepts DmesgCollectorArgs""" + dmesg = "2023-06-01T01:00:00,685236-05:00 test message1\n" + conn_mock.run_command.return_value = CommandArtifact( + exit_code=0, + stdout=dmesg, + stderr="", + command="dmesg --time-format iso", + ) + + collector = DmesgCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.INTERACTIVE, + connection=conn_mock, + ) + + args = DmesgCollectorArgs(log_dmesg_data=False) + res, data = collector.collect_data(args=args) + + assert res.status == ExecutionStatus.OK + assert data is not None + assert data.dmesg_content == dmesg diff --git a/test/unit/plugin/test_kernel_analyzer.py b/test/unit/plugin/test_kernel_analyzer.py index 7d9c7d0a..81785abd 100644 --- a/test/unit/plugin/test_kernel_analyzer.py +++ b/test/unit/plugin/test_kernel_analyzer.py @@ -35,7 +35,10 @@ @pytest.fixture def model_obj(): - return KernelDataModel(kernel_version="5.13.0-30-generic") + return KernelDataModel( + kernel_info="Linux MockSystem 5.13.0-30-generic #1 XYZ Day Month 10 15:19:13 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux", + kernel_version="5.13.0-30-generic", + ) @pytest.fixture @@ -118,14 +121,14 @@ def test_invalid_kernel_config(system_info, model_obj, config): def test_match_regex(system_info, model_obj): - args = KernelAnalyzerArgs(exp_kernel=[r"5.13.\d-\d+-[\w]+"], regex_match=True) + args = KernelAnalyzerArgs(exp_kernel=[r".*5\.13\.\d+-\d+-[\w-]+.*"], regex_match=True) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) assert result.status == ExecutionStatus.OK def test_mismatch_regex(system_info, model_obj): - args = KernelAnalyzerArgs(exp_kernel=[r"4.3.\d-\d+-[\w]+"], regex_match=True) + args = KernelAnalyzerArgs(exp_kernel=[r".*4\.13\.\d+-\d+-[\w-]+.*"], regex_match=True) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) diff --git a/test/unit/plugin/test_kernel_collector.py 
b/test/unit/plugin/test_kernel_collector.py index d9261ad9..b1f26257 100644 --- a/test/unit/plugin/test_kernel_collector.py +++ b/test/unit/plugin/test_kernel_collector.py @@ -53,7 +53,9 @@ def test_run_windows(collector, conn_mock): result, data = collector.collect_data() - assert data == KernelDataModel(kernel_version="10.0.19041.1237") + assert data == KernelDataModel( + kernel_info="Version=10.0.19041.1237", kernel_version="10.0.19041.1237" + ) assert result.status == ExecutionStatus.OK @@ -61,14 +63,17 @@ def test_run_linux(collector, conn_mock): collector.system_info.os_family = OSFamily.LINUX conn_mock.run_command.return_value = CommandArtifact( exit_code=0, - stdout="5.4.0-88-generic", + stdout="Linux MockSystem 5.13.0-30-generic #1 XYZ Day Month 10 15:19:13 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux", stderr="", - command="sh -c 'uname -r'", + command="sh -c 'uname -a'", ) result, data = collector.collect_data() - assert data == KernelDataModel(kernel_version="5.4.0-88-generic") + assert data == KernelDataModel( + kernel_info="Linux MockSystem 5.13.0-30-generic #1 XYZ Day Month 10 15:19:13 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux", + kernel_version="5.13.0-30-generic", + ) assert result.status == ExecutionStatus.OK diff --git a/test/unit/plugin/test_kernel_module_analyzer.py b/test/unit/plugin/test_kernel_module_analyzer.py index 0d824379..032b7833 100644 --- a/test/unit/plugin/test_kernel_module_analyzer.py +++ b/test/unit/plugin/test_kernel_module_analyzer.py @@ -1,3 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### import pytest from nodescraper.enums.eventcategory import EventCategory diff --git a/test/unit/plugin/test_kernel_module_collector.py b/test/unit/plugin/test_kernel_module_collector.py index 65ffae6e..9d4685a8 100644 --- a/test/unit/plugin/test_kernel_module_collector.py +++ b/test/unit/plugin/test_kernel_module_collector.py @@ -1,3 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + from types import SimpleNamespace import pytest @@ -8,9 +34,96 @@ ) from nodescraper.plugins.inband.kernel_module.kernel_module_data import ( KernelModuleDataModel, + ModuleInfo, ) +@pytest.fixture +def modinfo_amdgpu_full(): + """Fixture providing comprehensive dummy modinfo amdgpu output for testing.""" + return """filename: /lib/modules/1.2.3-test-kernel/extra/amdgpu_test.ko.xz +version: 99.88.77 +license: GPL and additional rights +description: AMD GPU Test Module +author: Test Author One +firmware: amdgpu/test_gpu_info_v1.bin +firmware: amdgpu/test_gpu_info_v2.bin +firmware: amdgpu/dummy_gpu_info_a.bin +firmware: amdgpu/dummy_gpu_info_b.bin +firmware: amdgpu/fake_gpu_info_c.bin +firmware: amdgpu/sample_gpu_info_d.bin +firmware: amdgpu/example_gpu_info_e.bin +firmware: amdgpu/mock_gpu_info_f.bin +firmware: amdgpu/test_gpu_info_g.bin +firmware: amdgpu/dummy_gpu_info_h.bin +firmware: amdgpu/fake_gpu_info_i.bin +firmware: amdgpu/sample_gpu_info_j.bin +srcversion: ABCD1234567890TESTVER +depends: dummy_dep1,dummy_dep2,test_dep3,fake_dep4,sample_dep5,example_dep6,mock_dep7 +retpoline: Y +intree: Y +name: amdgpu +vermagic: 1.2.3-test-kernel SMP mod_unload modversions +sig_id: TEST#99 +signer: Test Signing Authority (dummy key 1) +sig_key: 00:11:22:33:44:55:66:77:88:99:AA:BB:CC:DD:EE:FF:00:11:22:33 +sig_hashalgo: test256 +parm: test_param1:Test parameter description one (int) +parm: test_param2:Test parameter description two (int) +parm: dummy_param3:Dummy parameter description (uint) +parm: fake_param4:Fake parameter description (int) +parm: sample_param5:Sample parameter description (int) +parm: example_param6:Example parameter (int) +parm: mock_param7:Mock parameter (int) +parm: test_audio:Test audio parameter (int) +parm: dummy_priority:Dummy priority parameter (int) +parm: fake_i2c:Fake i2c parameter (int) +parm: sample_gen_cap:Sample gen caps parameter (uint) +parm: 
example_msi:Example MSI parameter (int) +parm: mock_timeout:Mock timeout parameter (string) +parm: test_dpm:Test DPM parameter (int) +parm: dummy_fw_load:Dummy firmware load parameter (int) +parm: fake_aspm:Fake ASPM parameter (int) +parm: sample_runpm:Sample runtime PM parameter (int) +parm: example_ip_mask:Example IP block mask (uint) +parm: mock_bapm:Mock BAPM parameter (int) +parm: test_deep_color:Test deep color parameter (int) +parm: dummy_vm_size:Dummy VM size parameter (uint) +parm: fake_vm_fragment:Fake VM fragment parameter (uint) +parm: sample_vm_block:Sample VM block parameter (uint) +parm: example_vm_fault:Example VM fault parameter (int) +parm: mock_vm_debug:Mock VM debug parameter (int) +parm: test_vm_update:Test VM update parameter (int) +parm: dummy_exp_hw:Dummy experimental HW parameter (int) +parm: fake_dc:Fake display core parameter (int) +parm: sample_sched_jobs:Sample scheduler jobs parameter (int) +parm: example_sched_hw:Example scheduler HW parameter (int) +parm: mock_ppfeaturemask:Mock power feature mask (uint) +parm: test_longtraining:Test long training parameter (bool) +parm: dummy_pcie_gen2:Dummy PCIe gen2 parameter (int) +parm: fake_mst:Fake MST parameter (int) +parm: sample_mcbp:Sample MCBP parameter (int) +parm: example_disable_cu:Example disable CU parameter (charp) +parm: mock_sched_policy:Mock scheduler policy (int) +parm: test_hws_max_proc:Test HWS max processes (int) +parm: dummy_cwsr_enable:Dummy CWSR enable parameter (int) +parm: fake_max_queues:Fake max queues parameter (int) +parm: sample_send_sigterm:Sample send sigterm parameter (int) +parm: example_debug_largebar:Example debug largebar parameter (int) +parm: mock_ignore_crat:Mock ignore CRAT parameter (int) +parm: test_halt_hws_hang:Test halt HWS hang parameter (int) +parm: dummy_hws_gws:Dummy HWS GWS parameter (bool) +parm: fake_queue_preempt_timeout:Fake queue preemption timeout (int) +parm: sample_dcfeaturemask:Sample DC feature mask (uint) +parm: 
example_dcdebugmask:Example DC debug mask (uint) +parm: mock_abmlevel:Mock ABM level (uint) +parm: test_tmz:Test TMZ parameter (int) +parm: dummy_reset_method:Dummy reset method (int) +parm: fake_bad_page_threshold:Fake bad page threshold (int) +parm: sample_num_kcq:Sample number of KCQ (int) +""" + + @pytest.fixture def linux_collector(system_info, conn_mock): system_info.os_family = OSFamily.LINUX @@ -70,6 +183,7 @@ def test_collect_data_linux_success(linux_collector): seq = [ make_artifact("cat /proc/modules", 0, "m1 0 0 - Live\n"), make_artifact("ls /sys/module/m1/parameters", 1, ""), + make_artifact("modinfo amdgpu", 1, ""), ] linux_collector._run_sut_cmd = lambda cmd, seq=seq: seq.pop(0) @@ -77,11 +191,11 @@ def test_collect_data_linux_success(linux_collector): assert result.status == ExecutionStatus.OK assert isinstance(data, KernelModuleDataModel) - evt = result.events[-1] - assert evt.category == "KERNEL_READ" + evt = [e for e in result.events if e.category == "KERNEL_READ"][-1] assert evt.priority == EventPriority.INFO.value - assert result.message == "1 kernel modules collected" + assert "1 kernel modules collected" in result.message assert data.kernel_modules == {"m1": {"parameters": {}}} + assert data.amdgpu_modinfo is None def test_collect_data_linux_error(linux_collector): @@ -114,3 +228,226 @@ def test_collect_data_windows_not_found(win_collector): result, data = win_collector.collect_data() assert result.status == ExecutionStatus.ERROR assert data is None + + +def test_parse_modinfo_empty(linux_collector): + """Test parsing of empty modinfo output.""" + result = linux_collector._parse_modinfo("") + assert result is None + + +def test_parse_modinfo_basic(linux_collector): + """Test parsing of basic modinfo output.""" + modinfo_output = """filename: /lib/modules/1.0.0-test/extra/amdgpu_dummy.ko.xz +version: 10.20.30 +license: GPL and additional rights +description: AMD GPU Test Module +author: Test Developer +srcversion: ABC123DEF456TEST789 +depends: 
test_dep1,test_dep2,test_dep3 +name: amdgpu +vermagic: 1.0.0-test SMP mod_unload modversions +sig_id: TEST#1 +signer: test_signer +""" + result = linux_collector._parse_modinfo(modinfo_output) + + assert result is not None + assert isinstance(result, ModuleInfo) + assert result.filename == "/lib/modules/1.0.0-test/extra/amdgpu_dummy.ko.xz" + assert result.version == "10.20.30" + assert result.license == "GPL and additional rights" + assert result.description == "AMD GPU Test Module" + assert result.author == ["Test Developer"] + assert result.srcversion == "ABC123DEF456TEST789" + assert result.depends == ["test_dep1", "test_dep2", "test_dep3"] + assert result.name == "amdgpu" + assert result.vermagic == "1.0.0-test SMP mod_unload modversions" + assert result.sig_id == "TEST#1" + assert result.signer == "test_signer" + + +def test_parse_modinfo_with_parameters(linux_collector): + """Test parsing of modinfo output with parameters.""" + modinfo_output = """filename: /lib/modules/amdgpu_test.ko +name: amdgpu +parm: test_limit:Test limit parameter description (int) +parm: dummy_size:Dummy size parameter description (uint) +parm: fake_enable:Fake enable parameter description (int) +""" + result = linux_collector._parse_modinfo(modinfo_output) + + assert result is not None + assert len(result.parm) == 3 + + assert result.parm[0].name == "test_limit" + assert result.parm[0].type == "int" + assert result.parm[0].description == "Test limit parameter description" + + assert result.parm[1].name == "dummy_size" + assert result.parm[1].type == "uint" + assert result.parm[1].description == "Dummy size parameter description" + + assert result.parm[2].name == "fake_enable" + assert result.parm[2].type == "int" + assert result.parm[2].description == "Fake enable parameter description" + + +def test_parse_modinfo_with_firmware(linux_collector): + """Test parsing of modinfo output with firmware entries.""" + modinfo_output = """filename: /lib/modules/amdgpu_test.ko +name: amdgpu 
+firmware: amdgpu/test_firmware_v1.bin +firmware: amdgpu/dummy_firmware_v2.bin +firmware: amdgpu/fake_firmware_v3.bin +""" + result = linux_collector._parse_modinfo(modinfo_output) + + assert result is not None + assert len(result.firmware) == 3 + assert "amdgpu/test_firmware_v1.bin" in result.firmware + assert "amdgpu/dummy_firmware_v2.bin" in result.firmware + assert "amdgpu/fake_firmware_v3.bin" in result.firmware + + +def test_parse_modinfo_multiple_authors(linux_collector): + """Test parsing of modinfo output with multiple authors.""" + modinfo_output = """filename: /lib/modules/test.ko +author: Test Author One +author: Test Author Two +author: Test Author Three +""" + result = linux_collector._parse_modinfo(modinfo_output) + + assert result is not None + assert len(result.author) == 3 + assert result.author == ["Test Author One", "Test Author Two", "Test Author Three"] + + +def test_collect_data_with_modinfo(linux_collector): + """Test collect_data includes parsed modinfo data.""" + modinfo_output = """filename: /lib/modules/amdgpu_test.ko +version: 1.2.3 +name: amdgpu +""" + + seq = [ + make_artifact("cat /proc/modules", 0, "m1 0 0 - Live\n"), + make_artifact("ls /sys/module/m1/parameters", 1, ""), + make_artifact("modinfo amdgpu", 0, modinfo_output), + ] + linux_collector._run_sut_cmd = lambda cmd, seq=seq: seq.pop(0) + + result, data = linux_collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert isinstance(data, KernelModuleDataModel) + assert data.amdgpu_modinfo is not None + assert data.amdgpu_modinfo.version == "1.2.3" + assert data.amdgpu_modinfo.name == "amdgpu" + assert len(result.artifacts) == 1 + assert result.artifacts[0].filename == "modinfo_amdgpu.txt" + + +def test_collect_data_modinfo_not_available(linux_collector): + """Test collect_data when modinfo amdgpu fails.""" + seq = [ + make_artifact("cat /proc/modules", 0, "m1 0 0 - Live\n"), + make_artifact("ls /sys/module/m1/parameters", 1, ""), + make_artifact("modinfo 
amdgpu", 1, ""), + ] + linux_collector._run_sut_cmd = lambda cmd, seq=seq: seq.pop(0) + + result, data = linux_collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert isinstance(data, KernelModuleDataModel) + assert data.amdgpu_modinfo is None + assert len(result.artifacts) == 0 + warning_events = [e for e in result.events if e.priority == EventPriority.WARNING.value] + assert len(warning_events) > 0 + assert any("Could not collect modinfo amdgpu" in e.description for e in warning_events) + + +def test_parse_modinfo_comprehensive(linux_collector, modinfo_amdgpu_full): + """Test parsing of comprehensive dummy modinfo amdgpu output.""" + result = linux_collector._parse_modinfo(modinfo_amdgpu_full) + + assert result is not None + assert isinstance(result, ModuleInfo) + + assert result.filename == "/lib/modules/1.2.3-test-kernel/extra/amdgpu_test.ko.xz" + assert result.version == "99.88.77" + assert result.license == "GPL and additional rights" + assert result.description == "AMD GPU Test Module" + assert result.author == ["Test Author One"] + assert result.srcversion == "ABCD1234567890TESTVER" + + assert len(result.firmware) == 12 + assert "amdgpu/test_gpu_info_v1.bin" in result.firmware + assert "amdgpu/dummy_gpu_info_a.bin" in result.firmware + assert "amdgpu/fake_gpu_info_i.bin" in result.firmware + + expected_depends = [ + "dummy_dep1", + "dummy_dep2", + "test_dep3", + "fake_dep4", + "sample_dep5", + "example_dep6", + "mock_dep7", + ] + assert result.depends == expected_depends + + assert result.name == "amdgpu" + assert result.vermagic == "1.2.3-test-kernel SMP mod_unload modversions" + assert result.sig_id == "TEST#99" + assert result.signer == "Test Signing Authority (dummy key 1)" + assert result.sig_key == "00:11:22:33:44:55:66:77:88:99:AA:BB:CC:DD:EE:FF:00:11:22:33" + assert result.sig_hashalgo == "test256" + + assert len(result.parm) == 53 + + test_param1 = next((p for p in result.parm if p.name == "test_param1"), None) + assert 
test_param1 is not None + assert test_param1.type == "int" + assert test_param1.description == "Test parameter description one" + + test_dpm = next((p for p in result.parm if p.name == "test_dpm"), None) + assert test_dpm is not None + assert test_dpm.type == "int" + assert test_dpm.description == "Test DPM parameter" + + test_longtraining = next((p for p in result.parm if p.name == "test_longtraining"), None) + assert test_longtraining is not None + assert test_longtraining.type == "bool" + assert "Test long training" in test_longtraining.description + + example_disable_cu = next((p for p in result.parm if p.name == "example_disable_cu"), None) + assert example_disable_cu is not None + assert example_disable_cu.type == "charp" + assert "Example disable CU" in example_disable_cu.description + + +def test_collect_data_with_full_modinfo(linux_collector, modinfo_amdgpu_full): + """Test collect_data with comprehensive dummy modinfo data.""" + seq = [ + make_artifact("cat /proc/modules", 0, "amdgpu 16384 0 - Live\n"), + make_artifact("ls /sys/module/amdgpu/parameters", 1, ""), + make_artifact("modinfo amdgpu", 0, modinfo_amdgpu_full), + ] + linux_collector._run_sut_cmd = lambda cmd, seq=seq: seq.pop(0) + + result, data = linux_collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert isinstance(data, KernelModuleDataModel) + assert data.amdgpu_modinfo is not None + assert data.amdgpu_modinfo.version == "99.88.77" + assert data.amdgpu_modinfo.name == "amdgpu" + assert len(data.amdgpu_modinfo.firmware) == 12 + assert len(data.amdgpu_modinfo.parm) == 53 + assert len(data.amdgpu_modinfo.depends) == 7 + assert len(result.artifacts) == 1 + assert result.artifacts[0].filename == "modinfo_amdgpu.txt" + assert result.artifacts[0].contents == modinfo_amdgpu_full diff --git a/test/unit/plugin/test_memory_collector.py b/test/unit/plugin/test_memory_collector.py index 7b194bb4..dfdc53d7 100644 --- a/test/unit/plugin/test_memory_collector.py +++ 
b/test/unit/plugin/test_memory_collector.py @@ -31,7 +31,6 @@ from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.models.systeminfo import OSFamily from nodescraper.plugins.inband.memory.memory_collector import MemoryCollector -from nodescraper.plugins.inband.memory.memorydata import MemoryDataModel @pytest.fixture @@ -44,24 +43,73 @@ def collector(system_info, conn_mock): def test_run_linux(collector, conn_mock): - conn_mock.run_command.return_value = CommandArtifact( - exit_code=0, - stdout=( - " total used free shared buff/cache available\n" - "Mem: 2164113772544 31750934528 2097459761152 893313024 34903076864 2122320150528\n" - "Swap: 8589930496 0 8589930496" - ), - stderr="", - command="free -h", - ) + def mock_run_command(command, **kwargs): + if "free" in command: + return CommandArtifact( + exit_code=0, + stdout=( + " total used free shared buff/cache available\n" + "Mem: 2164113772544 31750934528 2097459761152 893313024 34903076864 2122320150528\n" + "Swap: 8589930496 0 8589930496" + ), + stderr="", + command="free -b", + ) + elif "lsmem" in command: + return CommandArtifact( + exit_code=0, + stdout=( + "RANGE SIZE STATE REMOVABLE BLOCK\n" + "0x0000000000000000-0x000000007fffffff 2G online yes 0-15\n" + "0x0000000100000000-0x000000207fffffff 126G online yes 32-2047\n" + "\n" + "Memory block size: 128M\n" + "Total online memory: 128G\n" + "Total offline memory: 0B\n" + ), + stderr="", + command="lsmem", + ) + elif "numactl" in command: + return CommandArtifact( + exit_code=0, + stdout=( + "available: 2 nodes (0-1)\n" + "node 0 cpus: 0 1 2 3 4 5 6 7\n" + "node 0 size: 32768 MB\n" + "node 0 free: 16384 MB\n" + "node 1 cpus: 8 9 10 11 12 13 14 15\n" + "node 1 size: 32768 MB\n" + "node 1 free: 20000 MB\n" + "node distances:\n" + "node 0 1\n" + " 0: 10 21\n" + " 1: 21 10" + ), + stderr="", + command="numactl -H", + ) + return CommandArtifact(exit_code=1, stdout="", stderr="", command=command) + + 
conn_mock.run_command.side_effect = mock_run_command result, data = collector.collect_data() assert result.status == ExecutionStatus.OK - assert data == MemoryDataModel( - mem_free="2097459761152", - mem_total="2164113772544", - ) + assert data.mem_free == "2097459761152" + assert data.mem_total == "2164113772544" + assert data.lsmem_data is not None + assert len(data.lsmem_data.memory_blocks) == 2 + assert data.lsmem_data.memory_blocks[0].range == "0x0000000000000000-0x000000007fffffff" + assert data.lsmem_data.memory_blocks[0].size == "2G" + assert data.lsmem_data.memory_blocks[0].state == "online" + assert data.lsmem_data.summary.memory_block_size == "128M" + assert data.lsmem_data.summary.total_online_memory == "128G" + assert data.numa_topology is not None + assert len(data.numa_topology.nodes) == 2 + assert data.numa_topology.nodes[0].node_id == 0 + assert data.numa_topology.nodes[0].memory_size_mb == 32768 + assert data.numa_topology.distance_matrix[0][1] == 21 def test_run_windows(collector, conn_mock): @@ -76,10 +124,54 @@ def test_run_windows(collector, conn_mock): result, data = collector.collect_data() assert result.status == ExecutionStatus.OK - assert data == MemoryDataModel( - mem_free="12345678", - mem_total="123412341234", - ) + assert data.mem_free == "12345678" + assert data.mem_total == "123412341234" + assert data.lsmem_data is None + assert conn_mock.run_command.call_count == 1 + + +def test_run_linux_lsmem_fails(collector, conn_mock): + def mock_run_command(command, **kwargs): + if "free" in command: + return CommandArtifact( + exit_code=0, + stdout=( + " total used free shared buff/cache available\n" + "Mem: 2164113772544 31750934528 2097459761152 893313024 34903076864 2122320150528\n" + "Swap: 8589930496 0 8589930496" + ), + stderr="", + command="free -b", + ) + elif "lsmem" in command: + return CommandArtifact( + exit_code=127, + stdout="", + stderr="lsmem: command not found", + command="lsmem", + ) + elif "numactl" in command: + return 
CommandArtifact( + exit_code=127, + stdout="", + stderr="numactl: command not found", + command="numactl -H", + ) + return CommandArtifact(exit_code=1, stdout="", stderr="", command=command) + + conn_mock.run_command.side_effect = mock_run_command + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data.mem_free == "2097459761152" + assert data.mem_total == "2164113772544" + assert data.lsmem_data is None + assert data.numa_topology is None + lsmem_events = [e for e in result.events if "lsmem" in e.description] + assert len(lsmem_events) > 0 + numactl_events = [e for e in result.events if "numactl" in e.description] + assert len(numactl_events) > 0 def test_run_error(collector, conn_mock): @@ -101,3 +193,183 @@ def test_run_error(collector, conn_mock): assert data is None assert result.events[0].category == EventCategory.OS.value assert result.events[0].description == "Error checking available and total memory" + + +def test_parse_lsmem_output(collector): + """Test parsing of lsmem command output.""" + lsmem_output = ( + "RANGE SIZE STATE REMOVABLE BLOCK\n" + "0x0000000000000000-0x000000007fffffff 2G online yes 0-15\n" + "0x0000000100000000-0x000000207fffffff 126G online yes 32-2047\n" + "0x0000002080000000-0x000000407fffffff 126G online no 2048-4095\n" + "\n" + "Memory block size: 128M\n" + "Total online memory: 254G\n" + "Total offline memory: 0B\n" + ) + + result = collector._parse_lsmem_output(lsmem_output) + + assert result is not None + assert len(result.memory_blocks) == 3 + + assert result.memory_blocks[0].range == "0x0000000000000000-0x000000007fffffff" + assert result.memory_blocks[0].size == "2G" + assert result.memory_blocks[0].state == "online" + assert result.memory_blocks[0].removable == "yes" + assert result.memory_blocks[0].block == "0-15" + + assert result.memory_blocks[1].range == "0x0000000100000000-0x000000207fffffff" + assert result.memory_blocks[1].size == "126G" + assert 
result.memory_blocks[1].state == "online" + + assert result.memory_blocks[2].removable == "no" + assert result.memory_blocks[2].block == "2048-4095" + + assert result.summary.memory_block_size == "128M" + assert result.summary.total_online_memory == "254G" + assert result.summary.total_offline_memory == "0B" + + +def test_parse_lsmem_output_no_blocks(collector): + """Test parsing of lsmem output with no memory blocks.""" + lsmem_output = ( + "RANGE SIZE STATE REMOVABLE BLOCK\n" + "\n" + "Memory block size: 128M\n" + "Total online memory: 0G\n" + "Total offline memory: 0B\n" + ) + + result = collector._parse_lsmem_output(lsmem_output) + + assert result is None + + +def test_parse_lsmem_output_empty(collector): + """Test parsing of empty lsmem output.""" + result = collector._parse_lsmem_output("") + assert result is None + + +def test_parse_numactl_hardware_two_nodes(collector): + """Test parsing of numactl -H output with 2 NUMA nodes.""" + numactl_output = """available: 2 nodes (0-1) +node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +node 0 size: 32768 MB +node 0 free: 15234 MB +node 1 cpus: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +node 1 size: 32768 MB +node 1 free: 20145 MB +node distances: +node 0 1 + 0: 10 21 + 1: 21 10""" + + result = collector._parse_numactl_hardware(numactl_output) + + assert result is not None + assert result.available_nodes == [0, 1] + assert len(result.nodes) == 2 + + # Check node 0 + assert result.nodes[0].node_id == 0 + assert result.nodes[0].cpus == list(range(16)) + assert result.nodes[0].memory_size_mb == 32768 + assert result.nodes[0].memory_free_mb == 15234 + + # Check node 1 + assert result.nodes[1].node_id == 1 + assert result.nodes[1].cpus == list(range(16, 32)) + assert result.nodes[1].memory_size_mb == 32768 + assert result.nodes[1].memory_free_mb == 20145 + + # Check distances + assert len(result.distances) == 4 + assert result.distance_matrix is not None + assert result.distance_matrix[0][0] == 10 + assert 
result.distance_matrix[0][1] == 21 + assert result.distance_matrix[1][0] == 21 + assert result.distance_matrix[1][1] == 10 + + +def test_parse_numactl_hardware_single_node(collector): + """Test parsing of numactl -H output with single NUMA node.""" + numactl_output = """available: 1 nodes (0) +node 0 cpus: 0 1 2 3 4 5 6 7 +node 0 size: 16384 MB +node 0 free: 8192 MB +node distances: +node 0 + 0: 10""" + + result = collector._parse_numactl_hardware(numactl_output) + + assert result is not None + assert result.available_nodes == [0] + assert len(result.nodes) == 1 + assert result.nodes[0].node_id == 0 + assert result.nodes[0].cpus == [0, 1, 2, 3, 4, 5, 6, 7] + assert result.nodes[0].memory_size_mb == 16384 + assert result.nodes[0].memory_free_mb == 8192 + assert len(result.distances) == 1 + assert result.distance_matrix[0][0] == 10 + + +def test_parse_numactl_hardware_no_memory_info(collector): + """Test parsing of numactl -H output without memory size/free info.""" + numactl_output = """available: 2 nodes (0-1) +node 0 cpus: 0 1 2 3 +node 1 cpus: 4 5 6 7 +node distances: +node 0 1 + 0: 10 21 + 1: 21 10""" + + result = collector._parse_numactl_hardware(numactl_output) + + assert result is not None + assert len(result.nodes) == 2 + assert result.nodes[0].memory_size_mb is None + assert result.nodes[0].memory_free_mb is None + assert result.nodes[1].memory_size_mb is None + assert result.nodes[1].memory_free_mb is None + + +def test_parse_numactl_hardware_empty_output(collector): + """Test parsing of empty numactl output.""" + result = collector._parse_numactl_hardware("") + assert result is None + + +def test_parse_numactl_hardware_four_nodes(collector): + """Test parsing of numactl -H output with 4 NUMA nodes.""" + numactl_output = """available: 4 nodes (0-3) +node 0 cpus: 0 1 2 3 +node 0 size: 8192 MB +node 0 free: 4096 MB +node 1 cpus: 4 5 6 7 +node 1 size: 8192 MB +node 1 free: 3000 MB +node 2 cpus: 8 9 10 11 +node 2 size: 8192 MB +node 2 free: 5000 MB +node 3 
cpus: 12 13 14 15 +node 3 size: 8192 MB +node 3 free: 6000 MB +node distances: +node 0 1 2 3 + 0: 10 21 21 21 + 1: 21 10 21 21 + 2: 21 21 10 21 + 3: 21 21 21 10""" + + result = collector._parse_numactl_hardware(numactl_output) + + assert result is not None + assert result.available_nodes == [0, 1, 2, 3] + assert len(result.nodes) == 4 + assert len(result.distances) == 16 + assert result.distance_matrix[0][0] == 10 + assert result.distance_matrix[0][3] == 21 + assert result.distance_matrix[3][3] == 10 diff --git a/test/unit/plugin/test_network_collector.py b/test/unit/plugin/test_network_collector.py new file mode 100644 index 00000000..9d7e7546 --- /dev/null +++ b/test/unit/plugin/test_network_collector.py @@ -0,0 +1,544 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from unittest.mock import MagicMock + +import pytest + +from nodescraper.enums.executionstatus import ExecutionStatus +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.models.systeminfo import OSFamily +from nodescraper.plugins.inband.network.network_collector import NetworkCollector +from nodescraper.plugins.inband.network.networkdata import ( + EthtoolInfo, + IpAddress, + Neighbor, + NetworkDataModel, + NetworkInterface, + Route, + RoutingRule, +) + + +@pytest.fixture +def collector(system_info, conn_mock): + return NetworkCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + + +# Sample command outputs for testing (mock data) +IP_ADDR_OUTPUT = """1: lo: mtu 12345 qdisc noqueue state UNKNOWN group default qlen 1000 + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + inet6 ::1/128 scope host + valid_lft forever preferred_lft forever +2: eth0: mtu 5678 qdisc mq state UP group default qlen 1000 + link/ether aa:bb:cc:dd:ee:ff brd ff:ff:ff:ff:ff:ff + inet 1.123.123.100/24 brd 1.123.123.255 scope global noprefixroute eth0 + valid_lft forever preferred_lft forever + inet6 fe80::aabb:ccff/64 scope link + valid_lft forever preferred_lft forever""" + +IP_ROUTE_OUTPUT = """default via 2.123.123.1 dev eth0 proto static metric 100 +2.123.123.0/24 dev eth0 proto kernel scope link src 2.123.123.100 metric 100 +7.8.0.0/16 dev docker0 proto kernel scope link src 7.8.0.1 linkdown""" + +IP_RULE_OUTPUT = """0: from all lookup local +89145: from all lookup main +56789: from all lookup default""" + +IP_NEIGHBOR_OUTPUT = """50.50.1.50 dev eth0 lladdr 11:22:33:44:55:66 STALE +50.50.1.1 dev eth0 lladdr 99:88:77:66:55:44 REACHABLE""" + +ETHTOOL_OUTPUT = """Settings for ethmock123: + Supported ports: [ TP ] + 
Supported link modes: 10mockbaseT/Half + 123mockbaseT/Half + 1234mockbaseT/Full + Supported pause frame use: Symmetric + Supports auto-negotiation: Yes + Supported FEC modes: Not reported + Advertised link modes: 10mockbaseT/Half 10mockbaseT/Full + 167mockbaseT/Half 167mockbaseT/Full + 1345mockbaseT/Full + Advertised pause frame use: Symmetric + Advertised auto-negotiation: Yes + Advertised FEC modes: Xyz ABCfec + Speed: 1000mockMb/s + Duplex: Full + Port: MockedTwisted Pair + PHYAD: 1 + Transceiver: internal + Auto-negotiation: on + MDI-X: on (auto) + Supports Wake-on: qwerty + Wake-on: g + Current message level: 0x123123 + Link detected: yes""" + +ETHTOOL_NO_LINK_OUTPUT = """Settings for ethmock1: + Supported ports: [ FIBRE ] + Supported link modes: 11122mockbaseT/Full + Speed: Unknown! + Duplex: Unknown! + Port: FIBRE + Auto-negotiation: off + Link detected: no""" + + +def test_parse_ip_addr_loopback(collector): + """Test parsing loopback interface from ip addr output""" + interfaces = collector._parse_ip_addr(IP_ADDR_OUTPUT) + + # Find loopback interface + lo = next((i for i in interfaces if i.name == "lo"), None) + assert lo is not None + assert lo.index == 1 + assert lo.state == "UNKNOWN" + assert lo.mtu == 12345 + assert lo.qdisc == "noqueue" + assert lo.mac_address == "00:00:00:00:00:00" + assert "LOOPBACK" in lo.flags + assert "UP" in lo.flags + + # Check addresses + assert len(lo.addresses) == 2 + ipv4 = next((a for a in lo.addresses if a.family == "inet"), None) + assert ipv4 is not None + assert ipv4.address == "127.0.0.1" + assert ipv4.prefix_len == 8 + assert ipv4.scope == "host" + + +def test_parse_ip_addr_ethernet(collector): + """Test parsing ethernet interface from ip addr output""" + interfaces = collector._parse_ip_addr(IP_ADDR_OUTPUT) + + # Find ethernet interface + eth = next((i for i in interfaces if i.name == "eth0"), None) + assert eth is not None + assert eth.index == 2 + assert eth.state == "UP" + assert eth.mtu == 5678 + assert eth.qdisc 
== "mq" + assert eth.mac_address == "aa:bb:cc:dd:ee:ff" + assert "BROADCAST" in eth.flags + assert "MULTICAST" in eth.flags + + # Check IPv4 address + ipv4 = next((a for a in eth.addresses if a.family == "inet"), None) + assert ipv4 is not None + assert ipv4.address == "1.123.123.100" + assert ipv4.prefix_len == 24 + assert ipv4.broadcast == "1.123.123.255" + assert ipv4.scope == "global" + + +def test_parse_ip_route_default(collector): + """Test parsing default route""" + routes = collector._parse_ip_route(IP_ROUTE_OUTPUT) + + # Find default route + default_route = next((r for r in routes if r.destination == "default"), None) + assert default_route is not None + assert default_route.gateway == "2.123.123.1" + assert default_route.device == "eth0" + assert default_route.protocol == "static" + assert default_route.metric == 100 + + +def test_parse_ip_route_network(collector): + """Test parsing network route with source""" + routes = collector._parse_ip_route(IP_ROUTE_OUTPUT) + + # Find network route + net_route = next((r for r in routes if r.destination == "2.123.123.0/24"), None) + assert net_route is not None + assert net_route.gateway is None # Direct route, no gateway + assert net_route.device == "eth0" + assert net_route.protocol == "kernel" + assert net_route.scope == "link" + assert net_route.source == "2.123.123.100" + assert net_route.metric == 100 + + +def test_parse_ip_route_docker(collector): + """Test parsing docker bridge route""" + routes = collector._parse_ip_route(IP_ROUTE_OUTPUT) + + # Find docker route + docker_route = next((r for r in routes if r.destination == "7.8.0.0/16"), None) + assert docker_route is not None + assert docker_route.gateway is None + assert docker_route.device == "docker0" + assert docker_route.protocol == "kernel" + assert docker_route.scope == "link" + assert docker_route.source == "7.8.0.1" + + +def test_parse_ip_rule_basic(collector): + """Test parsing routing rules""" + rules = collector._parse_ip_rule(IP_RULE_OUTPUT) + 
+ assert len(rules) == 3 + + # Check local rule + local_rule = next((r for r in rules if r.priority == 0), None) + assert local_rule is not None + assert local_rule.source is None # "from all" + assert local_rule.destination is None + assert local_rule.table == "local" + assert local_rule.action == "lookup" + + # Check main rule + main_rule = next((r for r in rules if r.priority == 89145), None) + assert main_rule is not None + assert main_rule.table == "main" + + # Check default rule + default_rule = next((r for r in rules if r.priority == 56789), None) + assert default_rule is not None + assert default_rule.table == "default" + + +def test_parse_ip_rule_complex(collector): + """Test parsing complex routing rule with all fields""" + complex_rule_output = ( + "100: from 192.168.1.0/24 to 10.0.0.0/8 iif eth0 oif eth1 fwmark 0x10 lookup custom_table" + ) + + rules = collector._parse_ip_rule(complex_rule_output) + + assert len(rules) == 1 + rule = rules[0] + assert rule.priority == 100 + assert rule.source == "192.168.1.0/24" + assert rule.destination == "10.0.0.0/8" + assert rule.iif == "eth0" + assert rule.oif == "eth1" + assert rule.fwmark == "0x10" + assert rule.table == "custom_table" + assert rule.action == "lookup" + + +def test_parse_ip_neighbor_reachable(collector): + """Test parsing neighbor entries""" + neighbors = collector._parse_ip_neighbor(IP_NEIGHBOR_OUTPUT) + + # Check REACHABLE neighbor + reachable = next((n for n in neighbors if n.state == "REACHABLE"), None) + assert reachable is not None + assert reachable.ip_address == "50.50.1.1" + assert reachable.device == "eth0" + assert reachable.mac_address == "99:88:77:66:55:44" + assert reachable.state == "REACHABLE" + + +def test_parse_ip_neighbor_stale(collector): + """Test parsing STALE neighbor entry""" + neighbors = collector._parse_ip_neighbor(IP_NEIGHBOR_OUTPUT) + + # Check STALE neighbor + stale = next((n for n in neighbors if n.state == "STALE"), None) + assert stale is not None + assert 
stale.ip_address == "50.50.1.50" + assert stale.device == "eth0" + assert stale.mac_address == "11:22:33:44:55:66" + assert stale.state == "STALE" + + +def test_parse_ip_neighbor_with_flags(collector): + """Test parsing neighbor with flags""" + neighbor_with_flags = "10.0.0.1 dev eth0 lladdr aa:bb:cc:dd:ee:ff REACHABLE router proxy" + + neighbors = collector._parse_ip_neighbor(neighbor_with_flags) + + assert len(neighbors) == 1 + neighbor = neighbors[0] + assert neighbor.ip_address == "10.0.0.1" + assert neighbor.mac_address == "aa:bb:cc:dd:ee:ff" + assert neighbor.state == "REACHABLE" + assert "router" in neighbor.flags + assert "proxy" in neighbor.flags + + +def test_collect_data_success(collector, conn_mock): + """Test successful collection of all network data""" + collector.system_info.os_family = OSFamily.LINUX + + # Mock successful command execution + def run_sut_cmd_side_effect(cmd): + if "addr show" in cmd: + return MagicMock(exit_code=0, stdout=IP_ADDR_OUTPUT, command=cmd) + elif "route show" in cmd: + return MagicMock(exit_code=0, stdout=IP_ROUTE_OUTPUT, command=cmd) + elif "rule show" in cmd: + return MagicMock(exit_code=0, stdout=IP_RULE_OUTPUT, command=cmd) + elif "neighbor show" in cmd: + return MagicMock(exit_code=0, stdout=IP_NEIGHBOR_OUTPUT, command=cmd) + elif "ethtool" in cmd: + # Fail ethtool commands (simulating no sudo or not supported) + return MagicMock(exit_code=1, stdout="", command=cmd) + return MagicMock(exit_code=1, stdout="", command=cmd) + + collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert isinstance(data, NetworkDataModel) + assert len(data.interfaces) == 2 + assert len(data.routes) == 3 + assert len(data.rules) == 3 + assert len(data.neighbors) == 2 + assert "2 interfaces" in result.message + assert "3 routes" in result.message + assert "3 rules" in result.message + assert "2 neighbors" 
in result.message + assert "ethtool" in result.message + + +def test_collect_data_addr_failure(collector, conn_mock): + """Test collection when ip addr command fails""" + collector.system_info.os_family = OSFamily.LINUX + + # Mock failed addr command but successful others + def run_sut_cmd_side_effect(cmd): + if "addr show" in cmd: + return MagicMock(exit_code=1, stdout="", command=cmd) + elif "route show" in cmd: + return MagicMock(exit_code=0, stdout=IP_ROUTE_OUTPUT, command=cmd) + elif "rule show" in cmd: + return MagicMock(exit_code=0, stdout=IP_RULE_OUTPUT, command=cmd) + elif "neighbor show" in cmd: + return MagicMock(exit_code=0, stdout=IP_NEIGHBOR_OUTPUT, command=cmd) + elif "ethtool" in cmd: + return MagicMock(exit_code=1, stdout="", command=cmd) + return MagicMock(exit_code=1, stdout="", command=cmd) + + collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect) + + result, data = collector.collect_data() + + # Should still return data from successful commands + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.interfaces) == 0 # Failed + assert len(data.routes) == 3 # Success + assert len(data.rules) == 3 # Success + assert len(data.neighbors) == 2 # Success + assert len(data.ethtool_info) == 0 # No interfaces, so no ethtool data + assert len(result.events) > 0 + + +def test_collect_data_all_failures(collector, conn_mock): + """Test collection when all commands fail""" + collector.system_info.os_family = OSFamily.LINUX + + # Mock all commands failing (including ethtool) + def run_sut_cmd_side_effect(cmd): + return MagicMock(exit_code=1, stdout="", command=cmd) + + collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.ERROR + assert data is None + assert len(result.events) > 0 + + +def test_parse_empty_output(collector): + """Test parsing empty command output""" + interfaces = 
collector._parse_ip_addr("") + routes = collector._parse_ip_route("") + rules = collector._parse_ip_rule("") + neighbors = collector._parse_ip_neighbor("") + + assert len(interfaces) == 0 + assert len(routes) == 0 + assert len(rules) == 0 + assert len(neighbors) == 0 + + +def test_parse_malformed_output(collector): + """Test parsing malformed output gracefully""" + malformed = "this is not valid ip output\nsome random text\n123 456" + + # Should not crash, just return empty or skip bad lines + interfaces = collector._parse_ip_addr(malformed) + routes = collector._parse_ip_route(malformed) + neighbors = collector._parse_ip_neighbor(malformed) + + # Parser should handle gracefully + assert isinstance(interfaces, list) + assert isinstance(routes, list) + assert isinstance(neighbors, list) + + +def test_parse_ip_addr_ipv6_only(collector): + """Test parsing interface with only IPv6 address""" + ipv6_only = """3: eth1: mtu 1500 qdisc pfifo_fast state UP qlen 1000 + link/ether aa:bb:cc:dd:ee:ff brd ff:ff:ff:ff:ff:ff + inet6 fe80::a8bb:ccff:fedd:eeff/64 scope link + valid_lft forever preferred_lft forever""" + + interfaces = collector._parse_ip_addr(ipv6_only) + + assert len(interfaces) == 1 + eth1 = interfaces[0] + assert eth1.name == "eth1" + assert len(eth1.addresses) == 1 + assert eth1.addresses[0].family == "inet6" + assert eth1.addresses[0].address == "fe80::a8bb:ccff:fedd:eeff" + assert eth1.addresses[0].prefix_len == 64 + + +def test_parse_ip_rule_with_action(collector): + """Test parsing rule with unreachable action""" + rule_with_action = "200: from 10.0.0.5 unreachable" + + rules = collector._parse_ip_rule(rule_with_action) + + assert len(rules) == 1 + rule = rules[0] + assert rule.priority == 200 + assert rule.source == "10.0.0.5" + assert rule.action == "unreachable" + assert rule.table is None + + +def test_parse_ethtool_basic(collector): + """Test parsing basic ethtool output""" + ethtool_info = collector._parse_ethtool("ethmock123", ETHTOOL_OUTPUT) + + 
assert ethtool_info.interface == "ethmock123" + assert ethtool_info.speed == "1000mockMb/s" + assert ethtool_info.duplex == "Full" + assert ethtool_info.port == "MockedTwisted Pair" + assert ethtool_info.auto_negotiation == "on" + assert ethtool_info.link_detected == "yes" + assert "Speed" in ethtool_info.settings + assert ethtool_info.settings["Speed"] == "1000mockMb/s" + assert ethtool_info.settings["PHYAD"] == "1" + assert ethtool_info.raw_output == ETHTOOL_OUTPUT + + +def test_parse_ethtool_supported_link_modes(collector): + """Test parsing supported link modes from ethtool output""" + ethtool_info = collector._parse_ethtool("ethmock123", ETHTOOL_OUTPUT) + + # Check supported link modes are stored in settings dict + # Note: The current implementation stores link modes in settings dict, + # not in the supported_link_modes list + assert "Supported link modes" in ethtool_info.settings + assert "10mockbaseT/Half" in ethtool_info.settings["Supported link modes"] + + +def test_parse_ethtool_advertised_link_modes(collector): + """Test parsing advertised link modes from ethtool output""" + ethtool_info = collector._parse_ethtool("ethmock123", ETHTOOL_OUTPUT) + + # Check advertised link modes are stored in settings dict + # Note: The current implementation stores link modes in settings dict, + # not in the advertised_link_modes list + assert "Advertised link modes" in ethtool_info.settings + assert "10mockbaseT/Half" in ethtool_info.settings["Advertised link modes"] + assert "10mockbaseT/Full" in ethtool_info.settings["Advertised link modes"] + + +def test_parse_ethtool_no_link(collector): + """Test parsing ethtool output when link is down""" + ethtool_info = collector._parse_ethtool("ethmock1", ETHTOOL_NO_LINK_OUTPUT) + + assert ethtool_info.interface == "ethmock1" + assert ethtool_info.speed == "Unknown!" + assert ethtool_info.duplex == "Unknown!" 
+ assert ethtool_info.port == "FIBRE" + assert ethtool_info.auto_negotiation == "off" + assert ethtool_info.link_detected == "no" + # Check supported link modes are stored in settings dict + assert "Supported link modes" in ethtool_info.settings + assert "11122mockbaseT/Full" in ethtool_info.settings["Supported link modes"] + + +def test_parse_ethtool_empty_output(collector): + """Test parsing empty ethtool output""" + ethtool_info = collector._parse_ethtool("eth0", "") + + assert ethtool_info.interface == "eth0" + assert ethtool_info.speed is None + assert ethtool_info.duplex is None + assert ethtool_info.link_detected is None + assert len(ethtool_info.settings) == 0 + assert len(ethtool_info.supported_link_modes) == 0 + assert len(ethtool_info.advertised_link_modes) == 0 + + +def test_network_data_model_creation(collector): + """Test creating NetworkDataModel with all components""" + interface = NetworkInterface( + name="ethmock123", + index=1, + state="UP", + mtu=5678, + addresses=[IpAddress(address="1.123.123.100", prefix_len=24, family="inet")], + ) + + route = Route(destination="default", gateway="2.123.123.1", device="ethmock123") + + rule = RoutingRule(priority=100, source="1.123.123.0/24", table="main") + + neighbor = Neighbor( + ip_address="50.50.1.1", + device="ethmock123", + mac_address="11:22:33:44:55:66", + state="REACHABLE", + ) + + ethtool_info = EthtoolInfo( + interface="ethmock123", raw_output=ETHTOOL_OUTPUT, speed="1000mockMb/s", duplex="Full" + ) + + data = NetworkDataModel( + interfaces=[interface], + routes=[route], + rules=[rule], + neighbors=[neighbor], + ethtool_info={"ethmock123": ethtool_info}, + ) + + assert len(data.interfaces) == 1 + assert len(data.routes) == 1 + assert len(data.rules) == 1 + assert len(data.neighbors) == 1 + assert len(data.ethtool_info) == 1 + assert data.interfaces[0].name == "ethmock123" + assert data.ethtool_info["ethmock123"].speed == "1000mockMb/s" diff --git a/test/unit/plugin/test_package_analyzer.py 
b/test/unit/plugin/test_package_analyzer.py index f9401cc9..5027f6e1 100644 --- a/test/unit/plugin/test_package_analyzer.py +++ b/test/unit/plugin/test_package_analyzer.py @@ -89,4 +89,22 @@ def test_data_version_regex(package_analyzer, default_data_lib): regex_match=True, ) res = package_analyzer.analyze_data(default_data_lib, args=args) + assert res.status == ExecutionStatus.OK + assert res.message == "All packages found and versions matched" + + +def test_data_multiple_errors_regex(package_analyzer, default_data_lib): + """Test that detailed error messages are shown for multiple package errors""" + args = PackageAnalyzerArgs( + exp_package_ver={ + "missing-package": None, + "test-ubuntu-package\\.x86_64": "2\\.\\d+", + "another-missing": "1\\.0", + }, + regex_match=True, + ) + res = package_analyzer.analyze_data(default_data_lib, args=args) assert res.status == ExecutionStatus.ERROR + assert "missing-package" in res.message + assert "another-missing" in res.message + assert len(res.events) == 3 diff --git a/test/unit/plugin/test_package_collector.py b/test/unit/plugin/test_package_collector.py index 8cf6f2fa..b83755fb 100644 --- a/test/unit/plugin/test_package_collector.py +++ b/test/unit/plugin/test_package_collector.py @@ -222,3 +222,112 @@ def test_bad_splits_ubuntu(collector, conn_mock, command_results): ] res, _ = collector.collect_data() assert res.status == ExecutionStatus.OK + + +def test_rocm_package_filtering_custom_regex(collector, conn_mock, command_results): + """Test ROCm package filtering with custom regex pattern.""" + from nodescraper.plugins.inband.package.analyzer_args import PackageAnalyzerArgs + + # Mock Ubuntu system with ROCm packages + ubuntu_packages = """rocm-core 5.7.0 + hip-runtime-amd 5.7.0 + hsa-rocr 1.9.0 + amdgpu-dkms 6.3.6 + gcc 11.4.0 + python3 3.10.12""" + + conn_mock.run_command.side_effect = [ + CommandArtifact( + command="", + exit_code=0, + stdout=command_results["ubuntu_rel"], + stderr="", + ), + CommandArtifact( + 
command="", + exit_code=0, + stdout=ubuntu_packages, + stderr="", + ), + ] + + # Use custom regex that only matches 'rocm' and 'hip' + args = PackageAnalyzerArgs(rocm_regex="rocm|hip") + res, data = collector.collect_data(args) + assert res.status == ExecutionStatus.OK + # Check that ROCm packages are found + assert "found 2 rocm-related packages" in res.message.lower() + assert data is not None + + +def test_rocm_package_filtering_no_matches(collector, conn_mock, command_results): + """Test ROCm package filtering when no ROCm packages are installed.""" + from nodescraper.plugins.inband.package.analyzer_args import PackageAnalyzerArgs + + # Mock Ubuntu system without ROCm packages + ubuntu_packages = """gcc 11.4.0 + python3 3.10.12 + vim 8.2.3995""" + + conn_mock.run_command.side_effect = [ + CommandArtifact( + command="", + exit_code=0, + stdout=command_results["ubuntu_rel"], + stderr="", + ), + CommandArtifact( + command="", + exit_code=0, + stdout=ubuntu_packages, + stderr="", + ), + ] + + args = PackageAnalyzerArgs(rocm_regex="rocm|hip|hsa") + res, data = collector.collect_data(args) + assert res.status == ExecutionStatus.OK + # No ROCm packages found, so message should not mention them + assert "rocm" not in res.message.lower() or res.message == "" + assert data is not None + assert len(data.version_info) == 3 + + +def test_filter_rocm_packages_method(collector): + """Test _filter_rocm_packages method directly.""" + packages = { + "rocm-core": "5.7.0", + "hip-runtime-amd": "5.7.0", + "hsa-rocr": "1.9.0", + "amdgpu-dkms": "6.3.6", + "gcc": "11.4.0", + "python3": "3.10.12", + } + + # Test with default-like pattern + rocm_pattern = "rocm|hip|hsa|amdgpu" + filtered = collector._filter_rocm_packages(packages, rocm_pattern) + + assert len(filtered) == 4 + assert "rocm-core" in filtered + assert "hip-runtime-amd" in filtered + assert "hsa-rocr" in filtered + assert "amdgpu-dkms" in filtered + assert "gcc" not in filtered + assert "python3" not in filtered + + +def 
test_filter_rocm_packages_case_insensitive(collector): + """Test that ROCm package filtering is case-insensitive.""" + packages = { + "ROCM-Core": "5.7.0", + "HIP-Runtime-AMD": "5.7.0", + "gcc": "11.4.0", + } + + rocm_pattern = "rocm|hip" + filtered = collector._filter_rocm_packages(packages, rocm_pattern) + + assert len(filtered) == 2 + assert "ROCM-Core" in filtered + assert "HIP-Runtime-AMD" in filtered diff --git a/test/unit/plugin/test_rocm_analyzer.py b/test/unit/plugin/test_rocm_analyzer.py index 18afed9d..9ecc7fb4 100644 --- a/test/unit/plugin/test_rocm_analyzer.py +++ b/test/unit/plugin/test_rocm_analyzer.py @@ -42,7 +42,7 @@ def analyzer(system_info): @pytest.fixture def model_obj(): - return RocmDataModel(rocm_version="6.2.0-66") + return RocmDataModel(rocm_version="6.2.0-66", rocm_latest_versioned_path="/opt/rocm-7.1.0") @pytest.fixture @@ -50,14 +50,16 @@ def config(): return { "rocm_version": ["6.2.0-66"], "invalid": "invalid", + "rocm_latest": "/opt/rocm-7.1.0", } def test_all_good_data(analyzer, model_obj, config): - args = RocmAnalyzerArgs(exp_rocm=config["rocm_version"]) + args = RocmAnalyzerArgs(exp_rocm=config["rocm_version"], exp_rocm_latest=config["rocm_latest"]) result = analyzer.analyze_data(model_obj, args) assert result.status == ExecutionStatus.OK - assert result.message == "ROCm version matches expected" + assert "ROCm version matches expected" in result.message + assert "ROCm latest path validated" in result.message assert all( event.priority not in {EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL} for event in result.events @@ -94,3 +96,16 @@ def test_unexpected_rocm_version(analyzer, model_obj): def test_invalid_user_config(analyzer, model_obj, config): result = analyzer.analyze_data(model_obj, None) assert result.status == ExecutionStatus.NOT_RAN + + +def test_rocm_latest_path_mismatch(analyzer, model_obj): + """Test that rocm_latest path mismatch is detected and logged""" + args = 
RocmAnalyzerArgs(exp_rocm=["6.2.0-66"], exp_rocm_latest="/opt/rocm-6.2.0") + result = analyzer.analyze_data(model_obj, args) + assert result.status == ExecutionStatus.ERROR + assert "ROCm latest path mismatch" in result.message + assert "/opt/rocm-6.2.0" in result.message # expected + assert "/opt/rocm-7.1.0" in result.message # actual + for event in result.events: + assert event.priority == EventPriority.CRITICAL + assert event.category == EventCategory.SW_DRIVER.value diff --git a/test/unit/plugin/test_rocm_collector.py b/test/unit/plugin/test_rocm_collector.py index 60e63f28..2b419ad1 100644 --- a/test/unit/plugin/test_rocm_collector.py +++ b/test/unit/plugin/test_rocm_collector.py @@ -23,68 +23,266 @@ # SOFTWARE. # ############################################################################### -import copy +from unittest.mock import MagicMock import pytest from nodescraper.enums.eventcategory import EventCategory -from nodescraper.enums.eventpriority import EventPriority from nodescraper.enums.executionstatus import ExecutionStatus -from nodescraper.plugins.inband.rocm.analyzer_args import RocmAnalyzerArgs -from nodescraper.plugins.inband.rocm.rocm_analyzer import RocmAnalyzer -from nodescraper.plugins.inband.rocm.rocmdata import RocmDataModel +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.plugins.inband.rocm.rocm_collector import RocmCollector @pytest.fixture -def model_obj(): - return RocmDataModel(rocm_version="6.2.0-66") +def collector(system_info, conn_mock): + return RocmCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) -@pytest.fixture -def analyzer(system_info): - return RocmAnalyzer(system_info=system_info) +def test_collect_rocm_version_success(collector): + """Test successful collection of ROCm version from version-rocm file""" + collector._run_sut_cmd = MagicMock( + return_value=MagicMock( + exit_code=0, + stdout="6.2.0-66", + 
command="grep . /opt/rocm/.info/version-rocm", + ) + ) + result, data = collector.collect_data() -def test_all_good_data(analyzer, model_obj): - args = RocmAnalyzerArgs(exp_rocm=["6.2.0-66"]) - result = analyzer.analyze_data(model_obj, args=args) assert result.status == ExecutionStatus.OK - assert result.message == "ROCm version matches expected" - assert all( - event.priority not in [EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL] - for event in result.events + assert data is not None + assert data.rocm_version == "6.2.0-66" + assert "ROCm version: 6.2.0-66" in result.message + + +def test_collect_rocm_version_fallback(collector): + """Test fallback to version file when version-rocm fails""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock(exit_code=1, stdout="", command="grep . /opt/rocm/.info/version-rocm"), + MagicMock(exit_code=0, stdout="6.2.0-66", command="grep . /opt/rocm/.info/version"), + # Additional commands after finding version + MagicMock(exit_code=1, stdout=""), # latest path + MagicMock(exit_code=1, stdout=""), # all paths + MagicMock(exit_code=1, stdout=""), # rocminfo + MagicMock(exit_code=1, stdout=""), # ld.so.conf + MagicMock(exit_code=1, stdout=""), # rocm_libs + MagicMock(exit_code=1, stdout=""), # env_vars + MagicMock(exit_code=1, stdout=""), # clinfo + MagicMock(exit_code=1, stdout=""), # kfd_proc + ] ) + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rocm_version == "6.2.0-66" -def test_no_config_data(analyzer, model_obj): - result = analyzer.analyze_data(model_obj) - assert result.status == ExecutionStatus.NOT_RAN +def test_collect_rocm_version_not_found(collector): + """Test when ROCm version cannot be found""" + collector._run_sut_cmd = MagicMock( + return_value=MagicMock( + exit_code=1, + stdout="", + stderr="No such file or directory", + command="grep . 
/opt/rocm/.info/version-rocm", + ) + ) -def test_invalid_rocm_version(analyzer, model_obj): - modified_model = copy.deepcopy(model_obj) - modified_model.rocm_version = "some_invalid_version" - args = RocmAnalyzerArgs(exp_rocm=["6.2.0-66"]) - result = analyzer.analyze_data(modified_model, args=args) + result, data = collector.collect_data() assert result.status == ExecutionStatus.ERROR - assert "ROCm version mismatch!" in result.message - for event in result.events: - assert event.priority == EventPriority.CRITICAL - assert event.category == EventCategory.SW_DRIVER.value + assert data is None + assert "ROCm version not found" in result.message + assert any(event.category == EventCategory.OS.value for event in result.events) -def test_unexpected_rocm_version(analyzer, model_obj): - args = RocmAnalyzerArgs(exp_rocm=["9.8.7-65", "1.2.3-45"]) - result = analyzer.analyze_data(model_obj, args=args) +def test_collect_all_rocm_data(collector): + """Test collection of all ROCm data including tech support commands""" + # Mock all command outputs in sequence + collector._run_sut_cmd = MagicMock( + side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # Latest versioned path + MagicMock(exit_code=0, stdout="/opt/rocm-1.1.0"), + # All ROCm paths + MagicMock(exit_code=0, stdout="/opt/rocm\n/opt/rocm-1.2.3\n/opt/rocm-5.6.0"), + # rocminfo output + MagicMock( + exit_code=0, + stdout="ROCk module is loaded\nAgent 1\n Name: AMD Instinct MI1234XYZ\n Marketing Name: MI1234XYZ", + ), + # ld.so.conf entries + MagicMock( + exit_code=0, + stdout="/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib\n/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib64", + ), + # ROCm libraries from ldconfig + MagicMock( + exit_code=0, + stdout="librocm_smi64.so.7 (libc6,x86-64) => /opt/rocm/lib/librocm_smi64.so.7\nlibhsa-runtime64.so.1 (libc6,x86-64) => /opt/rocm/lib/libhsa-runtime64.so.1", + ), + # Environment variables + MagicMock( + exit_code=0, + 
stdout="ROCM_PATH=/opt/rocm\nSLURM_MPI_TYPE=pmi2\n__LMOD_REF_COUNT_MODULEPATH=/share/contrib-modules/.mfiles/Core:1\nMODULEPATH=/share/contrib-modules/", + ), + # clinfo output + MagicMock( + exit_code=0, + stdout="Number of platforms: 1\nPlatform Name: AMD Accelerated Parallel Processing\nPlatform Vendor: Advanced Micro Devices, Inc.\nPlatform Version: OpenCL 2.0 AMD-APP (XXXX.X)\nPlatform Profile: FULL_PROFILE\nPlatform Extensions: cl_khr_icd cl_khr_il_program", + ), + # KFD process list + MagicMock(exit_code=0, stdout="1234\n5678"), + ] + ) - assert result.status == ExecutionStatus.ERROR - assert "ROCm version mismatch!" in result.message - for event in result.events: - assert event.priority == EventPriority.CRITICAL - assert event.category == EventCategory.SW_DRIVER.value + result, data = collector.collect_data() + + # Verify result status + assert result.status == ExecutionStatus.OK + assert data is not None + + # Verify ROCm version + assert data.rocm_version == "6.2.0-66" + + # Verify ROCm latest path + assert data.rocm_latest_versioned_path == "/opt/rocm-1.1.0" + + # Verify all ROCm paths + assert data.rocm_all_paths == ["/opt/rocm", "/opt/rocm-1.2.3", "/opt/rocm-5.6.0"] + + # Verify rocminfo output + assert len(data.rocminfo) == 4 + assert "ROCk module is loaded" in data.rocminfo[0] + assert "AMD Instinct MI1234XYZ" in data.rocminfo[2] + + # Verify ld.so.conf entries + assert len(data.ld_conf_rocm) == 2 + assert "/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib" in data.ld_conf_rocm + assert "/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib64" in data.ld_conf_rocm + + # Verify ROCm libraries + assert len(data.rocm_libs) == 2 + assert any("librocm_smi64" in lib for lib in data.rocm_libs) + assert any("libhsa-runtime64" in lib for lib in data.rocm_libs) + + # Verify environment variables + assert len(data.env_vars) == 4 + assert "ROCM_PATH=/opt/rocm" in data.env_vars + assert "MODULEPATH=/share/contrib-modules/" in data.env_vars + + # 
Verify clinfo output + assert len(data.clinfo) == 6 + assert "AMD Accelerated Parallel Processing" in data.clinfo[1] + # Verify KFD process list + assert len(data.kfd_proc) == 2 + assert "1234" in data.kfd_proc + assert "5678" in data.kfd_proc -def test_invalid_user_config(analyzer, model_obj): - result = analyzer.analyze_data(model_obj, None) - assert result.status == ExecutionStatus.NOT_RAN + # Verify artifact was created + assert len(result.artifacts) == 1 + assert result.artifacts[0].filename == "rocminfo.log" + assert "ROCMNFO OUTPUT" in result.artifacts[0].contents + assert "CLINFO OUTPUT" in result.artifacts[0].contents + + +def test_collect_with_clinfo_failure(collector): + """Test that clinfo failure is handled gracefully and captured in artifact""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # Latest versioned path + MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), + # All ROCm paths + MagicMock(exit_code=0, stdout="/opt/rocm"), + # rocminfo success + MagicMock(exit_code=0, stdout="ROCk module loaded"), + # Other commands + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + # clinfo failure + MagicMock( + exit_code=127, + stdout="", + stderr="No such file or directory", + command="/opt/rocm-7.1.0/opencl/bin/*/clinfo", + ), + # kfd_proc + MagicMock(exit_code=0, stdout=""), + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data.clinfo == [] + + # Verify artifact contains error information + assert len(result.artifacts) == 1 + artifact_content = result.artifacts[0].contents + assert "CLINFO OUTPUT" in artifact_content + assert "Exit Code: 127" in artifact_content + assert "No such file or directory" in artifact_content + + +def test_collect_minimal_data(collector): + """Test collection when only version is available""" + collector._run_sut_cmd = MagicMock( + 
side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # All subsequent commands fail + MagicMock(exit_code=1, stdout=""), # latest path + MagicMock(exit_code=1, stdout=""), # all paths + MagicMock(exit_code=1, stdout=""), # rocminfo + MagicMock(exit_code=1, stdout=""), # ld.so.conf + MagicMock(exit_code=1, stdout=""), # rocm_libs + MagicMock(exit_code=1, stdout=""), # env_vars + MagicMock(exit_code=1, stdout=""), # clinfo + MagicMock(exit_code=1, stdout=""), # kfd_proc + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rocm_version == "6.2.0-66" + + # Verify optional fields have default values + assert data.rocm_latest_versioned_path == "" + assert data.rocm_all_paths == [] + assert data.rocminfo == [] + assert data.ld_conf_rocm == [] + assert data.rocm_libs == [] + assert data.env_vars == [] + assert data.clinfo == [] + assert data.kfd_proc == [] + + +def test_invalid_rocm_version_format(collector): + """Test that invalid ROCm version format is handled gracefully""" + collector._run_sut_cmd = MagicMock( + return_value=MagicMock( + exit_code=0, + stdout="invalid_version_format", + ) + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.ERROR + assert data is None + assert len(result.events) >= 1 diff --git a/test/unit/plugin/test_sysctl_analyzer.py b/test/unit/plugin/test_sysctl_analyzer.py index ffc943fe..dacca202 100644 --- a/test/unit/plugin/test_sysctl_analyzer.py +++ b/test/unit/plugin/test_sysctl_analyzer.py @@ -1,3 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### import pytest from nodescraper.enums import ExecutionStatus @@ -38,4 +63,6 @@ def test_analyzer_mismatch(analyzer, correct_data): args = SysctlAnalyzerArgs(exp_vm_swappiness=3, exp_vm_numa_balancing=4) result = analyzer.analyze_data(correct_data, args) assert result.status == ExecutionStatus.ERROR - assert "2 sysctl parameter(s) mismatched. (1 errors)" in result.message + assert "2 sysctl parameter(s) mismatched." 
in result.message + assert "1 errors" in result.message + assert "Sysctl mismatch detected" in result.message diff --git a/test/unit/plugin/test_sysctl_collector.py b/test/unit/plugin/test_sysctl_collector.py index 553a6d41..85e452ef 100644 --- a/test/unit/plugin/test_sysctl_collector.py +++ b/test/unit/plugin/test_sysctl_collector.py @@ -1,3 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### from types import SimpleNamespace import pytest