Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions aci-preupgrade-validation-script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6007,6 +6007,156 @@ def apic_vmm_inventory_sync_faults_check(**kwargs):
recommended_action=recommended_action,
doc_url=doc_url)

@check_wrapper(check_title="Switch SSD Diagnostic Test Validation")
def switch_ssd_diag_test_check(username, password, fabric_nodes, **kwargs):
    """Validate SSD diagnostic test 24 results on every active switch.

    SSHes to each active (non-controller) fabric node, binding the session to
    this APIC's own fabric IP, runs
    ``show diagnostic result module 1 test 24 detail`` and flags nodes whose
    Error code is not SUCCESS or whose every recorded run has failed. Each
    flagged node is cross-referenced against fault F2421 raised on the APIC.

    Args:
        username: SSH username used to log in to the switches.
        password: SSH password used to log in to the switches.
        fabric_nodes: fabricNode objects (icurl JSON) for the whole fabric.

    Returns:
        Result with PASS, FAIL_O, NA or ERROR and a per-node data table.
    """
    result = PASS
    recommended_action = "Contact Cisco TAC to investigate SSD diagnostic test failures."
    doc_url = "https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#switch-ssd-diag-test-check"

    headers = ["Node Name", "Error Code", "Total Failures", "Fault Code"]
    data = []

    # Determine this APIC's fabric IP so each switch SSH session can be
    # bound to it (traffic must source from the APIC's inband address).
    try:
        apic_hostname = run_cmd("bash -c \"hostname\"", splitlines=True)[0].strip()
        if not apic_hostname:
            return Result(result=ERROR, msg="Could not determine APIC hostname")

        apic_ip = next(
            (node["fabricNode"]["attributes"].get("address")
             for node in fabric_nodes
             if node["fabricNode"]["attributes"]["name"] == apic_hostname),
            None
        )
    except Exception as e:
        return Result(result=ERROR, msg="Failed to get APIC IP: {}".format(e))

    if not apic_ip:
        return Result(result=ERROR, msg="Could not determine APIC IP address from fabric nodes")

    # Only active switches are reachable and relevant; controllers (APICs)
    # are excluded.
    switches = [
        node for node in fabric_nodes
        if node["fabricNode"]["attributes"].get("role") != "controller"
        and node["fabricNode"]["attributes"].get("fabricSt") == "active"
    ]

    if not switches:
        return Result(result=NA, msg="No active switches found in fabric")

    # Collect the IDs of nodes that have already raised F2421 (SSD issue).
    fault_per_node = {}
    try:
        F2421_faults = icurl('class', 'faultInst.json?query-target-filter=eq(faultInst.code,"F2421")')
        for fault in F2421_faults:
            fault_dn = fault["faultInst"]["attributes"]["dn"]
            node_match = re.search(node_regex, fault_dn)
            if node_match:
                fault_per_node[node_match.group("node")] = True
    except Exception as e:
        return Result(result=ERROR, msg="Failed to retrieve F2421 faults: {}".format(e))

    # SSH to each switch and parse the diagnostic test output.
    for switch in switches:
        attr = switch["fabricNode"]["attributes"]
        node_id = attr.get("id")
        node_name = attr.get("name")

        try:
            c = Connection(node_name)
            c.username = username
            c.password = password
            c.bind_ip = apic_ip  # Route traffic through APIC inband IP
            c.connect()
            try:
                c.cmd("show diagnostic result module 1 test 24 detail", timeout=60)
                output = c.output
            finally:
                # Always release the SSH session, even if the command fails.
                c.close()

            # Expected markers in the output, e.g.:
            #   Error code ------------------> DIAG TEST SUCCESS (or FAIL)
            #   Total run count -------------> 42
            #   Total failure count ---------> 0
            error_code = None
            total_failures = None
            total_run_count = None

            error_match = re.search(r'Error\s+code\s+[-]+>\s+(.+)', output, re.IGNORECASE)
            if error_match:
                error_code = error_match.group(1).strip()

            run_match = re.search(r'Total\s+run\s+count\s+[-]+>\s+(\d+)', output, re.IGNORECASE)
            if run_match:
                total_run_count = int(run_match.group(1))

            failure_match = re.search(r'Total\s+failure\s+count\s+[-]+>\s+(\d+)', output, re.IGNORECASE)
            if failure_match:
                total_failures = int(failure_match.group(1))

            if not error_code:
                # Output did not contain a parsable "Error code" line.
                data.append([node_name, "SSD Diag Test results are not available", "N/A", "N/A"])
                result = ERROR
                continue

            # Flag the node when the current result is not SUCCESS, or when
            # every recorded run failed (historical failures can persist even
            # though the latest status reads SUCCESS). Guard the count
            # comparison so missing counters never raise a TypeError.
            current_failed = "SUCCESS" not in error_code.upper()
            all_runs_failed = (
                total_run_count is not None
                and total_run_count > 0
                and total_failures == total_run_count
            )
            if current_failed or all_runs_failed:
                result = FAIL_O
                fault_code = "F2421" if node_id in fault_per_node else "N/A"
                data.append([
                    node_name,
                    error_code,
                    # `is not None` so a legitimate failure count of 0 is
                    # shown as 0 rather than hidden behind "N/A".
                    total_failures if total_failures is not None else "N/A",
                    fault_code,
                ])

        except pexpect.TIMEOUT:
            data.append([node_name, "SSH Timeout", "N/A", "N/A"])
            result = ERROR
        except pexpect.EOF:
            data.append([node_name, "SSH Connection Closed", "N/A", "N/A"])
            result = ERROR
        except Exception as e:
            data.append([node_name, "Error: {}".format(str(e)), "N/A", "N/A"])
            result = ERROR

    if result == PASS:
        msg = "All switches passed SSD diagnostic test 24"
        recommended_action = ""
    elif result == FAIL_O:
        msg = "SSD diagnostic test failures detected on {} switch(es)".format(len(data))
    else:
        msg = "Errors occurred while checking switches"
        recommended_action = "Review the errors and retry the check if necessary"

    return Result(
        result=result,
        msg=msg,
        headers=headers,
        data=data,
        recommended_action=recommended_action,
        doc_url=doc_url
    )

# ---- Script Execution ----


Expand Down Expand Up @@ -6115,6 +6265,7 @@ class CheckManager:
fabric_port_down_check,
equipment_disk_limits_exceeded,
apic_vmm_inventory_sync_faults_check,
switch_ssd_diag_test_check,

# Configurations
vpc_paired_switches_check,
Expand Down
45 changes: 44 additions & 1 deletion docs/docs/validations.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ Items | Faults | This Script
[Fabric Port Status][f19] | F1394: ethpm-if-port-down-fabric | :white_check_mark: | :no_entry_sign:
[Equipment Disk Limits][f20] | F1820: 80% -minor<br>F1821: -major<br>F1822: -critical | :white_check_mark: | :no_entry_sign:
[VMM Inventory Partially Synced][f21] | F0132: comp-ctrlr-operational-issues | :white_check_mark: | :no_entry_sign:

[Switch SSD read-only][f22] | F2421: eqptdiag-subj-oper-st-failed | :white_check_mark: | :no_entry_sign:

[f1]: #apic-disk-space-usage
[f2]: #standby-apic-disk-space-usage
Expand All @@ -103,6 +103,7 @@ Items | Faults | This Script
[f19]: #fabric-port-status
[f20]: #equipment-disk-limits
[f21]: #vmm-inventory-partially-synced
[f22]: #switch-ssd-diag-test-check

### Configuration Checks

Expand Down Expand Up @@ -1516,6 +1517,48 @@ EPGs using the `pre-provision` resolution immediacy do not rely on the VMM inven

This check returns a `MANUAL` result as there are many reasons for a partial inventory sync to be reported. The goal is to ensure that the VMM inventory sync has fully completed before triggering the APIC upgrade to reduce any chance for unexpected inventory changes to occur.

### Switch SSD Diag Test Check

This check validates the SSD diagnostic test result on each switch and looks for fault code F2421 on the APIC. This fault is raised when a switch SSD becomes read-only.

!!! example "Fault Example F2421"
From the APIC CLI:
```
apic1# moquery -c faultInst -f 'fault.Inst.code=="F2421"'
Total Objects shown: 1

# fault.Inst
code : F2421
ack : no
alert : no
annotation :
cause : equipment-diags-failed
changeSet : firstExecFailTs (New: 2025-11-27T10:26:33.000+00:00), lastExecFailQual (New: Failed to verify contents written to file), lastExecFailTs (New: 2025-11-27T10:26:33.000+00:00), lastExecTs (New: 2025-11-27T10:26:33.000+00:00), nextExecTs (New: 2025-11-27T10:26:33.000+00:00), numExec (New: 3), numExecFail (New: 1), operSt (New: fail), operStQual (New: Failed to verify contents written to file)
childAction :
created : 2025-11-27T10:26:33.081+00:00
delegated : no
descr : Diagnostics test failed. reason:Failed to verify contents written to file
dn : topology/pod-1/node-102/sys/diag/rule-ssd-acc-trig-forever/subj-[topology/pod-1/node-102/sys/ch/supslot-1/sup]
domain : infra
extMngdBy : undefined
highestSeverity : critical
lastTransition : 2025-11-27T10:26:33.081+00:00
lc : soaking
modTs : never
occur : 1
origSeverity : critical
prevSeverity : critical
rn : fault-F2421
rule : eqptdiag-subj-oper-st-failed
severity : critical
status :
subject : oper-state-failed
title :
type : operational
uid :
userdom : all
```

## Configuration Check Details

### VPC-paired Leaf switches
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{
"fabricNode": {
"attributes": {
"address": "10.0.0.1",
"dn": "topology/pod-1/node-1",
"fabricSt": "commissioned",
"id": "1",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic1",
"nodeType": "unspecified",
"role": "controller"
}
}
},
{
"fabricNode": {
"attributes": {
"address": "10.0.0.2",
"dn": "topology/pod-1/node-2",
"fabricSt": "commissioned",
"id": "2",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic2",
"nodeType": "unspecified",
"role": "controller"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{
"fabricNode": {
"attributes": {
"address": "10.0.0.1",
"dn": "topology/pod-1/node-1",
"fabricSt": "commissioned",
"id": "1",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic1",
"nodeType": "unspecified",
"role": "controller"
}
}
},
{
"fabricNode": {
"attributes": {
"address": "10.0.0.101",
"dn": "topology/pod-1/node-101",
"fabricSt": "active",
"id": "101",
"model": "N9K-C93180YC-EX",
"monPolDn": "uni/fabric/monfab-default",
"name": "leaf101",
"nodeType": "unspecified",
"role": "leaf"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{
"fabricNode": {
"attributes": {
"address": "10.0.0.1",
"dn": "topology/pod-1/node-1",
"fabricSt": "commissioned",
"id": "1",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic1",
"nodeType": "unspecified",
"role": "controller"
}
}
},
{
"fabricNode": {
"attributes": {
"address": "10.0.0.102",
"dn": "topology/pod-1/node-102",
"fabricSt": "active",
"id": "102",
"model": "N9K-C93180YC-FX",
"monPolDn": "uni/fabric/monfab-default",
"name": "leaf102",
"nodeType": "unspecified",
"role": "leaf"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
[
{
"fabricNode": {
"attributes": {
"address": "10.0.0.1",
"dn": "topology/pod-1/node-1",
"fabricSt": "commissioned",
"id": "1",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic1",
"nodeType": "unspecified",
"role": "controller"
}
}
},
{
"fabricNode": {
"attributes": {
"address": "10.0.0.101",
"dn": "topology/pod-1/node-101",
"fabricSt": "active",
"id": "101",
"model": "N9K-C93180YC-EX",
"monPolDn": "uni/fabric/monfab-default",
"name": "leaf101",
"nodeType": "unspecified",
"role": "leaf"
}
}
},
{
"fabricNode": {
"attributes": {
"address": "10.0.0.102",
"dn": "topology/pod-1/node-102",
"fabricSt": "active",
"id": "102",
"model": "N9K-C93180YC-FX",
"monPolDn": "uni/fabric/monfab-default",
"name": "leaf102",
"nodeType": "unspecified",
"role": "leaf"
}
}
}
]
Loading