
#!/bin/bash
# Purpose: scan every host in the Ansible inventory and print the hostname,
# serial number (SN) and details for each node that reports a non-zero NPU ECC
# error counter.
#
# Each host is queried with a single ad-hoc ansible call (SN and ECC together),
# that call is bounded by a timeout, and progress is printed per host.

readonly INVENTORY="all.ini"
readonly ANSIBLE_TIMEOUT=30 # wall-clock limit for each per-host ansible call (seconds)
# Intended concurrency if a parallel mode is added; the scan below is
# sequential and does not read this value.
# shellcheck disable=SC2034
readonly ANSIBLE_PARALLEL=10

# Labels of "<label> : <number>" lines that identify a device instead of
# counting errors. Used as an awk ERE matched against the whole trimmed label.
# NOTE(review): assumed from typical `npu-smi info -t ecc` output -- confirm on
# a real node and extend if other numeric, non-counter fields show up.
readonly ECC_IGNORED_LABELS='NPU ID|Chip ID|Chip Count'

# Shell snippet executed on each node. It prints one "SN: <serial>" line
# ("N/A" when dmidecode fails) followed by the ECC report of NPU indices 0..7.
# Single quotes keep the $(...) and $i for the remote shell to expand.
# shellcheck disable=SC2016
readonly REMOTE_CMD='
echo "SN:" $(dmidecode -s system-serial-number 2>/dev/null || echo "N/A")
for i in $(seq 0 7); do
  npu-smi info -t ecc -i $i -c 0
done
'

#######################################
# Print the serial number contained in a node report.
# Inputs:  node report on stdin
# Outputs: the text following the first "SN:" prefix, trimmed; "N/A" when no
#          such line exists or it carries no value
#######################################
extract_sn() {
  awk '
    /^SN:/ {
      sub(/^SN:[ \t]*/, "")
      sub(/[ \t\r]+$/, "")
      sn = $0
      exit
    }
    END { print (sn == "" ? "N/A" : sn) }
  '
}

#######################################
# Print the counter lines of a node report whose value is not zero.
# A counter line has the form "<label> : <integer>". The SN line, blank lines,
# lines without a trailing integer, and labels matching ECC_IGNORED_LABELS are
# dropped.
# Globals: ECC_IGNORED_LABELS (read)
# Inputs:  node report on stdin
# Outputs: matching lines with surrounding whitespace removed, one per line;
#          nothing when every counter is zero
#######################################
extract_nonzero_ecc() {
  awk -v ignored="^(${ECC_IGNORED_LABELS})\$" '
    /^SN:/ { next }
    {
      sub(/\r$/, "")
      if (!match($0, /:[ \t]*[0-9]+[ \t]*$/)) next
      label = substr($0, 1, RSTART - 1)
      value = substr($0, RSTART + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", label)
      gsub(/[ \t]/, "", value)
      # value + 0 forces a numeric comparison so "000" also counts as zero
      if (label ~ ignored || value + 0 == 0) next
      sub(/^[ \t]+/, "")
      sub(/[ \t]+$/, "")
      print
    }
  '
}

#######################################
# Walk the inventory and report hosts with non-zero ECC counters.
# Globals: INVENTORY, ANSIBLE_TIMEOUT, REMOTE_CMD (read)
# Outputs: progress and findings on stdout, fatal errors on stderr
# Returns: 0 after a completed scan, 1 when the inventory yields no hosts
#######################################
main() {
  local -a hosts=()
  local host output sn ecc_lines

  echo "正在扫描所有节点的 NPU ECC 状态(只显示非零错误)..."
  echo "注意:每个节点超时设为 ${ANSIBLE_TIMEOUT}s"

  # Host list without the header line that `--list-hosts` prints first; an
  # array keeps every host a separate word regardless of IFS.
  mapfile -t hosts < <(
    ansible -i "$INVENTORY" all --list-hosts | awk 'NR > 1 && NF { print $1 }'
  )
  if ((${#hosts[@]} == 0)); then
    echo "错误:未能从 ${INVENTORY} 获取到任何主机" >&2
    return 1
  fi

  for host in "${hosts[@]}"; do
    printf '检查 %s ... ' "$host"

    # One ansible call fetches both SN and ECC data. stderr is discarded on
    # purpose: an unreachable node is reported through the one-line status
    # below. A non-zero status covers the timeout as well as any ansible or
    # remote command failure.
    if ! output=$(timeout "$ANSIBLE_TIMEOUT" \
      ansible -i "$INVENTORY" "$host" -m shell -a "$REMOTE_CMD" 2>/dev/null) ||
      [[ -z "$output" ]]; then
      echo "超时或无响应,跳过"
      continue
    fi

    sn=$(extract_sn <<<"$output")
    ecc_lines=$(extract_nonzero_ecc <<<"$output")

    if [[ -n "$ecc_lines" ]]; then
      echo "发现错误!"
      echo "================================"
      echo "Host: $host"
      echo "SN: $sn"
      echo "非零 ECC 计数:"
      echo "$ecc_lines"
      echo "================================"
    else
      echo "正常(无错误)"
    fi
  done

  echo "扫描完成。"
}

main "$@"