Files
modelgrid/ts/hardware/gpu-detector.ts

566 lines
17 KiB
TypeScript
Raw Permalink Normal View History

2026-01-30 03:16:57 +00:00
/**
* GPU Detector
*
* Detects GPUs on the system (NVIDIA, AMD, Intel Arc) and retrieves their information.
*/
import { exec } from 'node:child_process';
import { promisify } from 'node:util';
import * as fs from 'node:fs';
import type { IGpuInfo, IGpuStatus, TGpuVendor } from '../interfaces/gpu.ts';
import { logger } from '../logger.ts';
import { TIMING } from '../constants.ts';
// Promise-based wrapper around child_process.exec; callers in this file await it
// and read `stdout` from the resolved value.
const execAsync = promisify(exec);
/**
 * GPU Detector class for detecting and querying GPU information.
 *
 * Detection shells out to vendor tools (`nvidia-smi`, `rocm-smi`, `xpu-smi`) and
 * falls back to `lspci` when they are unavailable. Every external command is
 * best-effort: failures are swallowed and reported as "no GPUs" / zeroed status
 * rather than thrown to the caller.
 */
export class GpuDetector {
  /** Conversion factors from a recognised unit to megabytes (binary multiples). */
  private static readonly MEGABYTES_PER_UNIT: Readonly<Record<string, number>> = {
    B: 1 / (1024 * 1024),
    KB: 1 / 1024,
    MB: 1,
    GB: 1024,
    TB: 1024 * 1024,
  };

  private cachedGpus: IGpuInfo[] | null = null;
  private cacheTime: number = 0;
  private readonly cacheDuration = TIMING.GPU_DETECTION_TIMEOUT_MS;

  /**
   * Detect all GPUs on the system
   * @param forceRefresh Force refresh even if cache is valid
   * @returns Array of detected GPU information (NVIDIA first, then AMD, then Intel)
   */
  public async detectGpus(forceRefresh: boolean = false): Promise<IGpuInfo[]> {
    // Return cached data if still valid
    if (!forceRefresh && this.cachedGpus && Date.now() - this.cacheTime < this.cacheDuration) {
      return this.cachedGpus;
    }

    // The vendor probes are independent and never reject, so run them concurrently.
    const [nvidiaGpus, amdGpus, intelGpus] = await Promise.all([
      this.detectNvidiaGpus(),
      this.detectAmdGpus(),
      this.detectIntelGpus(),
    ]);
    const gpus: IGpuInfo[] = [...nvidiaGpus, ...amdGpus, ...intelGpus];

    // If no GPUs found via specific tools, try generic detection
    if (gpus.length === 0) {
      gpus.push(...(await this.detectGenericGpus()));
    }

    // Update cache
    this.cachedGpus = gpus;
    this.cacheTime = Date.now();
    return gpus;
  }

  /**
   * Detect NVIDIA GPUs using nvidia-smi
   * @returns Detected NVIDIA GPUs, or an empty array when nvidia-smi fails
   */
  private async detectNvidiaGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];
    try {
      const { stdout } = await execAsync(
        'nvidia-smi --query-gpu=index,gpu_uuid,name,memory.total,driver_version,pci.bus_id,compute_cap --format=csv,noheader,nounits',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );
      const lines = stdout.trim().split('\n').filter((line: string) => line.trim());
      for (const line of lines) {
        const parts = line.split(',').map((p: string) => p.trim());
        if (parts.length >= 7) {
          // Second column (gpu_uuid) is queried but not used.
          const [index, , name, memory, driver, pciId, computeCap] = parts;
          gpus.push({
            id: `nvidia-${index}`,
            vendor: 'nvidia',
            model: name,
            // Already in MB; non-numeric values such as "[N/A]" become 0 instead of NaN
            vram: parseInt(memory, 10) || 0,
            driverVersion: driver,
            computeCapability: computeCap,
            pciSlot: this.extractPciSlot(pciId),
            pciBusId: pciId,
            index: parseInt(index, 10),
          });
        }
      }

      // Get CUDA version separately
      if (gpus.length > 0) {
        const cudaVersion = await this.getCudaVersion();
        if (cudaVersion) {
          for (const gpu of gpus) {
            gpu.cudaVersion = cudaVersion;
          }
        }
      }
    } catch {
      // nvidia-smi not available or failed
      logger.dim('NVIDIA GPU detection: nvidia-smi not available');
    }
    return gpus;
  }

  /**
   * Get the CUDA version.
   *
   * Prefers the toolkit version reported by `nvcc --version` ("release X.Y") and
   * falls back to the "CUDA Version: X.Y" field in the plain `nvidia-smi` header.
   * Both patterns are anchored on their label so that the driver version
   * (e.g. "535.129.03") can never be mistaken for a CUDA version.
   * @returns The CUDA version as "major.minor", or undefined if it cannot be determined
   */
  private async getCudaVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync('nvcc --version 2>/dev/null', { timeout: 5000 });
      const match = stdout.match(/release\s+(\d+\.\d+)/i);
      if (match) {
        return match[1];
      }
    } catch {
      // nvcc not installed, fall back to nvidia-smi below
    }
    try {
      const { stdout } = await execAsync('nvidia-smi', { timeout: 5000 });
      const match = stdout.match(/CUDA Version:\s*(\d+\.\d+)/i);
      return match ? match[1] : undefined;
    } catch {
      // CUDA version detection failed, that's okay
      return undefined;
    }
  }

  /**
   * Detect AMD GPUs using rocm-smi, falling back to lspci (plus sysfs for VRAM)
   * @returns Detected AMD GPUs, or an empty array when both methods fail
   */
  private async detectAmdGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];
    try {
      // Try rocm-smi first
      const { stdout } = await execAsync(
        'rocm-smi --showproductname --showmeminfo vram --showdriverversion --showbus --csv 2>/dev/null || rocm-smi -a --json 2>/dev/null',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );
      // Parse rocm-smi output
      if (stdout.includes('{')) {
        // JSON output
        const data = JSON.parse(stdout);
        let rocmVersion: string | undefined;
        let index = 0;
        for (const [key, value] of Object.entries(data)) {
          if (key.startsWith('card')) {
            if (index === 0) {
              // Same for every card: look it up once, and only if a card exists.
              rocmVersion = await this.getRocmVersion();
            }
            const cardData = value as Record<string, unknown>;
            gpus.push({
              id: `amd-${index}`,
              vendor: 'amd',
              model: String(cardData['Card series'] || cardData['card_series'] || 'AMD GPU'),
              vram: this.parseMemory(String(cardData['VRAM Total Memory (B)'] || cardData['vram_total'] || '0')),
              driverVersion: String(cardData['Driver version'] || cardData['driver_version'] || ''),
              rocmVersion,
              pciSlot: String(cardData['PCI Bus'] || cardData['pci_bus'] || ''),
              pciBusId: String(cardData['PCI Bus'] || cardData['pci_bus'] || ''),
              index: index++,
            });
          }
        }
      } else {
        // CSV output - parse line by line
        const lines = stdout.trim().split('\n');
        let index = 0;
        for (const line of lines) {
          if (line.includes('GPU') || line.includes('Radeon') || line.includes('AMD')) {
            // This is a GPU entry
            gpus.push({
              id: `amd-${index}`,
              vendor: 'amd',
              model: line.trim(),
              vram: 0, // Will need additional parsing
              pciSlot: '',
              index: index++,
            });
          }
        }
      }
    } catch {
      // rocm-smi not available (or its output was unparsable), try lspci
      try {
        const { stdout: lspciOut } = await execAsync(
          'lspci -nn | grep -i "VGA\\|3D\\|Display" | grep -i "AMD\\|ATI\\|Radeon"',
          { timeout: 5000 },
        );
        const lines = lspciOut.trim().split('\n').filter((l: string) => l.trim());
        let index = 0;
        for (const line of lines) {
          const device = this.parseLspciLine(line);
          if (device) {
            gpus.push({
              id: `amd-${index}`,
              vendor: 'amd',
              model: device.model,
              vram: await this.getAmdVramFromSysfs(device.slot),
              pciSlot: device.slot,
              pciBusId: device.slot,
              index: index++,
            });
          }
        }
      } catch {
        logger.dim('AMD GPU detection: rocm-smi and lspci detection failed');
      }
    }
    return gpus;
  }

  /**
   * Detect Intel GPUs using xpu-smi, falling back to lspci (Arc devices only)
   * @returns Detected Intel GPUs, or an empty array when both methods fail
   */
  private async detectIntelGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];
    try {
      // Try xpu-smi first (for Intel Arc GPUs)
      const { stdout } = await execAsync(
        'xpu-smi discovery --json 2>/dev/null',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );
      const data = JSON.parse(stdout);
      if (data.device_list) {
        let oneApiVersion: string | undefined;
        let index = 0;
        for (const device of data.device_list) {
          if (index === 0) {
            // Same for every device: look it up once, and only if a device exists.
            oneApiVersion = await this.getOneApiVersion();
          }
          gpus.push({
            id: `intel-${index}`,
            vendor: 'intel',
            model: device.device_name || 'Intel GPU',
            vram: device.memory_physical_size_byte
              ? Math.round(device.memory_physical_size_byte / (1024 * 1024))
              : 0,
            oneApiVersion,
            pciSlot: device.pci_bdf || '',
            pciBusId: device.pci_bdf || '',
            index: index++,
          });
        }
      }
    } catch {
      // xpu-smi not available (or its output was unparsable), try lspci
      try {
        const { stdout: lspciOut } = await execAsync(
          'lspci -nn | grep -i "VGA\\|3D\\|Display" | grep -i "Intel.*Arc\\|Intel.*Graphics"',
          { timeout: 5000 },
        );
        const lines = lspciOut.trim().split('\n').filter((l: string) => l.trim());
        let index = 0;
        for (const line of lines) {
          // Skip integrated graphics, only look for discrete Arc GPUs
          if (!line.toLowerCase().includes('arc')) {
            continue;
          }
          const device = this.parseLspciLine(line);
          if (device) {
            gpus.push({
              id: `intel-${index}`,
              vendor: 'intel',
              model: device.model,
              vram: 0, // Intel Arc VRAM detection needs sysfs
              pciSlot: device.slot,
              pciBusId: device.slot,
              index: index++,
            });
          }
        }
      } catch {
        logger.dim('Intel GPU detection: xpu-smi and lspci detection failed');
      }
    }
    return gpus;
  }

  /**
   * Generic GPU detection using lspci
   * @returns All display-class PCI devices, with the vendor guessed from the model string
   */
  private async detectGenericGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];
    try {
      const { stdout } = await execAsync(
        'lspci -nn | grep -i "VGA\\|3D\\|Display"',
        { timeout: 5000 },
      );
      const lines = stdout.trim().split('\n').filter((l: string) => l.trim());
      let index = 0;
      for (const line of lines) {
        const device = this.parseLspciLine(line);
        if (device) {
          const model = device.model;
          let vendor: TGpuVendor = 'unknown';
          if (/nvidia/i.test(model)) vendor = 'nvidia';
          else if (/amd|ati|radeon/i.test(model)) vendor = 'amd';
          else if (/intel/i.test(model)) vendor = 'intel';
          gpus.push({
            id: `gpu-${index}`,
            vendor,
            model,
            vram: 0,
            pciSlot: device.slot,
            pciBusId: device.slot,
            index: index++,
          });
        }
      }
    } catch {
      logger.dim('Generic GPU detection: lspci not available');
    }
    return gpus;
  }

  /**
   * Get real-time status for a specific GPU
   * @param gpuId Identifier as found in the `id` field of a detected GPU
   * @returns The GPU status, or null when no detected GPU has that id
   */
  public async getGpuStatus(gpuId: string): Promise<IGpuStatus | null> {
    const gpus = await this.detectGpus();
    const gpu = gpus.find((g) => g.id === gpuId);
    return gpu ? this.queryGpuStatus(gpu) : null;
  }

  /**
   * Get real-time status for all GPUs
   * @returns Map from GPU id to its status, in detection order
   */
  public async getAllGpuStatus(): Promise<Map<string, IGpuStatus>> {
    const gpus = await this.detectGpus();
    // Query each GPU directly instead of re-running detection and lookup per id.
    const results = await Promise.all(gpus.map((gpu) => this.queryGpuStatus(gpu)));
    const statuses = new Map<string, IGpuStatus>();
    gpus.forEach((gpu, position) => {
      statuses.set(gpu.id, results[position]);
    });
    return statuses;
  }

  /**
   * Dispatch a status query to the vendor-specific implementation.
   * Unknown vendors get a zeroed status that only carries the known VRAM size.
   */
  private async queryGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    if (gpu.vendor === 'nvidia') {
      return this.getNvidiaGpuStatus(gpu);
    } else if (gpu.vendor === 'amd') {
      return this.getAmdGpuStatus(gpu);
    } else if (gpu.vendor === 'intel') {
      return this.getIntelGpuStatus(gpu);
    }
    // Unknown vendor - return basic status
    return this.createEmptyStatus(gpu);
  }

  /**
   * Build the fallback status used when live metrics are unavailable.
   */
  private createEmptyStatus(gpu: IGpuInfo): IGpuStatus {
    return {
      id: gpu.id,
      utilization: 0,
      memoryUsed: 0,
      memoryTotal: gpu.vram,
      memoryPercent: 0,
      temperature: 0,
      powerUsage: 0,
      powerLimit: 0,
      lastUpdate: Date.now(),
    };
  }

  /**
   * Get NVIDIA GPU status using nvidia-smi
   * @returns Live metrics, or a zeroed status when nvidia-smi fails
   */
  private async getNvidiaGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,clocks.gr,clocks.mem --format=csv,noheader,nounits -i ${gpu.index}`,
        { timeout: 5000 },
      );
      const parts = stdout.trim().split(',').map((p: string) => p.trim());
      const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] = parts;
      const memoryUsed = parseInt(memUsed, 10) || 0;
      const memoryTotal = parseInt(memTotal, 10) || gpu.vram;
      return {
        id: gpu.id,
        utilization: parseInt(utilization, 10) || 0,
        memoryUsed,
        memoryTotal,
        // Guard against a zero/unknown total, which would otherwise yield NaN or Infinity
        memoryPercent: memoryTotal > 0 ? Math.round((memoryUsed / memoryTotal) * 100) : 0,
        temperature: parseInt(temp, 10) || 0,
        powerUsage: parseFloat(power) || 0,
        powerLimit: parseFloat(powerLimit) || 0,
        fanSpeed: this.parseOptionalInt(fan),
        gpuClock: this.parseOptionalInt(gpuClock),
        memoryClock: this.parseOptionalInt(memClock),
        lastUpdate: Date.now(),
      };
    } catch {
      return this.createEmptyStatus(gpu);
    }
  }

  /**
   * Get AMD GPU status using rocm-smi
   * @returns Live metrics, or a zeroed status when rocm-smi fails
   */
  private async getAmdGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `rocm-smi -d ${gpu.index} --showuse --showmemuse --showtemp --showpower --json 2>/dev/null`,
        { timeout: 5000 },
      );
      const data = JSON.parse(stdout);
      const cardKey = `card${gpu.index}`;
      const cardData = data[cardKey] || {};
      const memoryPercent = Math.trunc(this.toNumber(cardData['GPU memory use (%)']));
      return {
        id: gpu.id,
        utilization: Math.trunc(this.toNumber(cardData['GPU use (%)'])),
        // Only a percentage is read here, so derive the absolute figure (MB) from total VRAM
        memoryUsed: Math.round((gpu.vram * memoryPercent) / 100) || 0,
        memoryTotal: gpu.vram,
        memoryPercent,
        temperature: this.toNumber(cardData['Temperature (Sensor edge) (C)']),
        powerUsage: this.toNumber(cardData['Average Graphics Package Power (W)']),
        powerLimit: this.toNumber(cardData['Max Graphics Package Power (W)']),
        lastUpdate: Date.now(),
      };
    } catch {
      return this.createEmptyStatus(gpu);
    }
  }

  /**
   * Get Intel GPU status using xpu-smi
   * @returns Live metrics, or a zeroed status when xpu-smi fails
   */
  private async getIntelGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `xpu-smi stats -d ${gpu.index} --json 2>/dev/null`,
        { timeout: 5000 },
      );
      const data = JSON.parse(stdout);
      const stats = data.device_level || {};
      return {
        id: gpu.id,
        utilization: Math.round(this.toNumber(stats.gpu_utilization)),
        memoryUsed: Math.round(this.toNumber(stats.memory_used) / (1024 * 1024)),
        memoryTotal: gpu.vram,
        memoryPercent: Math.round(this.toNumber(stats.memory_utilization)),
        temperature: this.toNumber(stats.gpu_temperature),
        powerUsage: this.toNumber(stats.power),
        powerLimit: 0, // Intel doesn't expose this easily
        lastUpdate: Date.now(),
      };
    } catch {
      return this.createEmptyStatus(gpu);
    }
  }

  /**
   * Helper to split one `lspci -nn` line into its slot address and device description.
   * @returns The slot and model, or null when the line does not have the expected shape
   */
  private parseLspciLine(line: string): { slot: string; model: string } | null {
    const match = line.match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
    return match ? { slot: match[1], model: match[2].trim() } : null;
  }

  /**
   * Helper to parse an optional integer metric.
   * @returns The integer, or undefined for placeholders such as "[N/A]" or a missing column
   */
  private parseOptionalInt(value: string | undefined): number | undefined {
    const parsed = parseInt(value ?? '', 10);
    return Number.isNaN(parsed) ? undefined : parsed;
  }

  /**
   * Helper to coerce a loosely-typed metric (string or number) to a finite number.
   * @returns The numeric value, or 0 when it is missing or not numeric
   */
  private toNumber(value: unknown): number {
    const parsed = parseFloat(String(value ?? ''));
    return Number.isFinite(parsed) ? parsed : 0;
  }

  /**
   * Helper to extract PCI slot from full bus ID
   */
  private extractPciSlot(pciId: string): string {
    // Input: "00000000:01:00.0" -> Output: "01:00.0"
    const match = pciId.match(/([0-9a-f]+:[0-9a-f]+\.[0-9a-f]+)$/i);
    return match ? match[1] : pciId;
  }

  /**
   * Helper to parse memory values with units
   *
   * Accepts B, KB, MB, GB, TB and their IEC spellings (KiB, MiB, GiB, TiB),
   * case-insensitively. A bare number is interpreted as bytes.
   * @param value Text containing a number optionally followed by a unit
   * @returns The amount in whole megabytes (rounded), or 0 when no number is found
   */
  private parseMemory(value: string): number {
    const match = value.match(/(\d+(?:\.\d+)?)\s*([KMGT]i?B|B)?/i);
    if (!match) return 0;
    const amount = parseFloat(match[1]);
    // Normalise "GiB" -> "GB" etc.; every unit here is treated as a power of 1024 anyway.
    const unit = (match[2] || 'B').toUpperCase().replace('I', '');
    const factor = GpuDetector.MEGABYTES_PER_UNIT[unit] ?? GpuDetector.MEGABYTES_PER_UNIT.B;
    return Math.round(amount * factor);
  }

  /**
   * Get AMD VRAM from sysfs (async)
   * @param pciBusId PCI address as printed by lspci, with or without a leading domain
   * @returns VRAM size in MB, or 0 when the sysfs entry is missing or unreadable
   */
  private async getAmdVramFromSysfs(pciBusId: string): Promise<number> {
    try {
      // lspci omits the PCI domain when it is 0000; only add it when it is absent.
      const deviceAddress = /^[0-9a-f]{4}:/i.test(pciBusId) ? pciBusId : `0000:${pciBusId}`;
      const sysfsPath = `/sys/bus/pci/devices/${deviceAddress}/mem_info_vram_total`;
      // Reading directly (instead of checking access first) covers the missing-file case too.
      const content = await fs.promises.readFile(sysfsPath, 'utf8');
      const bytes = parseInt(content.trim(), 10);
      return Number.isFinite(bytes) ? Math.round(bytes / (1024 * 1024)) : 0;
    } catch {
      // sysfs not available
      return 0;
    }
  }

  /**
   * Get ROCm version
   * @returns The first version-like number in the command output, or undefined
   */
  private async getRocmVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000 }, // bounded like every other probe so a hung tool cannot stall detection
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Get oneAPI version
   * @returns The first version-like number in the command output, or undefined
   */
  private async getOneApiVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION',
        { timeout: 5000 }, // bounded like every other probe so a hung tool cannot stall detection
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }
}