566 lines
17 KiB
TypeScript
566 lines
17 KiB
TypeScript
/**
 * GPU Detector
 *
 * Detects GPUs on the system (NVIDIA, AMD, Intel Arc) and retrieves their information.
 */
|
|
|
|
import { exec } from 'node:child_process';
|
|
import { promisify } from 'node:util';
|
|
import * as fs from 'node:fs';
|
|
import type { IGpuInfo, IGpuStatus, TGpuVendor } from '../interfaces/gpu.ts';
|
|
import { logger } from '../logger.ts';
|
|
import { TIMING } from '../constants.ts';
|
|
|
|
// Promise-based wrapper around child_process.exec; callers below await it and read `stdout` from the result.
const execAsync = promisify(exec);
|
|
|
|
/**
 * Detects GPUs by shelling out to vendor tools (`nvidia-smi`, `rocm-smi`,
 * `xpu-smi`) with `lspci` as a fallback, and reports live status per GPU.
 *
 * Detection results are cached in memory; pass `forceRefresh` to
 * `detectGpus` to bypass the cache. Every probe is best-effort: a missing or
 * failing tool yields an empty result (or a zeroed status), never a rejection.
 */
export class GpuDetector {
  /** Result of the most recent detection, or `null` before the first run. */
  private cachedGpus: IGpuInfo[] | null = null;

  /** `Date.now()` timestamp taken when `cachedGpus` was last written. */
  private cacheTime: number = 0;

  /**
   * Cache lifetime in milliseconds.
   * NOTE(review): this reuses the detection *timeout* constant — confirm that is intended.
   */
  private readonly cacheDuration = TIMING.GPU_DETECTION_TIMEOUT_MS;

  /**
   * Detect all GPUs on the system.
   *
   * The three vendor probes are independent and never reject, so they run
   * concurrently; results keep the NVIDIA, AMD, Intel order. Generic `lspci`
   * detection is only attempted when no vendor probe found anything.
   *
   * @param forceRefresh Force refresh even if cache is valid
   * @returns Array of detected GPU information
   */
  public async detectGpus(forceRefresh: boolean = false): Promise<IGpuInfo[]> {
    if (!forceRefresh && this.cachedGpus && Date.now() - this.cacheTime < this.cacheDuration) {
      return this.cachedGpus;
    }

    const [nvidiaGpus, amdGpus, intelGpus] = await Promise.all([
      this.detectNvidiaGpus(),
      this.detectAmdGpus(),
      this.detectIntelGpus(),
    ]);
    const gpus: IGpuInfo[] = [...nvidiaGpus, ...amdGpus, ...intelGpus];

    if (gpus.length === 0) {
      gpus.push(...(await this.detectGenericGpus()));
    }

    this.cachedGpus = gpus;
    this.cacheTime = Date.now();
    return gpus;
  }

  /**
   * Detect NVIDIA GPUs using nvidia-smi.
   *
   * @returns One entry per CSV row with a numeric index; empty when nvidia-smi fails.
   */
  private async detectNvidiaGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];

    try {
      const { stdout } = await execAsync(
        'nvidia-smi --query-gpu=index,gpu_uuid,name,memory.total,driver_version,pci.bus_id,compute_cap --format=csv,noheader,nounits',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );

      for (const line of stdout.split('\n')) {
        const parts = line.split(',').map((p: string) => p.trim());
        if (parts.length < 7) {
          continue;
        }

        const [index, , name, memory, driver, pciId, computeCap] = parts;
        const gpuIndex = parseInt(index, 10);
        if (Number.isNaN(gpuIndex)) {
          // Not a data row; the index is later passed to `nvidia-smi -i`.
          continue;
        }

        gpus.push({
          id: `nvidia-${gpuIndex}`,
          vendor: 'nvidia',
          model: name,
          vram: parseInt(memory, 10) || 0, // nounits => MiB; "[N/A]" becomes 0 instead of NaN
          driverVersion: driver,
          computeCapability: computeCap,
          pciSlot: this.extractPciSlot(pciId),
          pciBusId: pciId,
          index: gpuIndex,
        });
      }

      if (gpus.length > 0) {
        const cudaVersion = await this.getCudaVersion();
        if (cudaVersion) {
          for (const gpu of gpus) {
            gpu.cudaVersion = cudaVersion;
          }
        }
      }
    } catch {
      // nvidia-smi not available or failed
      logger.dim('NVIDIA GPU detection: nvidia-smi not available');
    }

    return gpus;
  }

  /**
   * Determine the CUDA version.
   *
   * Prefers the toolkit release printed by `nvcc --version`; when nvcc is not
   * available, falls back to the "CUDA Version" field of the `nvidia-smi`
   * banner. The driver version is deliberately never parsed here, because it
   * has the same `digits.digits` shape as a CUDA version.
   *
   * @returns `major.minor`, or `undefined` when neither source yields one.
   */
  private async getCudaVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync('nvcc --version 2>/dev/null', { timeout: 5000 });
      const release = stdout.match(/release\s+(\d+\.\d+)/);
      if (release) {
        return release[1];
      }
    } catch {
      // nvcc not installed; try the nvidia-smi banner instead
    }

    try {
      const { stdout } = await execAsync('nvidia-smi', { timeout: 5000 });
      const banner = stdout.match(/CUDA Version:\s*(\d+\.\d+)/);
      return banner ? banner[1] : undefined;
    } catch {
      // CUDA version detection failed, that's okay
      return undefined;
    }
  }

  /**
   * Detect AMD GPUs using rocm-smi, falling back to lspci when rocm-smi
   * fails or its output cannot be parsed.
   */
  private async detectAmdGpus(): Promise<IGpuInfo[]> {
    try {
      return await this.detectAmdViaRocmSmi();
    } catch {
      // rocm-smi not available, try lspci
      try {
        return await this.detectAmdViaLspci();
      } catch {
        logger.dim('AMD GPU detection: rocm-smi and lspci detection failed');
        return [];
      }
    }
  }

  /**
   * Parse rocm-smi output (JSON when the output contains `{`, otherwise CSV).
   * Builds its own list, so a parse failure never leaks partial entries.
   */
  private async detectAmdViaRocmSmi(): Promise<IGpuInfo[]> {
    const { stdout } = await execAsync(
      'rocm-smi --showproductname --showmeminfo vram --showdriverversion --showbus --csv 2>/dev/null || rocm-smi -a --json 2>/dev/null',
      { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
    );
    const gpus: IGpuInfo[] = [];

    if (stdout.includes('{')) {
      const data = JSON.parse(stdout) as Record<string, unknown>;
      const cards = Object.entries(data).filter(([key]) => key.startsWith('card'));
      // The ROCm version is the same for every card: query it once, not per card.
      const rocmVersion = cards.length > 0 ? await this.getRocmVersion() : undefined;

      cards.forEach(([, value], index) => {
        const card = value as Record<string, unknown>;
        const pciBus = String(card['PCI Bus'] || card['pci_bus'] || '');
        gpus.push({
          id: `amd-${index}`,
          vendor: 'amd',
          model: String(card['Card series'] || card['card_series'] || 'AMD GPU'),
          vram: this.parseMemory(String(card['VRAM Total Memory (B)'] || card['vram_total'] || '0')),
          driverVersion: String(card['Driver version'] || card['driver_version'] || ''),
          rocmVersion,
          pciSlot: pciBus,
          pciBusId: pciBus,
          index,
        });
      });
      return gpus;
    }

    // CSV output: keep every line that looks like a GPU entry.
    for (const line of stdout.trim().split('\n')) {
      if (line.includes('GPU') || line.includes('Radeon') || line.includes('AMD')) {
        gpus.push({
          id: `amd-${gpus.length}`,
          vendor: 'amd',
          model: line.trim(),
          vram: 0, // Will need additional parsing
          pciSlot: '',
          index: gpus.length,
        });
      }
    }
    return gpus;
  }

  /**
   * Find AMD display devices via lspci.
   *
   * The shell grep is case-insensitive and unanchored, so "ATI" also matches
   * "Corporation"; each line is therefore re-checked with `classifyVendor`.
   */
  private async detectAmdViaLspci(): Promise<IGpuInfo[]> {
    const { stdout } = await execAsync(
      'lspci -nn | grep -i "VGA\\|3D\\|Display" | grep -i "AMD\\|ATI\\|Radeon"',
      { timeout: 5000 },
    );
    const gpus: IGpuInfo[] = [];

    for (const line of stdout.trim().split('\n')) {
      const device = this.parseLspciLine(line);
      if (!device || this.classifyVendor(device.model) !== 'amd') {
        continue;
      }
      gpus.push({
        id: `amd-${gpus.length}`,
        vendor: 'amd',
        model: device.model,
        vram: await this.getAmdVramFromSysfs(device.slot),
        pciSlot: device.slot,
        pciBusId: device.slot,
        index: gpus.length,
      });
    }
    return gpus;
  }

  /**
   * Detect Intel GPUs using xpu-smi, falling back to lspci (Arc only) when
   * xpu-smi fails or its output cannot be parsed.
   */
  private async detectIntelGpus(): Promise<IGpuInfo[]> {
    try {
      return await this.detectIntelViaXpuSmi();
    } catch {
      // xpu-smi not available, try lspci
      try {
        return await this.detectIntelViaLspci();
      } catch {
        logger.dim('Intel GPU detection: xpu-smi and lspci detection failed');
        return [];
      }
    }
  }

  /** Parse the `device_list` array of `xpu-smi discovery --json`. */
  private async detectIntelViaXpuSmi(): Promise<IGpuInfo[]> {
    const { stdout } = await execAsync(
      'xpu-smi discovery --json 2>/dev/null',
      { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
    );

    const data = JSON.parse(stdout);
    const devices: Array<Record<string, unknown>> = Array.isArray(data.device_list) ? data.device_list : [];
    if (devices.length === 0) {
      return [];
    }

    // Same for every device: query once instead of once per device.
    const oneApiVersion = await this.getOneApiVersion();

    return devices.map((device, index): IGpuInfo => {
      const bdf = String(device['pci_bdf'] || '');
      const bytes = Number(device['memory_physical_size_byte']);
      return {
        id: `intel-${index}`,
        vendor: 'intel',
        model: String(device['device_name'] || 'Intel GPU'),
        vram: bytes > 0 ? Math.round(bytes / (1024 * 1024)) : 0,
        oneApiVersion,
        pciSlot: bdf,
        pciBusId: bdf,
        index,
      };
    });
  }

  /** Find discrete Intel Arc devices via lspci; integrated graphics are skipped. */
  private async detectIntelViaLspci(): Promise<IGpuInfo[]> {
    const { stdout } = await execAsync(
      'lspci -nn | grep -i "VGA\\|3D\\|Display" | grep -i "Intel.*Arc\\|Intel.*Graphics"',
      { timeout: 5000 },
    );
    const gpus: IGpuInfo[] = [];

    for (const line of stdout.trim().split('\n')) {
      // Skip integrated graphics, only look for discrete Arc GPUs
      if (!line.toLowerCase().includes('arc')) {
        continue;
      }
      const device = this.parseLspciLine(line);
      if (device) {
        gpus.push({
          id: `intel-${gpus.length}`,
          vendor: 'intel',
          model: device.model,
          vram: 0, // Intel Arc VRAM detection needs sysfs
          pciSlot: device.slot,
          pciBusId: device.slot,
          index: gpus.length,
        });
      }
    }
    return gpus;
  }

  /**
   * Generic GPU detection using lspci. Used only when no vendor-specific
   * probe found a GPU; VRAM is reported as 0.
   */
  private async detectGenericGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];

    try {
      const { stdout } = await execAsync(
        'lspci -nn | grep -i "VGA\\|3D\\|Display"',
        { timeout: 5000 },
      );

      for (const line of stdout.trim().split('\n')) {
        const device = this.parseLspciLine(line);
        if (device) {
          gpus.push({
            id: `gpu-${gpus.length}`,
            vendor: this.classifyVendor(device.model),
            model: device.model,
            vram: 0,
            pciSlot: device.slot,
            pciBusId: device.slot,
            index: gpus.length,
          });
        }
      }
    } catch {
      logger.dim('Generic GPU detection: lspci not available');
    }

    return gpus;
  }

  /**
   * Split one `lspci -nn` line into its slot address and device description.
   *
   * @returns `null` for blank lines or lines that do not look like lspci output.
   */
  private parseLspciLine(line: string): { slot: string; model: string } | null {
    const match = line.match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
    return match ? { slot: match[1], model: match[2].trim() } : null;
  }

  /**
   * Classify an lspci device description by vendor.
   *
   * The `[vendor:device]` PCI ID printed by `lspci -nn` is checked first.
   * Name matching is a fallback and uses word boundaries, so the "ati" inside
   * "Corporation" no longer classifies Intel or NVIDIA devices as AMD.
   */
  private classifyVendor(description: string): TGpuVendor {
    const pciId = description.match(/\[([0-9a-f]{4}):[0-9a-f]{4}\]/i);
    switch (pciId ? pciId[1].toLowerCase() : '') {
      case '10de':
        return 'nvidia';
      case '1002':
        return 'amd';
      case '8086':
        return 'intel';
    }

    if (/\bnvidia\b/i.test(description)) return 'nvidia';
    if (/\b(?:amd|ati|radeon)\b/i.test(description)) return 'amd';
    if (/\bintel\b/i.test(description)) return 'intel';
    return 'unknown';
  }

  /**
   * Get real-time status for a specific GPU.
   *
   * @param gpuId Identifier as reported by `detectGpus` (e.g. `nvidia-0`).
   * @returns The status, or `null` when no detected GPU has that id.
   */
  public async getGpuStatus(gpuId: string): Promise<IGpuStatus | null> {
    const gpus = await this.detectGpus();
    const gpu = gpus.find((g) => g.id === gpuId);
    return gpu ? this.queryStatus(gpu) : null;
  }

  /**
   * Get real-time status for all GPUs.
   *
   * Queries run concurrently against the GPU objects already in hand, so the
   * detection cache is consulted once rather than once per GPU.
   *
   * @returns Map from GPU id to status, in detection order.
   */
  public async getAllGpuStatus(): Promise<Map<string, IGpuStatus>> {
    const gpus = await this.detectGpus();
    const entries = await Promise.all(
      gpus.map(async (gpu) => [gpu.id, await this.queryStatus(gpu)] as const),
    );
    return new Map<string, IGpuStatus>(entries);
  }

  /** Dispatch to the vendor-specific status query; unknown vendors get a zeroed status. */
  private async queryStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    switch (gpu.vendor) {
      case 'nvidia':
        return this.getNvidiaGpuStatus(gpu);
      case 'amd':
        return this.getAmdGpuStatus(gpu);
      case 'intel':
        return this.getIntelGpuStatus(gpu);
      default:
        // Unknown vendor - return basic status
        return this.emptyStatus(gpu);
    }
  }

  /** Status with every metric zeroed; used when a GPU cannot be queried. */
  private emptyStatus(gpu: IGpuInfo): IGpuStatus {
    return {
      id: gpu.id,
      utilization: 0,
      memoryUsed: 0,
      memoryTotal: gpu.vram,
      memoryPercent: 0,
      temperature: 0,
      powerUsage: 0,
      powerLimit: 0,
      lastUpdate: Date.now(),
    };
  }

  /** Parse an integer, returning `fallback` for anything non-numeric (e.g. "[N/A]"). */
  private static toInt(raw: unknown, fallback: number = 0): number {
    const value = parseInt(String(raw), 10);
    return Number.isNaN(value) ? fallback : value;
  }

  /** Parse a float, returning `fallback` for anything non-numeric. */
  private static toFloat(raw: unknown, fallback: number = 0): number {
    const value = parseFloat(String(raw));
    return Number.isFinite(value) ? value : fallback;
  }

  /** Parse an integer for an optional metric; `undefined` when the tool reports no value. */
  private static toOptionalInt(raw: unknown): number | undefined {
    const value = parseInt(String(raw), 10);
    return Number.isNaN(value) ? undefined : value;
  }

  /**
   * Get NVIDIA GPU status using nvidia-smi.
   *
   * Fields nvidia-smi reports as "[N/A]" / "[Not Supported]" become 0 for
   * required metrics and `undefined` for optional ones.
   */
  private async getNvidiaGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,clocks.gr,clocks.mem --format=csv,noheader,nounits -i ${gpu.index}`,
        { timeout: 5000 },
      );

      const parts = stdout.trim().split(',').map((p: string) => p.trim());
      const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] = parts;

      const memoryUsed = GpuDetector.toInt(memUsed);
      const memoryTotal = GpuDetector.toInt(memTotal) || gpu.vram;

      return {
        id: gpu.id,
        utilization: GpuDetector.toInt(utilization),
        memoryUsed,
        memoryTotal,
        // Guard the division: a zero or unknown total must not yield NaN/Infinity.
        memoryPercent: memoryTotal > 0 ? Math.round((memoryUsed / memoryTotal) * 100) : 0,
        temperature: GpuDetector.toInt(temp),
        powerUsage: GpuDetector.toFloat(power),
        powerLimit: GpuDetector.toFloat(powerLimit),
        fanSpeed: GpuDetector.toOptionalInt(fan),
        gpuClock: GpuDetector.toOptionalInt(gpuClock),
        memoryClock: GpuDetector.toOptionalInt(memClock),
        lastUpdate: Date.now(),
      };
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Get AMD GPU status using rocm-smi.
   *
   * `memoryUsed` comes from the "VRAM Total Used Memory (B)" field when
   * present; otherwise it is derived from the usage percentage and the VRAM
   * size recorded at detection time.
   */
  private async getAmdGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `rocm-smi -d ${gpu.index} --showuse --showmemuse --showmeminfo vram --showtemp --showpower --json 2>/dev/null`,
        { timeout: 5000 },
      );

      const data = JSON.parse(stdout);
      const cardData: Record<string, unknown> = data[`card${gpu.index}`] || {};

      const memoryPercent = GpuDetector.toInt(cardData['GPU memory use (%)']);
      const usedBytes = cardData['VRAM Total Used Memory (B)'];
      const memoryUsed = usedBytes !== undefined && usedBytes !== null
        ? this.parseMemory(String(usedBytes))
        : Math.round((gpu.vram * memoryPercent) / 100);

      return {
        id: gpu.id,
        utilization: GpuDetector.toInt(cardData['GPU use (%)']),
        memoryUsed,
        memoryTotal: gpu.vram,
        memoryPercent,
        temperature: GpuDetector.toFloat(cardData['Temperature (Sensor edge) (C)']),
        powerUsage: GpuDetector.toFloat(cardData['Average Graphics Package Power (W)']),
        powerLimit: GpuDetector.toFloat(cardData['Max Graphics Package Power (W)']),
        lastUpdate: Date.now(),
      };
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Get Intel GPU status using xpu-smi.
   *
   * NOTE(review): this reads named properties from `device_level`; confirm
   * that matches the JSON emitted by the deployed xpu-smi version.
   */
  private async getIntelGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `xpu-smi stats -d ${gpu.index} --json 2>/dev/null`,
        { timeout: 5000 },
      );

      const data = JSON.parse(stdout);
      const stats = data.device_level || {};

      return {
        id: gpu.id,
        utilization: Math.round(GpuDetector.toFloat(stats.gpu_utilization)),
        memoryUsed: Math.round(GpuDetector.toFloat(stats.memory_used) / (1024 * 1024)),
        memoryTotal: gpu.vram,
        memoryPercent: Math.round(GpuDetector.toFloat(stats.memory_utilization)),
        temperature: GpuDetector.toFloat(stats.gpu_temperature),
        powerUsage: GpuDetector.toFloat(stats.power),
        powerLimit: 0, // Intel doesn't expose this easily
        lastUpdate: Date.now(),
      };
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Helper to extract PCI slot from full bus ID.
   *
   * @example "00000000:01:00.0" -> "01:00.0"
   * @returns The trailing `bus:device.function` part, or the input unchanged when it does not match.
   */
  private extractPciSlot(pciId: string): string {
    const match = pciId.match(/([0-9a-f]+:[0-9a-f]+\.[0-9a-f]+)$/i);
    return match ? match[1] : pciId;
  }

  /**
   * Helper to parse memory values with units.
   *
   * Accepts `B`, `KB`, `MB`, `GB`, `TB` and the binary spellings `KiB`, `MiB`,
   * `GiB`, `TiB` (case-insensitive). All multiples are treated as powers of
   * 1024. A bare number is interpreted as bytes.
   *
   * @returns The amount in MB, rounded; 0 when no number is found.
   */
  private parseMemory(value: string): number {
    const match = String(value).match(/(\d+(?:\.\d+)?)\s*((?:[KMGT]i?)?B)?/i);
    if (!match) return 0;

    const amount = parseFloat(match[1]);
    const prefix = (match[2] || 'B').charAt(0).toUpperCase();

    switch (prefix) {
      case 'T':
        return Math.round(amount * 1024 * 1024);
      case 'G':
        return Math.round(amount * 1024);
      case 'M':
        return Math.round(amount); // Already in MB
      case 'K':
        return Math.round(amount / 1024);
      default:
        return Math.round(amount / (1024 * 1024)); // plain bytes
    }
  }

  /**
   * Get AMD VRAM from sysfs (async).
   *
   * @param pciBusId PCI address as printed by lspci, with or without a
   *   leading 4-digit domain; domain `0000` is assumed when absent.
   * @returns VRAM in MB, or 0 when the sysfs file is missing or unreadable.
   */
  private async getAmdVramFromSysfs(pciBusId: string): Promise<number> {
    const address = /^[0-9a-f]{4}:[0-9a-f]{2}:/i.test(pciBusId) ? pciBusId : `0000:${pciBusId}`;

    try {
      const content = await fs.promises.readFile(
        `/sys/bus/pci/devices/${address}/mem_info_vram_total`,
        'utf8',
      );
      const bytes = parseInt(content.trim(), 10);
      return Number.isFinite(bytes) ? Math.round(bytes / (1024 * 1024)) : 0;
    } catch {
      // sysfs not available
      return 0;
    }
  }

  /**
   * Get ROCm version.
   *
   * @returns The first `x.y[.z]` found in the command output, or `undefined`.
   */
  private async getRocmVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000 },
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Get oneAPI version.
   *
   * @returns The first `x.y[.z]` found in the command output, or `undefined`.
   */
  private async getOneApiVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION',
        { timeout: 5000 },
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }
}
|