/**
 * GPU Detector
 *
 * Detects GPUs on the system (NVIDIA, AMD, Intel Arc) and retrieves their information.
 */

import { exec } from 'node:child_process';
import { promisify } from 'node:util';
import * as fs from 'node:fs';
import type { IGpuInfo, IGpuStatus, TGpuVendor } from '../interfaces/gpu.ts';
import { logger } from '../logger.ts';
import { TIMING } from '../constants.ts';

const execAsync = promisify(exec);

/** Timeout for short auxiliary commands (status polls, version probes, lspci). */
const SHORT_COMMAND_TIMEOUT_MS = 5000;

/** Number of bytes in one megabyte as used throughout this module (binary, 1024 * 1024). */
const BYTES_PER_MB = 1024 * 1024;

/** Multipliers converting a value expressed in the keyed unit into megabytes. */
const UNIT_TO_MB: Record<string, number> = {
  B: 1 / BYTES_PER_MB,
  KB: 1 / 1024,
  MB: 1,
  GB: 1024,
  TB: 1024 * 1024,
};

/** A display-class PCI device as parsed from one `lspci -nn` output line. */
interface IPciDisplayDevice {
  /** PCI address exactly as printed by lspci (e.g. "01:00.0"). */
  slot: string;
  /** Text following the device class, i.e. vendor plus device name. */
  model: string;
}

/** Split command output into non-blank lines. */
function splitLines(output: string): string[] {
  return output.trim().split('\n').filter((line) => line.trim().length > 0);
}

/** Parse an integer from an arbitrary value, yielding 0 when it is not numeric. */
function toInt(value: unknown): number {
  const parsed = parseInt(String(value), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}

/** Parse a float from an arbitrary value, yielding 0 when it is not a finite number. */
function toFloat(value: unknown): number {
  const parsed = parseFloat(String(value));
  return Number.isFinite(parsed) ? parsed : 0;
}

/**
 * Parse an integer that may legitimately be absent (missing column, "[N/A]",
 * "[Not Supported]", ...). Returns `undefined` instead of NaN in those cases.
 */
function toOptionalInt(value: unknown): number | undefined {
  if (value === undefined || value === null) return undefined;
  const parsed = parseInt(String(value), 10);
  return Number.isNaN(parsed) ? undefined : parsed;
}

/**
 * Parse one `lspci -nn` line of the form "<address> <class> [xxxx]: <vendor and device>".
 * @returns The address and description, or `null` when the line does not have that shape.
 */
function parseLspciLine(line: string): IPciDisplayDevice | null {
  const match = line.match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
  if (!match) return null;
  const [, slot = '', model = ''] = match;
  return { slot, model: model.trim() };
}

/**
 * Classify a PCI device description by GPU vendor.
 *
 * Word boundaries are required: a bare /ati/i also matches "Corpor-ati-on",
 * which would classify every "NVIDIA Corporation" / "Intel Corporation"
 * device as AMD.
 */
function classifyPciVendor(description: string): TGpuVendor {
  if (/\bnvidia\b/i.test(description)) return 'nvidia';
  if (/\b(?:amd|ati|radeon)\b|advanced micro devices/i.test(description)) return 'amd';
  if (/\bintel\b/i.test(description)) return 'intel';
  return 'unknown';
}

/**
 * GPU Detector class for detecting and querying GPU information
 */
export class GpuDetector {
  private cachedGpus: IGpuInfo[] | null = null;
  private cacheTime: number = 0;
  // The detection timeout constant doubles as the cache lifetime.
  private readonly cacheDuration = TIMING.GPU_DETECTION_TIMEOUT_MS;

  /**
   * Detect all GPUs on the system
   *
   * Vendor-specific probes run concurrently; results are concatenated in the
   * fixed order NVIDIA, AMD, Intel. Generic lspci detection is used only when
   * none of the vendor probes found anything.
   *
   * @param forceRefresh Force refresh even if cache is valid
   * @returns Array of detected GPU information
   */
  public async detectGpus(forceRefresh: boolean = false): Promise<IGpuInfo[]> {
    // Return cached data if still valid
    if (!forceRefresh && this.cachedGpus && Date.now() - this.cacheTime < this.cacheDuration) {
      return this.cachedGpus;
    }

    // Each probe handles its own failures and resolves to [] when its tool is missing.
    const [nvidiaGpus, amdGpus, intelGpus] = await Promise.all([
      this.detectNvidiaGpus(),
      this.detectAmdGpus(),
      this.detectIntelGpus(),
    ]);
    const gpus: IGpuInfo[] = [...nvidiaGpus, ...amdGpus, ...intelGpus];

    // If no GPUs found via specific tools, try generic detection
    if (gpus.length === 0) {
      gpus.push(...(await this.detectGenericGpus()));
    }

    // Update cache
    this.cachedGpus = gpus;
    this.cacheTime = Date.now();

    return gpus;
  }

  /**
   * Detect NVIDIA GPUs using nvidia-smi
   * @returns Detected NVIDIA GPUs, or an empty array when nvidia-smi fails.
   */
  private async detectNvidiaGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];

    try {
      const { stdout } = await execAsync(
        'nvidia-smi --query-gpu=index,gpu_uuid,name,memory.total,driver_version,pci.bus_id,compute_cap --format=csv,noheader,nounits',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );

      for (const line of splitLines(stdout)) {
        const parts = line.split(',').map((part) => part.trim());
        if (parts.length < 7) continue;

        // Second column (gpu_uuid) is queried but not used.
        const [index = '', , name = '', memory = '', driver = '', pciId = '', computeCap = ''] = parts;
        const gpuIndex = parseInt(index, 10);
        if (Number.isNaN(gpuIndex)) continue; // malformed row

        gpus.push({
          id: `nvidia-${index}`,
          vendor: 'nvidia',
          model: name,
          vram: toInt(memory), // Already in MB (nounits); 0 when reported as N/A
          driverVersion: driver,
          computeCapability: computeCap,
          pciSlot: this.extractPciSlot(pciId),
          pciBusId: pciId,
          index: gpuIndex,
        });
      }

      // Get CUDA version separately
      if (gpus.length > 0) {
        const cudaVersion = await this.detectCudaVersion();
        if (cudaVersion) {
          for (const gpu of gpus) {
            gpu.cudaVersion = cudaVersion;
          }
        }
      }
    } catch {
      // nvidia-smi not available or failed
      logger.dim('NVIDIA GPU detection: nvidia-smi not available');
    }

    return gpus;
  }

  /**
   * Determine the CUDA version.
   *
   * Tries the installed toolkit first (`nvcc --version`, "release X.Y") and
   * falls back to the "CUDA Version: X.Y" field in the header of plain
   * `nvidia-smi` output. The driver version is deliberately never part of the
   * parsed text, so it cannot be mistaken for the CUDA version.
   *
   * @returns The version as "major.minor", or `undefined` if neither probe yields one.
   */
  private async detectCudaVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync('nvcc --version 2>/dev/null', {
        timeout: SHORT_COMMAND_TIMEOUT_MS,
      });
      const toolkit = stdout.match(/release\s+(\d+\.\d+)/i);
      if (toolkit) return toolkit[1];
    } catch {
      // nvcc not installed; fall through to the nvidia-smi header
    }

    try {
      const { stdout } = await execAsync('nvidia-smi', { timeout: SHORT_COMMAND_TIMEOUT_MS });
      const header = stdout.match(/CUDA Version:\s*(\d+\.\d+)/i);
      return header ? header[1] : undefined;
    } catch {
      // CUDA version detection failed, that's okay
      return undefined;
    }
  }

  /**
   * Detect AMD GPUs using rocm-smi, falling back to lspci when rocm-smi fails
   * @returns Detected AMD GPUs, or an empty array when both methods fail.
   */
  private async detectAmdGpus(): Promise<IGpuInfo[]> {
    try {
      return await this.detectAmdGpusViaRocmSmi();
    } catch {
      // rocm-smi not available (or its output was unparseable), try lspci
    }

    try {
      const gpus: IGpuInfo[] = [];
      for (const device of await this.listPciDisplayDevices()) {
        if (classifyPciVendor(device.model) !== 'amd') continue;
        gpus.push({
          id: `amd-${gpus.length}`,
          vendor: 'amd',
          model: device.model,
          vram: await this.getAmdVramFromSysfs(device.slot),
          pciSlot: device.slot,
          pciBusId: device.slot,
          index: gpus.length,
        });
      }
      return gpus;
    } catch {
      logger.dim('AMD GPU detection: rocm-smi and lspci detection failed');
      return [];
    }
  }

  /**
   * Query rocm-smi and parse either its JSON or CSV output.
   * @throws When rocm-smi cannot be executed or its JSON output cannot be parsed.
   */
  private async detectAmdGpusViaRocmSmi(): Promise<IGpuInfo[]> {
    const { stdout } = await execAsync(
      'rocm-smi --showproductname --showmeminfo vram --showdriverversion --showbus --csv 2>/dev/null || rocm-smi -a --json 2>/dev/null',
      { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
    );

    if (stdout.includes('{')) {
      // JSON output: one top-level "card<N>" key per GPU
      const data: unknown = JSON.parse(stdout);
      if (typeof data !== 'object' || data === null) return [];

      const cards = Object.entries(data).filter(([key]) => key.startsWith('card'));
      // The ROCm version is the same for every card, so probe it once.
      const rocmVersion = cards.length > 0 ? await this.getRocmVersion() : undefined;

      return cards.map(([, value], index): IGpuInfo => {
        const cardData = (typeof value === 'object' && value !== null ? value : {}) as Record<string, unknown>;
        const pciBus = String(cardData['PCI Bus'] || cardData['pci_bus'] || '');
        return {
          id: `amd-${index}`,
          vendor: 'amd',
          model: String(cardData['Card series'] || cardData['card_series'] || 'AMD GPU'),
          vram: this.parseMemory(String(cardData['VRAM Total Memory (B)'] || cardData['vram_total'] || '0')),
          driverVersion: String(cardData['Driver version'] || cardData['driver_version'] || ''),
          rocmVersion,
          pciSlot: pciBus,
          pciBusId: pciBus,
          index,
        };
      });
    }

    // CSV output - parse line by line
    // NOTE(review): the whole CSV row is stored as the model and VRAM is not
    // extracted; this branch presumably needs column-aware parsing - confirm
    // against real `rocm-smi --csv` output.
    const gpus: IGpuInfo[] = [];
    for (const line of stdout.trim().split('\n')) {
      if (line.includes('GPU') || line.includes('Radeon') || line.includes('AMD')) {
        gpus.push({
          id: `amd-${gpus.length}`,
          vendor: 'amd',
          model: line.trim(),
          vram: 0, // Will need additional parsing
          pciSlot: '',
          index: gpus.length,
        });
      }
    }
    return gpus;
  }

  /**
   * Detect Intel GPUs using xpu-smi, falling back to lspci when xpu-smi fails
   * @returns Detected Intel GPUs, or an empty array when both methods fail.
   */
  private async detectIntelGpus(): Promise<IGpuInfo[]> {
    try {
      return await this.detectIntelGpusViaXpuSmi();
    } catch {
      // xpu-smi not available, try lspci
    }

    try {
      const gpus: IGpuInfo[] = [];
      for (const device of await this.listPciDisplayDevices()) {
        // Skip integrated graphics, only look for discrete Arc GPUs
        if (classifyPciVendor(device.model) !== 'intel' || !/\barc\b/i.test(device.model)) continue;
        gpus.push({
          id: `intel-${gpus.length}`,
          vendor: 'intel',
          model: device.model,
          vram: 0, // Intel Arc VRAM detection needs sysfs
          pciSlot: device.slot,
          pciBusId: device.slot,
          index: gpus.length,
        });
      }
      return gpus;
    } catch {
      logger.dim('Intel GPU detection: xpu-smi and lspci detection failed');
      return [];
    }
  }

  /**
   * Query `xpu-smi discovery --json` and map its `device_list` to GPU entries.
   * @throws When xpu-smi cannot be executed or its output is not valid JSON.
   */
  private async detectIntelGpusViaXpuSmi(): Promise<IGpuInfo[]> {
    const { stdout } = await execAsync(
      'xpu-smi discovery --json 2>/dev/null',
      { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
    );

    const data = JSON.parse(stdout) as { device_list?: unknown } | null;
    const devices = data?.device_list;
    if (!Array.isArray(devices) || devices.length === 0) return [];

    // The oneAPI version is the same for every device, so probe it once.
    const oneApiVersion = await this.getOneApiVersion();

    return devices.map((entry: unknown, index): IGpuInfo => {
      const device = (typeof entry === 'object' && entry !== null ? entry : {}) as Record<string, unknown>;
      const pciBdf = String(device['pci_bdf'] || '');
      return {
        id: `intel-${index}`,
        vendor: 'intel',
        model: String(device['device_name'] || 'Intel GPU'),
        vram: Math.round(toFloat(device['memory_physical_size_byte']) / BYTES_PER_MB),
        oneApiVersion,
        pciSlot: pciBdf,
        pciBusId: pciBdf,
        index,
      };
    });
  }

  /**
   * Generic GPU detection using lspci
   * @returns All display-class PCI devices, with the vendor guessed from the description.
   */
  private async detectGenericGpus(): Promise<IGpuInfo[]> {
    try {
      const devices = await this.listPciDisplayDevices();
      return devices.map((device, index): IGpuInfo => ({
        id: `gpu-${index}`,
        vendor: classifyPciVendor(device.model),
        model: device.model,
        vram: 0,
        pciSlot: device.slot,
        pciBusId: device.slot,
        index,
      }));
    } catch {
      logger.dim('Generic GPU detection: lspci not available');
      return [];
    }
  }

  /**
   * List display-class PCI devices (VGA / 3D / Display) via lspci.
   * @throws When lspci is unavailable or grep finds no matching line (non-zero exit).
   */
  private async listPciDisplayDevices(): Promise<IPciDisplayDevice[]> {
    const { stdout } = await execAsync(
      'lspci -nn | grep -i "VGA\\|3D\\|Display"',
      { timeout: SHORT_COMMAND_TIMEOUT_MS },
    );

    const devices: IPciDisplayDevice[] = [];
    for (const line of splitLines(stdout)) {
      const device = parseLspciLine(line);
      if (device) devices.push(device);
    }
    return devices;
  }

  /**
   * Get real-time status for a specific GPU
   * @param gpuId Identifier as produced by detection (e.g. "nvidia-0")
   * @returns The GPU's status, or `null` when no detected GPU has that id.
   */
  public async getGpuStatus(gpuId: string): Promise<IGpuStatus | null> {
    const gpus = await this.detectGpus();
    const gpu = gpus.find((candidate) => candidate.id === gpuId);
    return gpu ? this.queryStatus(gpu) : null;
  }

  /**
   * Get real-time status for all GPUs
   *
   * GPUs are detected once and then queried concurrently; the returned map
   * preserves detection order.
   */
  public async getAllGpuStatus(): Promise<Map<string, IGpuStatus>> {
    const gpus = await this.detectGpus();
    const entries = await Promise.all(
      gpus.map(async (gpu): Promise<[string, IGpuStatus]> => [gpu.id, await this.queryStatus(gpu)]),
    );
    return new Map<string, IGpuStatus>(entries);
  }

  /** Dispatch a status query to the vendor-specific implementation. */
  private queryStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    switch (gpu.vendor) {
      case 'nvidia':
        return this.getNvidiaGpuStatus(gpu);
      case 'amd':
        return this.getAmdGpuStatus(gpu);
      case 'intel':
        return this.getIntelGpuStatus(gpu);
      default:
        // Unknown vendor - return basic status
        return Promise.resolve(this.emptyStatus(gpu));
    }
  }

  /** Build the zeroed status returned when live metrics cannot be obtained. */
  private emptyStatus(gpu: IGpuInfo): IGpuStatus {
    return {
      id: gpu.id,
      utilization: 0,
      memoryUsed: 0,
      memoryTotal: gpu.vram,
      memoryPercent: 0,
      temperature: 0,
      powerUsage: 0,
      powerLimit: 0,
      lastUpdate: Date.now(),
    };
  }

  /**
   * Get NVIDIA GPU status using nvidia-smi
   * @returns Live metrics, or a zeroed status when nvidia-smi fails.
   */
  private async getNvidiaGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,clocks.gr,clocks.mem --format=csv,noheader,nounits -i ${gpu.index}`,
        { timeout: SHORT_COMMAND_TIMEOUT_MS },
      );

      const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] = stdout
        .trim()
        .split(',')
        .map((part) => part.trim());

      const memoryUsed = toInt(memUsed);
      const memoryTotal = toInt(memTotal) || gpu.vram;

      return {
        id: gpu.id,
        utilization: toInt(utilization),
        memoryUsed,
        memoryTotal,
        // Guard against "[N/A]" or zero totals, which would otherwise yield NaN/Infinity.
        memoryPercent: memoryTotal > 0 ? Math.round((memoryUsed / memoryTotal) * 100) : 0,
        temperature: toInt(temp),
        powerUsage: toFloat(power),
        powerLimit: toFloat(powerLimit),
        // Unsupported metrics are reported as text such as "[N/A]"; map those to undefined.
        fanSpeed: toOptionalInt(fan),
        gpuClock: toOptionalInt(gpuClock),
        memoryClock: toOptionalInt(memClock),
        lastUpdate: Date.now(),
      };
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Get AMD GPU status using rocm-smi
   * @returns Live metrics, or a zeroed status when rocm-smi fails.
   */
  private async getAmdGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `rocm-smi -d ${gpu.index} --showuse --showmemuse --showtemp --showpower --json 2>/dev/null`,
        { timeout: SHORT_COMMAND_TIMEOUT_MS },
      );

      const data = JSON.parse(stdout) as Record<string, unknown> | null;
      const rawCard = data ? data[`card${gpu.index}`] : undefined;
      const cardData = (typeof rawCard === 'object' && rawCard !== null ? rawCard : {}) as Record<string, unknown>;

      const memoryPercent = toInt(cardData['GPU memory use (%)']);

      return {
        id: gpu.id,
        utilization: toInt(cardData['GPU use (%)']),
        // Only a percentage is read here, so derive the absolute figure from
        // the known VRAM size rather than parsing the percentage as bytes.
        memoryUsed: Math.round((gpu.vram * memoryPercent) / 100),
        memoryTotal: gpu.vram,
        memoryPercent,
        temperature: toFloat(cardData['Temperature (Sensor edge) (C)']),
        powerUsage: toFloat(cardData['Average Graphics Package Power (W)']),
        powerLimit: toFloat(cardData['Max Graphics Package Power (W)']),
        lastUpdate: Date.now(),
      };
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Get Intel GPU status using xpu-smi
   * @returns Live metrics, or a zeroed status when xpu-smi fails.
   */
  private async getIntelGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `xpu-smi stats -d ${gpu.index} --json 2>/dev/null`,
        { timeout: SHORT_COMMAND_TIMEOUT_MS },
      );

      const data = JSON.parse(stdout) as { device_level?: unknown } | null;
      const rawStats = data ? data.device_level : undefined;
      const stats = (typeof rawStats === 'object' && rawStats !== null ? rawStats : {}) as Record<string, unknown>;

      return {
        id: gpu.id,
        utilization: Math.round(toFloat(stats['gpu_utilization'])),
        // NOTE(review): treats memory_used as bytes - confirm against xpu-smi output.
        memoryUsed: Math.round(toFloat(stats['memory_used']) / BYTES_PER_MB),
        memoryTotal: gpu.vram,
        memoryPercent: Math.round(toFloat(stats['memory_utilization'])),
        temperature: toFloat(stats['gpu_temperature']),
        powerUsage: toFloat(stats['power']),
        powerLimit: 0, // Intel doesn't expose this easily
        lastUpdate: Date.now(),
      };
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Helper to extract PCI slot from full bus ID
   * @param pciId Bus id such as "00000000:01:00.0"
   * @returns The trailing "bus:device.function" part ("01:00.0"), or the input unchanged if it has no such suffix.
   */
  private extractPciSlot(pciId: string): string {
    const match = pciId.match(/([0-9a-f]+:[0-9a-f]+\.[0-9a-f]+)$/i);
    return match ? match[1] : pciId;
  }

  /**
   * Helper to parse memory values with units
   *
   * Accepts B/KB/MB/GB/TB as well as the binary spellings KiB/MiB/GiB/TiB
   * (case-insensitive). A value without a unit is treated as bytes.
   *
   * @param value Text containing a number and an optional unit, e.g. "17163091968" or "16 GB"
   * @returns The amount in megabytes, rounded to the nearest integer; 0 when no number is found.
   */
  private parseMemory(value: string): number {
    const match = value.match(/(\d+(?:\.\d+)?)\s*([KMGT]?i?B)?/i);
    if (!match) return 0;

    const [, amountText = '0', unitText = 'B'] = match;
    // "MiB" -> "MB": both spellings are treated as powers of 1024 here.
    const unit = unitText.toUpperCase().replace('I', '');
    const factor = UNIT_TO_MB[unit] ?? UNIT_TO_MB['B'] ?? 0;

    return Math.round(parseFloat(amountText) * factor);
  }

  /**
   * Get AMD VRAM from sysfs (async)
   * @param pciBusId PCI address, with or without a leading domain ("01:00.0" or "0000:01:00.0")
   * @returns VRAM size in MB, or 0 when the sysfs entry is missing or unreadable.
   */
  private async getAmdVramFromSysfs(pciBusId: string): Promise<number> {
    // lspci prints the domain only on some systems; add the default one only when absent.
    const address = /^[0-9a-f]{4,}:/i.test(pciBusId) ? pciBusId : `0000:${pciBusId}`;
    const sysfsPath = `/sys/bus/pci/devices/${address}/mem_info_vram_total`;

    try {
      const content = await fs.promises.readFile(sysfsPath, 'utf8');
      const bytes = parseInt(content.trim(), 10);
      return Number.isNaN(bytes) ? 0 : Math.round(bytes / BYTES_PER_MB);
    } catch {
      // sysfs not available
      return 0;
    }
  }

  /**
   * Get ROCm version
   * @returns The first version-like number found in the ROCm version file or `rocminfo` output, or `undefined`.
   */
  private async getRocmVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: SHORT_COMMAND_TIMEOUT_MS },
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Get oneAPI version
   * @returns The first version-like number found in the probe output, or `undefined`.
   */
  private async getOneApiVersion(): Promise<string | undefined> {
    try {
      // NOTE(review): when setvars.sh succeeds only $ONEAPI_ROOT is printed,
      // which presumably carries no version number - confirm this probe ever
      // yields a version on a real oneAPI install.
      const { stdout } = await execAsync(
        'source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION',
        { timeout: SHORT_COMMAND_TIMEOUT_MS },
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }
}