initial
This commit is contained in:
565
ts/hardware/gpu-detector.ts
Normal file
565
ts/hardware/gpu-detector.ts
Normal file
@@ -0,0 +1,565 @@
|
||||
/**
|
||||
* GPU Detector
|
||||
*
|
||||
* Detects GPUs on the system (NVIDIA, AMD, Intel Arc) and retrieves their information.
|
||||
*/
|
||||
|
||||
import { exec } from 'node:child_process';
|
||||
import { promisify } from 'node:util';
|
||||
import * as fs from 'node:fs';
|
||||
import type { IGpuInfo, IGpuStatus, TGpuVendor } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { TIMING } from '../constants.ts';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
/**
 * A display-class PCI device parsed from one line of `lspci -nn` output.
 */
interface ILspciDevice {
  /** PCI address exactly as printed by lspci (e.g. "01:00.0"). */
  slot: string;
  /** Device description that follows the device-class prefix. */
  model: string;
  /** Vendor derived from the description (PCI vendor ID or vendor name). */
  vendor: TGpuVendor;
}

/**
 * GPU Detector class for detecting and querying GPU information.
 *
 * Detection shells out to vendor tools (`nvidia-smi`, `rocm-smi`, `xpu-smi`)
 * and falls back to `lspci` when a tool is unavailable. The detection result
 * is cached; note that the cache lifetime reuses
 * `TIMING.GPU_DETECTION_TIMEOUT_MS`.
 */
export class GpuDetector {
  private cachedGpus: IGpuInfo[] | null = null;
  private cacheTime: number = 0;
  private readonly cacheDuration = TIMING.GPU_DETECTION_TIMEOUT_MS;

  /**
   * Detect all GPUs on the system.
   *
   * @param forceRefresh Force refresh even if cache is valid
   * @returns Array of detected GPU information (NVIDIA first, then AMD, then Intel)
   */
  public async detectGpus(forceRefresh: boolean = false): Promise<IGpuInfo[]> {
    // Return cached data if still valid
    if (!forceRefresh && this.cachedGpus && Date.now() - this.cacheTime < this.cacheDuration) {
      return this.cachedGpus;
    }

    // The vendor probes are independent and never reject, so run them in
    // parallel; Promise.all keeps the result order stable.
    const [nvidiaGpus, amdGpus, intelGpus] = await Promise.all([
      this.detectNvidiaGpus(),
      this.detectAmdGpus(),
      this.detectIntelGpus(),
    ]);
    const gpus: IGpuInfo[] = [...nvidiaGpus, ...amdGpus, ...intelGpus];

    // If no GPUs found via specific tools, try generic detection
    if (gpus.length === 0) {
      gpus.push(...(await this.detectGenericGpus()));
    }

    // Update cache
    this.cachedGpus = gpus;
    this.cacheTime = Date.now();

    return gpus;
  }

  /**
   * Get real-time status for a specific GPU.
   *
   * @param gpuId Identifier as found in `IGpuInfo.id` (e.g. "nvidia-0")
   * @returns The current status, or `null` when no detected GPU has that id
   */
  public async getGpuStatus(gpuId: string): Promise<IGpuStatus | null> {
    const gpus = await this.detectGpus();
    const gpu = gpus.find((g) => g.id === gpuId);
    return gpu ? this.queryGpuStatus(gpu) : null;
  }

  /**
   * Get real-time status for all GPUs.
   *
   * @returns Map from GPU id to its status, in detection order
   */
  public async getAllGpuStatus(): Promise<Map<string, IGpuStatus>> {
    // Detect once, then query each device directly instead of re-running the
    // detection/lookup for every GPU.
    const gpus = await this.detectGpus();
    const results = await Promise.all(gpus.map((gpu) => this.queryGpuStatus(gpu)));

    const statuses = new Map<string, IGpuStatus>();
    for (const status of results) {
      statuses.set(status.id, status);
    }
    return statuses;
  }

  /**
   * Dispatch a status query to the vendor-specific implementation.
   * Unknown vendors get an all-zero status carrying only the known VRAM size.
   */
  private async queryGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    switch (gpu.vendor) {
      case 'nvidia':
        return this.getNvidiaGpuStatus(gpu);
      case 'amd':
        return this.getAmdGpuStatus(gpu);
      case 'intel':
        return this.getIntelGpuStatus(gpu);
      default:
        return this.createIdleStatus(gpu);
    }
  }

  /**
   * Detect NVIDIA GPUs using nvidia-smi.
   */
  private async detectNvidiaGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];

    try {
      const { stdout } = await execAsync(
        'nvidia-smi --query-gpu=index,gpu_uuid,name,memory.total,driver_version,pci.bus_id,compute_cap --format=csv,noheader,nounits',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );

      for (const line of stdout.split('\n')) {
        const parts = line.split(',').map((p: string) => p.trim());
        if (parts.length < 7) {
          continue; // blank or malformed line
        }

        // Second column (gpu_uuid) is queried but not used.
        const [index = '', , name = '', memory = '', driver = '', pciId = '', computeCap = ''] = parts;
        const gpuIndex = parseInt(index, 10);
        if (Number.isNaN(gpuIndex)) {
          continue; // not a GPU row
        }

        gpus.push({
          id: `nvidia-${index}`,
          vendor: 'nvidia',
          model: name,
          vram: parseInt(memory, 10) || 0, // Already in MB (nounits)
          driverVersion: driver,
          computeCapability: computeCap,
          pciSlot: this.extractPciSlot(pciId),
          pciBusId: pciId,
          index: gpuIndex,
        });
      }

      // Get CUDA version separately; failure to find one is not an error.
      if (gpus.length > 0) {
        const cudaVersion = await this.getCudaVersion();
        if (cudaVersion) {
          for (const gpu of gpus) {
            gpu.cudaVersion = cudaVersion;
          }
        }
      }
    } catch {
      // nvidia-smi not available or failed
      logger.dim('NVIDIA GPU detection: nvidia-smi not available');
    }

    return gpus;
  }

  /**
   * Determine the CUDA version: the toolkit release reported by `nvcc` when
   * installed, otherwise the "CUDA Version" shown in the nvidia-smi banner.
   *
   * @returns Version such as "12.2", or `undefined` when it cannot be determined
   */
  private async getCudaVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync('nvcc --version 2>/dev/null', { timeout: 5000 });
      const version = this.parseCudaVersion(stdout);
      if (version) {
        return version;
      }
    } catch {
      // nvcc not installed; fall back to nvidia-smi
    }

    try {
      const { stdout } = await execAsync('nvidia-smi', { timeout: 5000 });
      return this.parseCudaVersion(stdout);
    } catch {
      return undefined;
    }
  }

  /**
   * Extract a CUDA version from `nvcc --version` ("release 12.2") or from the
   * nvidia-smi banner ("CUDA Version: 12.2"). Requiring one of these labels
   * prevents a driver version such as "535.104.05" from being mistaken for a
   * CUDA version.
   */
  private parseCudaVersion(output: string): string | undefined {
    const match =
      output.match(/release\s+(\d+\.\d+)/i) ?? output.match(/CUDA Version:\s*(\d+\.\d+)/i);
    return match?.[1];
  }

  /**
   * Detect AMD GPUs using rocm-smi, falling back to lspci + sysfs.
   */
  private async detectAmdGpus(): Promise<IGpuInfo[]> {
    try {
      // Try rocm-smi first
      const { stdout } = await execAsync(
        'rocm-smi --showproductname --showmeminfo vram --showdriverversion --showbus --csv 2>/dev/null || rocm-smi -a --json 2>/dev/null',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );

      return stdout.includes('{') ? await this.parseRocmJson(stdout) : this.parseRocmCsv(stdout);
    } catch {
      // rocm-smi not available or its output was unusable, try lspci
    }

    try {
      const devices = (await this.listDisplayDevices()).filter((d) => d.vendor === 'amd');
      return await Promise.all(
        devices.map(async (device, index): Promise<IGpuInfo> => ({
          id: `amd-${index}`,
          vendor: 'amd',
          model: device.model,
          vram: await this.getAmdVramFromSysfs(device.slot),
          pciSlot: device.slot,
          pciBusId: device.slot,
          index,
        })),
      );
    } catch {
      logger.dim('AMD GPU detection: rocm-smi and lspci detection failed');
      return [];
    }
  }

  /**
   * Build GPU entries from rocm-smi JSON output. Every top-level key starting
   * with "card" is treated as one GPU, numbered in order of appearance.
   *
   * @throws When the output is not valid JSON or a card entry is not an object
   */
  private async parseRocmJson(output: string): Promise<IGpuInfo[]> {
    const data: unknown = JSON.parse(output);
    const cards = Object.entries(data as Record<string, unknown>).filter(([key]) =>
      key.startsWith('card'),
    );
    if (cards.length === 0) {
      return [];
    }

    // The ROCm version is system-wide: look it up once, not once per card.
    const rocmVersion = await this.getRocmVersion();

    return cards.map(([, value], index): IGpuInfo => {
      const cardData = value as Record<string, unknown>;
      const pciBus = String(cardData['PCI Bus'] || cardData['pci_bus'] || '');
      return {
        id: `amd-${index}`,
        vendor: 'amd',
        model: String(cardData['Card series'] || cardData['card_series'] || 'AMD GPU'),
        vram: this.parseMemory(String(cardData['VRAM Total Memory (B)'] || cardData['vram_total'] || '0')),
        driverVersion: String(cardData['Driver version'] || cardData['driver_version'] || ''),
        rocmVersion,
        pciSlot: pciBus,
        pciBusId: pciBus,
        index,
      };
    });
  }

  /**
   * Build GPU entries from rocm-smi CSV output. This is a coarse heuristic:
   * any line mentioning "GPU", "Radeon" or "AMD" counts as one GPU and the
   * whole line becomes the model string; VRAM and PCI slot are not extracted.
   */
  private parseRocmCsv(output: string): IGpuInfo[] {
    return output
      .trim()
      .split('\n')
      .filter((line) => line.includes('GPU') || line.includes('Radeon') || line.includes('AMD'))
      .map((line, index): IGpuInfo => ({
        id: `amd-${index}`,
        vendor: 'amd',
        model: line.trim(),
        vram: 0, // Will need additional parsing
        pciSlot: '',
        index,
      }));
  }

  /**
   * Detect Intel GPUs using xpu-smi, falling back to lspci (Arc devices only).
   */
  private async detectIntelGpus(): Promise<IGpuInfo[]> {
    try {
      // Try xpu-smi first (for Intel Arc GPUs)
      const { stdout } = await execAsync('xpu-smi discovery --json 2>/dev/null', {
        timeout: TIMING.GPU_DETECTION_TIMEOUT_MS,
      });

      const data = JSON.parse(stdout);
      const deviceList: unknown[] = Array.isArray(data?.device_list) ? data.device_list : [];
      if (deviceList.length === 0) {
        return [];
      }

      // The oneAPI version is system-wide: look it up once, not once per device.
      const oneApiVersion = await this.getOneApiVersion();

      return deviceList.map((entry, index): IGpuInfo => {
        const device = entry as Record<string, any>;
        return {
          id: `intel-${index}`,
          vendor: 'intel',
          model: device.device_name || 'Intel GPU',
          vram: device.memory_physical_size_byte
            ? Math.round(device.memory_physical_size_byte / (1024 * 1024))
            : 0,
          oneApiVersion,
          pciSlot: device.pci_bdf || '',
          pciBusId: device.pci_bdf || '',
          index,
        };
      });
    } catch {
      // xpu-smi not available or its output was unusable, try lspci
    }

    try {
      // Skip integrated graphics, only look for discrete Arc GPUs
      const devices = (await this.listDisplayDevices()).filter(
        (d) => d.vendor === 'intel' && d.model.toLowerCase().includes('arc'),
      );
      return devices.map((device, index): IGpuInfo => ({
        id: `intel-${index}`,
        vendor: 'intel',
        model: device.model,
        vram: 0, // Intel Arc VRAM detection needs sysfs
        pciSlot: device.slot,
        pciBusId: device.slot,
        index,
      }));
    } catch {
      logger.dim('Intel GPU detection: xpu-smi and lspci detection failed');
      return [];
    }
  }

  /**
   * Generic GPU detection using lspci. Used only when no vendor tool found
   * anything, so VRAM is unknown (0).
   */
  private async detectGenericGpus(): Promise<IGpuInfo[]> {
    try {
      const devices = await this.listDisplayDevices();
      return devices.map((device, index): IGpuInfo => ({
        id: `gpu-${index}`,
        vendor: device.vendor,
        model: device.model,
        vram: 0,
        pciSlot: device.slot,
        pciBusId: device.slot,
        index,
      }));
    } catch {
      logger.dim('Generic GPU detection: lspci not available');
      return [];
    }
  }

  /**
   * List display-class PCI devices (VGA / 3D / Display controllers) via lspci.
   *
   * @throws When lspci is unavailable or no display device is listed
   *         (grep exits non-zero when nothing matches)
   */
  private async listDisplayDevices(): Promise<ILspciDevice[]> {
    const { stdout } = await execAsync('lspci -nn | grep -i "VGA\\|3D\\|Display"', {
      timeout: 5000,
    });
    return this.parseLspciOutput(stdout);
  }

  /**
   * Parse `lspci -nn` lines of the form
   * "01:00.0 VGA compatible controller [0300]: <description>".
   * Lines that do not have this shape are skipped.
   */
  private parseLspciOutput(output: string): ILspciDevice[] {
    const devices: ILspciDevice[] = [];
    for (const line of output.split('\n')) {
      const match = line.trim().match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
      if (!match) {
        continue;
      }
      const [, slot = '', description = ''] = match;
      const model = description.trim();
      devices.push({ slot, model, vendor: this.classifyVendor(model) });
    }
    return devices;
  }

  /**
   * Derive the GPU vendor from an lspci device description.
   *
   * The numeric PCI vendor ID ("[10de:2204]") is preferred when present.
   * Name matching for AMD uses word boundaries because a plain /ati/ test also
   * matches the "ati" in "Corporation", which misclassified Intel and NVIDIA
   * devices as AMD.
   */
  private classifyVendor(description: string): TGpuVendor {
    const vendorId = description.match(/\[([0-9a-f]{4}):[0-9a-f]{4}\]/i)?.[1]?.toLowerCase();
    if (vendorId === '10de') return 'nvidia';
    if (vendorId === '1002') return 'amd';
    if (vendorId === '8086') return 'intel';

    if (/nvidia/i.test(description)) return 'nvidia';
    if (/\b(?:amd|ati|radeon)\b/i.test(description)) return 'amd';
    if (/intel/i.test(description)) return 'intel';
    return 'unknown';
  }

  /**
   * Get NVIDIA GPU status using nvidia-smi. Returns an all-zero status when
   * the query fails.
   */
  private async getNvidiaGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,clocks.gr,clocks.mem --format=csv,noheader,nounits -i ${gpu.index}`,
        { timeout: 5000 },
      );
      return this.parseNvidiaStatus(gpu, stdout);
    } catch {
      return this.createIdleStatus(gpu);
    }
  }

  /**
   * Convert one CSV row of the nvidia-smi status query into a status object.
   * Fields that are missing or non-numeric (e.g. "[N/A]", "[Not Supported]")
   * become 0 for required values and `undefined` for optional ones, never NaN.
   */
  private parseNvidiaStatus(gpu: IGpuInfo, output: string): IGpuStatus {
    const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] = output
      .trim()
      .split(',')
      .map((p: string) => p.trim());

    const memoryUsed = this.toInt(memUsed);
    const memoryTotal = this.toInt(memTotal) || gpu.vram;

    return {
      id: gpu.id,
      utilization: this.toInt(utilization),
      memoryUsed,
      memoryTotal,
      // Guard the division so an unknown total cannot yield NaN/Infinity.
      memoryPercent: memoryTotal > 0 ? Math.round((memoryUsed / memoryTotal) * 100) : 0,
      temperature: this.toInt(temp),
      powerUsage: this.toNumber(power),
      powerLimit: this.toNumber(powerLimit),
      fanSpeed: this.toOptionalInt(fan),
      gpuClock: this.toOptionalInt(gpuClock),
      memoryClock: this.toOptionalInt(memClock),
      lastUpdate: Date.now(),
    };
  }

  /**
   * Get AMD GPU status using rocm-smi. Returns an all-zero status when the
   * query fails.
   */
  private async getAmdGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `rocm-smi -d ${gpu.index} --showuse --showmemuse --showtemp --showpower --json 2>/dev/null`,
        { timeout: 5000 },
      );

      const data = JSON.parse(stdout);
      return this.buildAmdStatus(gpu, data[`card${gpu.index}`] || {});
    } catch {
      return this.createIdleStatus(gpu);
    }
  }

  /**
   * Convert one rocm-smi card record into a status object.
   *
   * rocm-smi's `--showmemuse` only yields a percentage, so the used amount is
   * derived from that percentage and the known VRAM size (MB).
   */
  private buildAmdStatus(gpu: IGpuInfo, cardData: Record<string, unknown>): IGpuStatus {
    const memoryPercent = this.toInt(cardData['GPU memory use (%)']);

    return {
      id: gpu.id,
      utilization: this.toInt(cardData['GPU use (%)']),
      memoryUsed: Math.round((gpu.vram * memoryPercent) / 100),
      memoryTotal: gpu.vram,
      memoryPercent,
      temperature: this.toNumber(cardData['Temperature (Sensor edge) (C)']),
      powerUsage: this.toNumber(cardData['Average Graphics Package Power (W)']),
      powerLimit: this.toNumber(cardData['Max Graphics Package Power (W)']),
      lastUpdate: Date.now(),
    };
  }

  /**
   * Get Intel GPU status using xpu-smi. Returns an all-zero status when the
   * query fails.
   */
  private async getIntelGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(`xpu-smi stats -d ${gpu.index} --json 2>/dev/null`, {
        timeout: 5000,
      });

      const data = JSON.parse(stdout);
      const stats = data.device_level || {};

      return {
        id: gpu.id,
        utilization: Math.round(this.toNumber(stats.gpu_utilization)),
        // NOTE(review): treats memory_used as bytes — confirm against xpu-smi output.
        memoryUsed: Math.round(this.toNumber(stats.memory_used) / (1024 * 1024)),
        memoryTotal: gpu.vram,
        memoryPercent: Math.round(this.toNumber(stats.memory_utilization)),
        temperature: this.toNumber(stats.gpu_temperature),
        powerUsage: this.toNumber(stats.power),
        powerLimit: 0, // Intel doesn't expose this easily
        lastUpdate: Date.now(),
      };
    } catch {
      return this.createIdleStatus(gpu);
    }
  }

  /**
   * Status used when live metrics are unavailable: everything is zero except
   * the total memory, which is taken from the detection result.
   */
  private createIdleStatus(gpu: IGpuInfo): IGpuStatus {
    return {
      id: gpu.id,
      utilization: 0,
      memoryUsed: 0,
      memoryTotal: gpu.vram,
      memoryPercent: 0,
      temperature: 0,
      powerUsage: 0,
      powerLimit: 0,
      lastUpdate: Date.now(),
    };
  }

  /** Parse a value as a float; anything non-numeric or missing becomes 0. */
  private toNumber(value: unknown): number {
    const parsed = parseFloat(String(value ?? ''));
    return Number.isFinite(parsed) ? parsed : 0;
  }

  /** Parse a value as an integer (fraction dropped); non-numeric becomes 0. */
  private toInt(value: unknown): number {
    return Math.trunc(this.toNumber(value));
  }

  /** Parse an optional integer field; non-numeric or missing becomes `undefined`. */
  private toOptionalInt(value: string | undefined): number | undefined {
    const parsed = parseInt(value ?? '', 10);
    return Number.isNaN(parsed) ? undefined : parsed;
  }

  /**
   * Helper to extract PCI slot from full bus ID.
   *
   * Input: "00000000:01:00.0" -> Output: "01:00.0". Input that does not end
   * in a "bus:device.function" triple is returned unchanged.
   */
  private extractPciSlot(pciId: string): string {
    const match = pciId.match(/([0-9a-f]+:[0-9a-f]+\.[0-9a-f]+)$/i);
    return match?.[1] ?? pciId;
  }

  /**
   * Helper to parse memory values with units.
   *
   * Accepts a number optionally followed by B, KB, MB, GB, TB or the binary
   * spellings KiB, MiB, GiB, TiB (case-insensitive). A bare number is taken
   * to be bytes. All units are interpreted as powers of 1024.
   *
   * @returns The size in MB, rounded to the nearest integer; 0 when no number is found
   */
  private parseMemory(value: string): number {
    const match = value.match(/(\d+(?:\.\d+)?)\s*(?:([KMGT])i?B|B)?/i);
    if (!match) return 0;

    const amount = parseFloat(match[1] ?? '');
    if (!Number.isFinite(amount)) return 0;

    switch ((match[2] ?? '').toUpperCase()) {
      case 'T':
        return Math.round(amount * 1024 * 1024);
      case 'G':
        return Math.round(amount * 1024);
      case 'M':
        return Math.round(amount); // Already in MB
      case 'K':
        return Math.round(amount / 1024);
      default:
        return Math.round(amount / (1024 * 1024)); // plain bytes
    }
  }

  /**
   * Get AMD VRAM from sysfs (async).
   *
   * @param pciBusId PCI address, with or without a leading domain
   *                 ("03:00.0" or "0000:03:00.0")
   * @returns VRAM size in MB, or 0 when the sysfs entry is missing or unreadable
   */
  private async getAmdVramFromSysfs(pciBusId: string): Promise<number> {
    // lspci omits the PCI domain by default; sysfs paths always include it.
    const device = /^[0-9a-f]{4}:/i.test(pciBusId) ? pciBusId : `0000:${pciBusId}`;
    const sysfsPath = `/sys/bus/pci/devices/${device}/mem_info_vram_total`;

    try {
      // Read directly; a missing file simply rejects and is handled below.
      const content = await fs.promises.readFile(sysfsPath, 'utf8');
      const bytes = parseInt(content.trim(), 10);
      return Number.isNaN(bytes) ? 0 : Math.round(bytes / (1024 * 1024));
    } catch {
      // sysfs not available
      return 0;
    }
  }

  /**
   * Get ROCm version.
   *
   * @returns Version such as "6.0.2", or `undefined` when it cannot be determined
   */
  private async getRocmVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000 },
      );
      return stdout.match(/(\d+\.\d+(?:\.\d+)?)/)?.[1];
    } catch {
      return undefined;
    }
  }

  /**
   * Get oneAPI version.
   *
   * NOTE(review): the command relies on `source`, which is not available in
   * every /bin/sh; when it fails the `||` branch is used instead — confirm
   * this yields a version on the target systems.
   *
   * @returns Version string, or `undefined` when it cannot be determined
   */
  private async getOneApiVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );
      return stdout.match(/(\d+\.\d+(?:\.\d+)?)/)?.[1];
    } catch {
      return undefined;
    }
  }
}
|
||||
Reference in New Issue
Block a user