initial

2026-01-30 03:16:57 +00:00
commit daaf6559e3
80 changed files with 14430 additions and 0 deletions
--- a/ts/hardware/gpu-detector.ts
+++ b/ts/hardware/gpu-detector.ts
@@ -0,0 +1,565 @@
+/**
+ * GPU Detector
+ *
+ * Detects GPUs on the system (NVIDIA, AMD, Intel Arc) and retrieves their information.
+ */
+
+import { exec } from 'node:child_process';
+import { promisify } from 'node:util';
+import * as fs from 'node:fs';
+import type { IGpuInfo, IGpuStatus, TGpuVendor } from '../interfaces/gpu.ts';
+import { logger } from '../logger.ts';
+import { TIMING } from '../constants.ts';
+
+// Promise-based form of child_process.exec; callers in this file await it and read `stdout` from the result.
+const execAsync = promisify(exec);
+
+/**
+ * GPU Detector class for detecting and querying GPU information
+ */
+export class GpuDetector {
+  // Result of the most recent detectGpus() run; null until a run has stored one.
+  private cachedGpus: IGpuInfo[] | null = null;
+  // Date.now() value (ms) recorded when cachedGpus was last stored.
+  private cacheTime: number = 0;
+  // Maximum cache age in ms. NOTE(review): this reuses the constant that is also passed
+  // as the exec timeout for the detection commands — confirm the shared value is intended.
+  private readonly cacheDuration = TIMING.GPU_DETECTION_TIMEOUT_MS;
+
+  /**
+   * Detect all GPUs on the system.
+   *
+   * A cached result is returned while it is younger than `cacheDuration`, unless
+   * `forceRefresh` is set. Otherwise the NVIDIA, AMD and Intel probes are run; they
+   * do not depend on each other, so they are started together and the worst-case
+   * latency is the slowest probe rather than the sum of all three. The generic
+   * lspci probe only runs when none of the vendor probes found anything.
+   *
+   * @param forceRefresh Force refresh even if cache is valid
+   * @returns Array of detected GPU information, ordered NVIDIA, AMD, Intel
+   */
+  public async detectGpus(forceRefresh: boolean = false): Promise<IGpuInfo[]> {
+    // Return cached data if still valid
+    if (!forceRefresh && this.cachedGpus && Date.now() - this.cacheTime < this.cacheDuration) {
+      return this.cachedGpus;
+    }
+
+    // Each probe handles its own failures and resolves to [] when its tool is missing,
+    // so waiting on the group does not lose results from the other vendors.
+    const [nvidiaGpus, amdGpus, intelGpus] = await Promise.all([
+      this.detectNvidiaGpus(),
+      this.detectAmdGpus(),
+      this.detectIntelGpus(),
+    ]);
+
+    const gpus: IGpuInfo[] = [...nvidiaGpus, ...amdGpus, ...intelGpus];
+
+    // If no GPUs found via specific tools, try generic detection
+    if (gpus.length === 0) {
+      gpus.push(...(await this.detectGenericGpus()));
+    }
+
+    // Update cache
+    this.cachedGpus = gpus;
+    this.cacheTime = Date.now();
+
+    return gpus;
+  }
+
+  /**
+   * Detect NVIDIA GPUs using nvidia-smi.
+   *
+   * Runs a CSV query (index, UUID, name, total memory, driver version, PCI bus id,
+   * compute capability) and converts every row with at least seven columns into an
+   * IGpuInfo entry. When at least one GPU is found, the CUDA version is looked up
+   * once and copied onto every entry.
+   *
+   * @returns Detected NVIDIA GPUs; empty when the nvidia-smi query fails
+   */
+  private async detectNvidiaGpus(): Promise<IGpuInfo[]> {
+    const gpus: IGpuInfo[] = [];
+
+    try {
+      const { stdout } = await execAsync(
+        'nvidia-smi --query-gpu=index,gpu_uuid,name,memory.total,driver_version,pci.bus_id,compute_cap --format=csv,noheader,nounits',
+        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
+      );
+
+      for (const line of stdout.trim().split('\n')) {
+        const parts = line.split(',').map((p: string) => p.trim());
+        // Blank lines and truncated rows have fewer than 7 columns and are skipped
+        if (parts.length < 7) continue;
+
+        const [index, _uuid, name, memory, driver, pciId, computeCap] = parts;
+
+        gpus.push({
+          id: `nvidia-${index}`,
+          vendor: 'nvidia',
+          model: name,
+          // Already in MB because of `nounits`; a non-numeric cell becomes 0 instead of NaN
+          vram: parseInt(memory, 10) || 0,
+          driverVersion: driver,
+          computeCapability: computeCap,
+          pciSlot: this.extractPciSlot(pciId),
+          pciBusId: pciId,
+          index: parseInt(index, 10),
+        });
+      }
+
+      if (gpus.length > 0) {
+        const cudaVersion = await this.detectCudaVersion();
+        if (cudaVersion) {
+          for (const gpu of gpus) {
+            gpu.cudaVersion = cudaVersion;
+          }
+        }
+      }
+    } catch {
+      // nvidia-smi not available or failed
+      logger.dim('NVIDIA GPU detection: nvidia-smi not available');
+    }
+
+    return gpus;
+  }
+
+  /**
+   * Look up the CUDA version.
+   *
+   * The previous combined command printed the driver version before the nvcc
+   * release, so a "first number" match always captured the driver version. Each
+   * source is now queried on its own and matched against its own label.
+   *
+   * @returns The nvcc toolkit release if nvcc is installed, otherwise the
+   *          "CUDA Version" shown in the nvidia-smi header, otherwise undefined
+   */
+  private async detectCudaVersion(): Promise<string | undefined> {
+    try {
+      const { stdout } = await execAsync('nvcc --version 2>/dev/null', { timeout: 5000 });
+      const toolkit = stdout.match(/release\s+(\d+\.\d+)/);
+      if (toolkit) return toolkit[1];
+    } catch {
+      // nvcc is not installed; fall back to the driver-reported version below
+    }
+
+    try {
+      const { stdout } = await execAsync('nvidia-smi', { timeout: 5000 });
+      const header = stdout.match(/CUDA Version:\s*(\d+\.\d+)/);
+      return header ? header[1] : undefined;
+    } catch {
+      // CUDA version detection failed, that's okay
+      return undefined;
+    }
+  }
+
+  /**
+   * Detect AMD GPUs using rocm-smi, falling back to lspci.
+   *
+   * rocm-smi output is handled in two shapes: if it contains a `{` it is parsed as
+   * JSON and every top-level key starting with "card" becomes a GPU; otherwise each
+   * line mentioning "GPU", "Radeon" or "AMD" becomes a GPU with no VRAM or PCI data.
+   * If the rocm-smi command or its parsing throws, lspci is queried instead and VRAM
+   * is read from sysfs per device.
+   *
+   * @returns Detected AMD GPUs; empty when both approaches fail
+   */
+  private async detectAmdGpus(): Promise<IGpuInfo[]> {
+    const gpus: IGpuInfo[] = [];
+
+    try {
+      // Try rocm-smi first
+      const { stdout } = await execAsync(
+        'rocm-smi --showproductname --showmeminfo vram --showdriverversion --showbus --csv 2>/dev/null || rocm-smi -a --json 2>/dev/null',
+        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
+      );
+
+      if (stdout.includes('{')) {
+        // JSON output
+        const data = JSON.parse(stdout);
+        const cards = Object.entries(data).filter(([key]) => key.startsWith('card'));
+
+        // The ROCm version is the same for every card, so it is looked up once
+        // (and not at all when there are no cards) instead of once per card.
+        const rocmVersion = cards.length > 0 ? await this.getRocmVersion() : undefined;
+
+        cards.forEach(([, value], index) => {
+          const cardData = value as Record<string, unknown>;
+          gpus.push({
+            id: `amd-${index}`,
+            vendor: 'amd',
+            model: String(cardData['Card series'] || cardData['card_series'] || 'AMD GPU'),
+            vram: this.parseMemory(String(cardData['VRAM Total Memory (B)'] || cardData['vram_total'] || '0')),
+            driverVersion: String(cardData['Driver version'] || cardData['driver_version'] || ''),
+            rocmVersion,
+            pciSlot: String(cardData['PCI Bus'] || cardData['pci_bus'] || ''),
+            pciBusId: String(cardData['PCI Bus'] || cardData['pci_bus'] || ''),
+            index,
+          });
+        });
+      } else {
+        // CSV output - parse line by line
+        const lines = stdout.trim().split('\n');
+        let index = 0;
+        for (const line of lines) {
+          if (line.includes('GPU') || line.includes('Radeon') || line.includes('AMD')) {
+            // This is a GPU entry; the whole line is kept as the model name
+            gpus.push({
+              id: `amd-${index}`,
+              vendor: 'amd',
+              model: line.trim(),
+              vram: 0, // Will need additional parsing
+              pciSlot: '',
+              index: index++,
+            });
+          }
+        }
+      }
+    } catch {
+      // rocm-smi not available, try lspci
+      try {
+        const { stdout: lspciOut } = await execAsync(
+          'lspci -nn | grep -i "VGA\\|3D\\|Display" | grep -i "AMD\\|ATI\\|Radeon"',
+          { timeout: 5000 },
+        );
+
+        const lines = lspciOut.trim().split('\n').filter((l: string) => l.trim());
+        let index = 0;
+        for (const line of lines) {
+          // Group 1 is the leading PCI address, group 2 the text after the last ": "
+          const match = line.match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
+          if (match) {
+            gpus.push({
+              id: `amd-${index}`,
+              vendor: 'amd',
+              model: match[2].trim(),
+              vram: await this.getAmdVramFromSysfs(match[1]),
+              pciSlot: match[1],
+              pciBusId: match[1],
+              index: index++,
+            });
+          }
+        }
+      } catch {
+        logger.dim('AMD GPU detection: rocm-smi and lspci detection failed');
+      }
+    }
+
+    return gpus;
+  }
+
+  /**
+   * Detect Intel GPUs using xpu-smi, falling back to lspci.
+   *
+   * `xpu-smi discovery --json` is parsed and every entry of `device_list` becomes a
+   * GPU. If the command or its parsing throws, lspci is queried instead and only
+   * lines containing "arc" are kept, so integrated graphics are skipped.
+   *
+   * @returns Detected Intel GPUs; empty when both approaches fail
+   */
+  private async detectIntelGpus(): Promise<IGpuInfo[]> {
+    const gpus: IGpuInfo[] = [];
+
+    try {
+      // Try xpu-smi first (for Intel Arc GPUs)
+      const { stdout } = await execAsync(
+        'xpu-smi discovery --json 2>/dev/null',
+        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
+      );
+
+      const data = JSON.parse(stdout);
+      const devices = data.device_list;
+      if (devices) {
+        // The oneAPI version does not vary per device, so it is looked up once
+        // (and not at all for an empty list) instead of once per device.
+        const oneApiVersion = devices.length > 0 ? await this.getOneApiVersion() : undefined;
+
+        let index = 0;
+        for (const device of devices) {
+          gpus.push({
+            id: `intel-${index}`,
+            vendor: 'intel',
+            model: device.device_name || 'Intel GPU',
+            // Reported in bytes; stored in MB
+            vram: device.memory_physical_size_byte
+              ? Math.round(device.memory_physical_size_byte / (1024 * 1024))
+              : 0,
+            oneApiVersion,
+            pciSlot: device.pci_bdf || '',
+            pciBusId: device.pci_bdf || '',
+            index: index++,
+          });
+        }
+      }
+    } catch {
+      // xpu-smi not available, try lspci
+      try {
+        const { stdout: lspciOut } = await execAsync(
+          'lspci -nn | grep -i "VGA\\|3D\\|Display" | grep -i "Intel.*Arc\\|Intel.*Graphics"',
+          { timeout: 5000 },
+        );
+
+        const lines = lspciOut.trim().split('\n').filter((l: string) => l.trim());
+        let index = 0;
+        for (const line of lines) {
+          // Skip integrated graphics, only look for discrete Arc GPUs
+          if (line.toLowerCase().includes('arc')) {
+            const match = line.match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
+            if (match) {
+              gpus.push({
+                id: `intel-${index}`,
+                vendor: 'intel',
+                model: match[2].trim(),
+                vram: 0, // Intel Arc VRAM detection needs sysfs
+                pciSlot: match[1],
+                pciBusId: match[1],
+                index: index++,
+              });
+            }
+          }
+        }
+      } catch {
+        logger.dim('Intel GPU detection: xpu-smi and lspci detection failed');
+      }
+    }
+
+    return gpus;
+  }
+
+  /**
+   * Vendor-agnostic fallback: list display-class PCI devices with lspci.
+   *
+   * Every matching lspci line yields one entry whose vendor is guessed from the
+   * device description; VRAM is not available this way and is reported as 0.
+   *
+   * @returns One entry per parsed lspci line; empty when the command fails
+   */
+  private async detectGenericGpus(): Promise<IGpuInfo[]> {
+    const found: IGpuInfo[] = [];
+
+    // Guess the vendor from the free-text device description
+    const classify = (description: string): TGpuVendor => {
+      if (/nvidia/i.test(description)) return 'nvidia';
+      if (/amd|ati|radeon/i.test(description)) return 'amd';
+      if (/intel/i.test(description)) return 'intel';
+      return 'unknown';
+    };
+
+    try {
+      const result = await execAsync(
+        'lspci -nn | grep -i "VGA\\|3D\\|Display"',
+        { timeout: 5000 },
+      );
+
+      for (const row of result.stdout.trim().split('\n')) {
+        if (!row.trim()) continue;
+
+        const parsed = row.match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
+        if (!parsed) continue;
+
+        const slot = parsed[1];
+        const description = parsed[2].trim();
+        // Entries are numbered in the order they are accepted
+        const position = found.length;
+
+        found.push({
+          id: `gpu-${position}`,
+          vendor: classify(description),
+          model: description,
+          vram: 0,
+          pciSlot: slot,
+          pciBusId: slot,
+          index: position,
+        });
+      }
+    } catch {
+      logger.dim('Generic GPU detection: lspci not available');
+    }
+
+    return found;
+  }
+
+  /**
+   * Fetch the current status of one GPU, identified by its detector id.
+   *
+   * @param gpuId Id as produced by detectGpus()
+   * @returns Status from the vendor-specific query, a zeroed status for an
+   *          unknown vendor, or null when no detected GPU has that id
+   */
+  public async getGpuStatus(gpuId: string): Promise<IGpuStatus | null> {
+    const detected = await this.detectGpus();
+    const target = detected.find((candidate) => candidate.id === gpuId);
+
+    if (target === undefined) {
+      return null;
+    }
+
+    switch (target.vendor) {
+      case 'nvidia':
+        return this.getNvidiaGpuStatus(target);
+      case 'amd':
+        return this.getAmdGpuStatus(target);
+      case 'intel':
+        return this.getIntelGpuStatus(target);
+      default:
+        // No vendor tool to ask: report only what detection already knows
+        return {
+          id: gpuId,
+          utilization: 0,
+          memoryUsed: 0,
+          memoryTotal: target.vram,
+          memoryPercent: 0,
+          temperature: 0,
+          powerUsage: 0,
+          powerLimit: 0,
+          lastUpdate: Date.now(),
+        };
+    }
+  }
+
+  /**
+   * Collect the current status of every detected GPU.
+   *
+   * GPUs are queried one after another, in detection order.
+   *
+   * @returns Map from GPU id to its status; GPUs whose lookup returned null are omitted
+   */
+  public async getAllGpuStatus(): Promise<Map<string, IGpuStatus>> {
+    const detected = await this.detectGpus();
+    const byId = new Map<string, IGpuStatus>();
+
+    for (const { id } of detected) {
+      const current = await this.getGpuStatus(id);
+      if (current === null) continue;
+      byId.set(id, current);
+    }
+
+    return byId;
+  }
+
+  /**
+   * Get NVIDIA GPU status using nvidia-smi.
+   *
+   * Queries one CSV row for the GPU at `gpu.index`. Cells that are missing or not
+   * numeric (for example "[N/A]") become 0 for required fields and undefined for
+   * the optional fan/clock fields, so NaN never reaches the caller.
+   *
+   * @param gpu GPU to query; `index` selects the device, `vram` is the fallback total
+   * @returns Current status, or a zeroed status when the query fails
+   */
+  private async getNvidiaGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
+    // Integer cell, or undefined when the cell is absent or not a number
+    const toInt = (raw: string | undefined): number | undefined => {
+      const parsed = parseInt(raw ?? '', 10);
+      return Number.isNaN(parsed) ? undefined : parsed;
+    };
+    // Floating-point cell, or undefined when the cell is absent or not a number
+    const toFloat = (raw: string | undefined): number | undefined => {
+      const parsed = parseFloat(raw ?? '');
+      return Number.isNaN(parsed) ? undefined : parsed;
+    };
+
+    try {
+      const { stdout } = await execAsync(
+        `nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,clocks.gr,clocks.mem --format=csv,noheader,nounits -i ${gpu.index}`,
+        { timeout: 5000 },
+      );
+
+      const parts = stdout.trim().split(',').map((p: string) => p.trim());
+      const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] = parts;
+
+      const memoryUsed = toInt(memUsed) ?? 0;
+      // Fall back to the total recorded at detection time when the cell is unusable
+      const memoryTotal = toInt(memTotal) || gpu.vram;
+
+      return {
+        id: gpu.id,
+        utilization: toInt(utilization) ?? 0,
+        memoryUsed,
+        memoryTotal,
+        // Guard the division so an unknown or zero total yields 0 rather than NaN/Infinity
+        memoryPercent: memoryTotal > 0 ? Math.round((memoryUsed / memoryTotal) * 100) : 0,
+        temperature: toInt(temp) ?? 0,
+        powerUsage: toFloat(power) ?? 0,
+        powerLimit: toFloat(powerLimit) ?? 0,
+        fanSpeed: toInt(fan),
+        gpuClock: toInt(gpuClock),
+        memoryClock: toInt(memClock),
+        lastUpdate: Date.now(),
+      };
+    } catch {
+      // nvidia-smi failed or timed out: report a zeroed status with the known total
+      return {
+        id: gpu.id,
+        utilization: 0,
+        memoryUsed: 0,
+        memoryTotal: gpu.vram,
+        memoryPercent: 0,
+        temperature: 0,
+        powerUsage: 0,
+        powerLimit: 0,
+        lastUpdate: Date.now(),
+      };
+    }
+  }
+
+  /**
+   * Get AMD GPU status using rocm-smi.
+   *
+   * Reads the `card<index>` object from the JSON report. rocm-smi is only asked for
+   * memory use as a percentage, so `memoryUsed` is derived from that percentage and
+   * the total VRAM recorded at detection time. (Previously the percentage string was
+   * interpreted as a byte count, which made `memoryUsed` effectively always 0.)
+   *
+   * @param gpu GPU to query; `index` selects the device, `vram` is the total in MB
+   * @returns Current status, or a zeroed status when the query or parsing fails
+   */
+  private async getAmdGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
+    try {
+      const { stdout } = await execAsync(
+        `rocm-smi -d ${gpu.index} --showuse --showmemuse --showtemp --showpower --json 2>/dev/null`,
+        { timeout: 5000 },
+      );
+
+      const data = JSON.parse(stdout);
+      const cardKey = `card${gpu.index}`;
+      const cardData = data[cardKey] || {};
+
+      // A missing or non-numeric percentage counts as 0
+      const memoryPercent = parseInt(cardData['GPU memory use (%)'] || '0', 10) || 0;
+
+      return {
+        id: gpu.id,
+        utilization: parseInt(cardData['GPU use (%)'] || '0', 10),
+        // Estimate in MB: percentage of the detected total
+        memoryUsed: Math.round((gpu.vram * memoryPercent) / 100),
+        memoryTotal: gpu.vram,
+        memoryPercent,
+        temperature: parseFloat(cardData['Temperature (Sensor edge) (C)'] || '0'),
+        powerUsage: parseFloat(cardData['Average Graphics Package Power (W)'] || '0'),
+        powerLimit: parseFloat(cardData['Max Graphics Package Power (W)'] || '0'),
+        lastUpdate: Date.now(),
+      };
+    } catch {
+      // rocm-smi failed, timed out, or printed something that is not JSON
+      return {
+        id: gpu.id,
+        utilization: 0,
+        memoryUsed: 0,
+        memoryTotal: gpu.vram,
+        memoryPercent: 0,
+        temperature: 0,
+        powerUsage: 0,
+        powerLimit: 0,
+        lastUpdate: Date.now(),
+      };
+    }
+  }
+
+  /**
+   * Query xpu-smi for the live statistics of one Intel GPU.
+   *
+   * Values are taken from the `device_level` object of the JSON report; fields
+   * that are absent are read as 0.
+   *
+   * @param gpu GPU to query; `index` selects the device, `vram` is reported as the total
+   * @returns Current status, or a zeroed status when the command or parsing fails
+   */
+  private async getIntelGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
+    try {
+      const result = await execAsync(
+        `xpu-smi stats -d ${gpu.index} --json 2>/dev/null`,
+        { timeout: 5000 },
+      );
+
+      const report = JSON.parse(result.stdout);
+      const level = report.device_level || {};
+      // Numeric value of one report field; a missing/empty field is read as 0
+      const field = (name: string): number => parseFloat(level[name] || '0');
+
+      const status: IGpuStatus = {
+        id: gpu.id,
+        utilization: Math.round(field('gpu_utilization')),
+        memoryUsed: Math.round(field('memory_used') / (1024 * 1024)),
+        memoryTotal: gpu.vram,
+        memoryPercent: Math.round(field('memory_utilization')),
+        temperature: field('gpu_temperature'),
+        powerUsage: field('power'),
+        powerLimit: 0, // Intel doesn't expose this easily
+        lastUpdate: Date.now(),
+      };
+      return status;
+    } catch {
+      const fallback: IGpuStatus = {
+        id: gpu.id,
+        utilization: 0,
+        memoryUsed: 0,
+        memoryTotal: gpu.vram,
+        memoryPercent: 0,
+        temperature: 0,
+        powerUsage: 0,
+        powerLimit: 0,
+        lastUpdate: Date.now(),
+      };
+      return fallback;
+    }
+  }
+
+  /**
+   * Reduce a full PCI bus id to its trailing `bus:device.function` part.
+   *
+   * @param pciId Bus id, e.g. "00000000:01:00.0"
+   * @returns The trailing slot, e.g. "01:00.0"; the input unchanged if it has no such suffix
+   */
+  private extractPciSlot(pciId: string): string {
+    const tail = /([0-9a-f]+:[0-9a-f]+\.[0-9a-f]+)$/i.exec(pciId);
+    if (tail === null) {
+      return pciId;
+    }
+    return tail[1];
+  }
+
+  /**
+   * Convert a memory string into whole megabytes.
+   *
+   * The first number in the string is used, together with an optional unit that
+   * follows it. Units are case-insensitive and may be written either as B/KB/MB/GB/TB
+   * or as KiB/MiB/GiB/TiB; both spellings are treated as powers of 1024. A number
+   * without a unit is taken to be bytes.
+   *
+   * @param value Text such as "8192 MB", "16 GiB" or "17163091968"
+   * @returns The amount rounded to the nearest MB; 0 when the text contains no number
+   */
+  private parseMemory(value: string): number {
+    const match = value.match(/(\d+(?:\.\d+)?)\s*([KMGT]?i?B)?/i);
+    if (!match) return 0;
+
+    // Megabytes per single unit
+    const mbPerUnit: Record<string, number> = {
+      B: 1 / (1024 * 1024),
+      KB: 1 / 1024,
+      MB: 1,
+      GB: 1024,
+      TB: 1024 * 1024,
+    };
+
+    // Normalise "GiB" -> "GB" etc. so both spellings share one table entry
+    const unit = (match[2] || 'B').toUpperCase().replace('I', '');
+    const factor = mbPerUnit[unit] ?? 0;
+
+    return Math.round(parseFloat(match[1]) * factor);
+  }
+
+  /**
+   * Get AMD VRAM from sysfs (async).
+   *
+   * Reads `mem_info_vram_total` for the device, treats the content as a byte count
+   * and converts it to MB. The file is read directly; a missing or unreadable file
+   * is handled by the catch, so no separate existence check is needed.
+   *
+   * @param pciBusId PCI address without the domain, e.g. "03:00.0"; "0000:" is prepended
+   * @returns VRAM in MB, or 0 when the file cannot be read or is not numeric
+   */
+  private async getAmdVramFromSysfs(pciBusId: string): Promise<number> {
+    const sysfsPath = `/sys/bus/pci/devices/0000:${pciBusId}/mem_info_vram_total`;
+
+    try {
+      const content = await fs.promises.readFile(sysfsPath, 'utf8');
+      const totalBytes = parseInt(content.trim(), 10);
+      // Non-numeric content would otherwise propagate NaN into the GPU record
+      return Number.isNaN(totalBytes) ? 0 : Math.round(totalBytes / (1024 * 1024));
+    } catch {
+      // sysfs not available
+      return 0;
+    }
+  }
+
+  /**
+   * Get ROCm version.
+   *
+   * Prints `/opt/rocm/.info/version`, or failing that the first "ROCm" line of
+   * `rocminfo`, and extracts the first `major.minor[.patch]` number from the output.
+   * The command is bounded by a timeout like the other probes in this class, so a
+   * hanging tool cannot stall detection.
+   *
+   * @returns The version string, or undefined when none is found or the command fails
+   */
+  private async getRocmVersion(): Promise<string | undefined> {
+    try {
+      const { stdout } = await execAsync(
+        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
+        { timeout: 5000 },
+      );
+      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
+      return match ? match[1] : undefined;
+    } catch {
+      return undefined;
+    }
+  }
+
+  /**
+   * Get oneAPI version.
+   *
+   * Runs the oneAPI environment script and echoes `$ONEAPI_ROOT`, or failing that
+   * greps `VERSION` from the compiler's vars.sh, then extracts the first
+   * `major.minor[.patch]` number from the output. The command is bounded by a
+   * timeout like the other probes in this class.
+   *
+   * NOTE(review): exec runs the command through /bin/sh by default, and `source` is
+   * not a POSIX builtin, so on systems where /bin/sh is not bash the first branch
+   * presumably always fails and only the vars.sh fallback is used — confirm.
+   * `$ONEAPI_ROOT` also looks like an install path rather than a version — verify
+   * that the regex finds what is intended.
+   *
+   * @returns The version string, or undefined when none is found or the command fails
+   */
+  private async getOneApiVersion(): Promise<string | undefined> {
+    try {
+      const { stdout } = await execAsync(
+        'source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION',
+        { timeout: 5000 },
+      );
+      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
+      return match ? match[1] : undefined;
+    } catch {
+      return undefined;
+    }
+  }
+}