modelgrid/ts/hardware/gpu-detector.ts

/**
 * GPU Detector
 *
 * Detects GPUs on the system (NVIDIA, AMD, Intel Arc) and retrieves their information.
 */

import { exec } from 'node:child_process';
import { promisify } from 'node:util';
import * as fs from 'node:fs';
import type { IGpuInfo, IGpuStatus, TGpuVendor } from '../interfaces/gpu.ts';
import { logger } from '../logger.ts';
import { TIMING } from '../constants.ts';

// Promise-returning wrapper around child_process.exec so shell commands can be awaited.
const execAsync = promisify(exec);

/**
 * GPU Detector class for detecting and querying GPU information.
 *
 * Detection shells out to vendor tools (nvidia-smi, rocm-smi, xpu-smi) and
 * falls back to lspci when a vendor tool is unavailable. Every probe is
 * best-effort: a failing tool yields an empty list or a zeroed status rather
 * than an exception.
 */
export class GpuDetector {
  private cachedGpus: IGpuInfo[] | null = null;
  private cacheTime: number = 0;
  // NOTE(review): the cache lifetime reuses the detection-timeout constant — confirm this is intended.
  private readonly cacheDuration = TIMING.GPU_DETECTION_TIMEOUT_MS;

  /**
   * Detect all GPUs on the system
   * @param forceRefresh Force refresh even if cache is valid
   * @returns Array of detected GPU information (NVIDIA first, then AMD, then Intel)
   */
  public async detectGpus(forceRefresh: boolean = false): Promise<IGpuInfo[]> {
    // Return cached data if still valid
    if (!forceRefresh && this.cachedGpus && Date.now() - this.cacheTime < this.cacheDuration) {
      return this.cachedGpus;
    }

    // The vendor probes are independent and each swallows its own errors,
    // so they can safely run side by side.
    const [nvidiaGpus, amdGpus, intelGpus] = await Promise.all([
      this.detectNvidiaGpus(),
      this.detectAmdGpus(),
      this.detectIntelGpus(),
    ]);
    const gpus: IGpuInfo[] = [...nvidiaGpus, ...amdGpus, ...intelGpus];

    // If no GPUs found via specific tools, try generic detection
    if (gpus.length === 0) {
      gpus.push(...(await this.detectGenericGpus()));
    }

    // Update cache
    this.cachedGpus = gpus;
    this.cacheTime = Date.now();

    return gpus;
  }

  /**
   * Detect NVIDIA GPUs using nvidia-smi
   * @returns One entry per CSV row reported by nvidia-smi; empty when the tool is missing or fails
   */
  private async detectNvidiaGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];

    try {
      const { stdout } = await execAsync(
        'nvidia-smi --query-gpu=index,gpu_uuid,name,memory.total,driver_version,pci.bus_id,compute_cap --format=csv,noheader,nounits',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );

      for (const line of stdout.split('\n')) {
        const parts = line.split(',').map((p: string) => p.trim());
        if (parts.length < 7) {
          continue; // blank or malformed row
        }
        // Second column (gpu_uuid) is queried but not used.
        const [index, , name, memory, driver, pciId, computeCap] = parts;
        const gpuIndex = parseInt(index, 10);
        if (Number.isNaN(gpuIndex)) {
          continue; // not a data row
        }

        gpus.push({
          id: `nvidia-${index}`,
          vendor: 'nvidia',
          model: name,
          vram: parseInt(memory, 10) || 0, // Already in MB; "[N/A]" becomes 0
          driverVersion: driver,
          computeCapability: computeCap,
          pciSlot: this.extractPciSlot(pciId),
          pciBusId: pciId,
          index: gpuIndex,
        });
      }

      // Get CUDA version separately
      if (gpus.length > 0) {
        const cudaVersion = await this.getCudaVersion();
        if (cudaVersion) {
          for (const gpu of gpus) {
            gpu.cudaVersion = cudaVersion;
          }
        }
      }
    } catch {
      // nvidia-smi not available or failed
      logger.dim('NVIDIA GPU detection: nvidia-smi not available');
    }

    return gpus;
  }

  /**
   * Determine the CUDA version.
   *
   * Prefers the toolkit release reported by `nvcc --version`; when nvcc is not
   * installed, falls back to the "CUDA Version" shown in the nvidia-smi banner.
   * @returns "major.minor" or undefined when neither source is available
   */
  private async getCudaVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync('nvcc --version 2>/dev/null', { timeout: 5000 });
      const toolkit = this.parseCudaVersion(stdout);
      if (toolkit) {
        return toolkit;
      }
    } catch {
      // nvcc not installed, fall through to nvidia-smi
    }

    try {
      const { stdout } = await execAsync('nvidia-smi', { timeout: 5000 });
      return this.parseCudaVersion(stdout);
    } catch {
      // CUDA version detection failed, that's okay
      return undefined;
    }
  }

  /**
   * Extract a CUDA "major.minor" version from nvcc or nvidia-smi output.
   *
   * Only labelled values are accepted ("release X.Y" or "CUDA Version: X.Y") so
   * that an unlabelled driver version such as "535.104.05" is never mistaken
   * for a CUDA version.
   */
  private parseCudaVersion(text: string): string | undefined {
    const match = text.match(/release\s+(\d+\.\d+)/i) ?? text.match(/CUDA Version:\s*(\d+\.\d+)/i);
    return match ? match[1] : undefined;
  }

  /**
   * Detect AMD GPUs using rocm-smi, falling back to lspci + sysfs
   * @returns Detected AMD GPUs; empty when neither rocm-smi nor lspci yields any
   */
  private async detectAmdGpus(): Promise<IGpuInfo[]> {
    try {
      // Ask for JSON first: it is unambiguous, whereas CSV cells may themselves contain commas.
      const { stdout } = await execAsync(
        'rocm-smi --showproductname --showmeminfo vram --showdriverversion --showbus --json 2>/dev/null || rocm-smi --showproductname --showmeminfo vram --showdriverversion --showbus --csv 2>/dev/null || rocm-smi -a --json 2>/dev/null',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );

      const { cards, driverVersion } = this.parseRocmSmiCards(stdout);
      // Resolve the ROCm version once rather than spawning a process per GPU.
      const rocmVersion = cards.length > 0 ? await this.getRocmVersion() : undefined;
      return cards.map((card, index) => this.buildAmdGpu(card, index, driverVersion, rocmVersion));
    } catch {
      // rocm-smi not available (or its output was unparsable), try lspci
    }

    const gpus: IGpuInfo[] = [];
    try {
      // Vendor filtering happens in code: a case-insensitive grep for "ATI"
      // also matches the word "Corporation" and would pull in NVIDIA/Intel devices.
      const { stdout: lspciOut } = await execAsync(
        'lspci -nn | grep -i "VGA\\|3D\\|Display"',
        { timeout: 5000 },
      );

      for (const device of this.parseLspciOutput(lspciOut)) {
        if (this.classifyVendor(device.model) !== 'amd') {
          continue;
        }
        const index = gpus.length;
        gpus.push({
          id: `amd-${index}`,
          vendor: 'amd',
          model: device.model,
          vram: await this.getAmdVramFromSysfs(device.slot),
          pciSlot: this.extractPciSlot(device.slot),
          pciBusId: device.slot,
          index,
        });
      }
    } catch {
      logger.dim('AMD GPU detection: rocm-smi and lspci detection failed');
    }

    return gpus;
  }

  /**
   * Split rocm-smi output into per-card records.
   *
   * Output containing "{" is treated as JSON (top-level keys starting with
   * "card" whose value is an object); anything else is treated as CSV.
   * @returns The card records plus the driver version from the JSON "system"
   *          section when present ('' otherwise)
   * @throws SyntaxError when the output looks like JSON but cannot be parsed
   */
  private parseRocmSmiCards(
    stdout: string,
  ): { cards: Array<Record<string, unknown>>; driverVersion: string } {
    if (!stdout.includes('{')) {
      return { cards: this.parseRocmSmiCsv(stdout), driverVersion: '' };
    }

    const data = JSON.parse(stdout) as Record<string, unknown>;
    const cards: Array<Record<string, unknown>> = [];
    for (const [key, value] of Object.entries(data)) {
      if (key.startsWith('card') && typeof value === 'object' && value !== null) {
        cards.push(value as Record<string, unknown>);
      }
    }

    // NOTE(review): rocm-smi appears to report the driver version under a
    // top-level "system" entry rather than per card — confirm against real output.
    const system = data['system'];
    const driverVersion = typeof system === 'object' && system !== null
      ? String((system as Record<string, unknown>)['Driver version'] || '')
      : '';

    return { cards, driverVersion };
  }

  /**
   * Parse rocm-smi CSV output into per-card records keyed by column name.
   *
   * Rows whose cell count does not match the header (for example because a
   * value contains a comma) cannot be mapped reliably; for those only the raw
   * row is kept as the model name. If no "cardN," rows are present at all, the
   * legacy keyword heuristic (lines mentioning GPU/Radeon/AMD) is used.
   */
  private parseRocmSmiCsv(stdout: string): Array<Record<string, unknown>> {
    const lines = stdout.split('\n').map((l: string) => l.trim()).filter((l: string) => l.length > 0);
    const headerLine = lines.find((l: string) => /^device\s*,/i.test(l));
    const header = headerLine ? headerLine.split(',').map((c: string) => c.trim()) : [];
    const cardRows = lines.filter((l: string) => /^card\d+\s*,/i.test(l));

    if (cardRows.length === 0) {
      return lines
        .filter((l: string) =>
          l !== headerLine && (l.includes('GPU') || l.includes('Radeon') || l.includes('AMD'))
        )
        .map((l: string) => ({ 'Card series': l }));
    }

    return cardRows.map((row: string) => {
      const cells = row.split(',').map((c: string) => c.trim());
      if (cells.length !== header.length) {
        return { 'Card series': row };
      }
      const record: Record<string, unknown> = {};
      header.forEach((name: string, i: number) => {
        record[name] = cells[i];
      });
      return record;
    });
  }

  /**
   * Build an IGpuInfo from one rocm-smi card record.
   * @param card Key/value pairs for a single card
   * @param index Zero-based position of the card in the rocm-smi output
   * @param systemDriverVersion Driver version to use when the card record has none
   * @param rocmVersion ROCm version shared by all cards, if known
   */
  private buildAmdGpu(
    card: Record<string, unknown>,
    index: number,
    systemDriverVersion: string,
    rocmVersion: string | undefined,
  ): IGpuInfo {
    // First truthy value among the candidate keys, as a string ('' when none).
    const pick = (...keys: string[]): string => {
      for (const key of keys) {
        const value = card[key];
        if (value) {
          return String(value);
        }
      }
      return '';
    };

    const pciBusId = pick('PCI Bus', 'pci_bus');

    return {
      id: `amd-${index}`,
      vendor: 'amd',
      model: pick('Card series', 'Card Series', 'card_series') || 'AMD GPU',
      vram: this.parseMemory(pick('VRAM Total Memory (B)', 'vram_total') || '0'),
      driverVersion: pick('Driver version', 'driver_version') || systemDriverVersion,
      rocmVersion,
      pciSlot: this.extractPciSlot(pciBusId),
      pciBusId,
      index,
    };
  }

  /**
   * Detect Intel GPUs using xpu-smi, falling back to lspci (discrete Arc only)
   * @returns Detected Intel GPUs; empty when neither tool yields any
   */
  private async detectIntelGpus(): Promise<IGpuInfo[]> {
    try {
      // Try xpu-smi first (for Intel Arc GPUs)
      const { stdout } = await execAsync(
        'xpu-smi discovery --json 2>/dev/null',
        { timeout: TIMING.GPU_DETECTION_TIMEOUT_MS },
      );

      const data = JSON.parse(stdout);
      const gpus: IGpuInfo[] = [];
      if (data.device_list) {
        const devices = [...data.device_list];
        // Resolve the oneAPI version once rather than spawning a process per GPU.
        const oneApiVersion = devices.length > 0 ? await this.getOneApiVersion() : undefined;

        devices.forEach((device, index: number) => {
          const pciBusId = String(device.pci_bdf || '');
          gpus.push({
            id: `intel-${index}`,
            vendor: 'intel',
            model: device.device_name || 'Intel GPU',
            // Bytes -> MB; missing or non-numeric sizes become 0.
            vram: Math.round(Number(device.memory_physical_size_byte) / (1024 * 1024)) || 0,
            oneApiVersion,
            pciSlot: this.extractPciSlot(pciBusId),
            pciBusId,
            index,
          });
        });
      }
      return gpus;
    } catch {
      // xpu-smi not available (or its output was unparsable), try lspci
    }

    const gpus: IGpuInfo[] = [];
    try {
      const { stdout: lspciOut } = await execAsync(
        'lspci -nn | grep -i "VGA\\|3D\\|Display" | grep -i "Intel.*Arc\\|Intel.*Graphics"',
        { timeout: 5000 },
      );

      for (const device of this.parseLspciOutput(lspciOut)) {
        // Skip integrated graphics, only look for discrete Arc GPUs
        if (!device.raw.toLowerCase().includes('arc')) {
          continue;
        }
        const index = gpus.length;
        gpus.push({
          id: `intel-${index}`,
          vendor: 'intel',
          model: device.model,
          vram: 0, // Intel Arc VRAM detection needs sysfs
          pciSlot: this.extractPciSlot(device.slot),
          pciBusId: device.slot,
          index,
        });
      }
    } catch {
      logger.dim('Intel GPU detection: xpu-smi and lspci detection failed');
    }

    return gpus;
  }

  /**
   * Generic GPU detection using lspci
   * @returns Every VGA/3D/Display device listed by lspci, with a best-guess vendor and unknown VRAM (0)
   */
  private async detectGenericGpus(): Promise<IGpuInfo[]> {
    const gpus: IGpuInfo[] = [];

    try {
      const { stdout } = await execAsync(
        'lspci -nn | grep -i "VGA\\|3D\\|Display"',
        { timeout: 5000 },
      );

      for (const device of this.parseLspciOutput(stdout)) {
        const index = gpus.length;
        gpus.push({
          id: `gpu-${index}`,
          vendor: this.classifyVendor(device.model),
          model: device.model,
          vram: 0,
          pciSlot: this.extractPciSlot(device.slot),
          pciBusId: device.slot,
          index,
        });
      }
    } catch {
      logger.dim('Generic GPU detection: lspci not available');
    }

    return gpus;
  }

  /**
   * Parse `lspci -nn` lines of the form "<slot> <class>: <description>".
   * Lines that do not match that shape are ignored.
   * @returns For each device: the PCI slot, the description after the class, and the trimmed raw line
   */
  private parseLspciOutput(stdout: string): Array<{ slot: string; model: string; raw: string }> {
    const devices: Array<{ slot: string; model: string; raw: string }> = [];

    for (const line of stdout.split('\n')) {
      const raw = line.trim();
      const match = raw.match(/^([0-9a-f:.]+)\s+.*:\s+(.+)$/i);
      if (match) {
        devices.push({ slot: match[1], model: match[2].trim(), raw });
      }
    }

    return devices;
  }

  /**
   * Guess the GPU vendor from an lspci device description.
   *
   * AMD/ATI/Radeon are matched as whole words: a plain substring test for
   * "ati" also hits "Corporation" and would label Intel devices as AMD.
   */
  private classifyVendor(model: string): TGpuVendor {
    if (/nvidia/i.test(model)) {
      return 'nvidia';
    }
    if (/\b(?:amd|ati|radeon)\b|advanced micro devices/i.test(model)) {
      return 'amd';
    }
    if (/intel/i.test(model)) {
      return 'intel';
    }
    return 'unknown';
  }

  /**
   * Get real-time status for a specific GPU
   * @param gpuId Identifier as returned in IGpuInfo.id (e.g. "nvidia-0")
   * @returns Current status, or null when no detected GPU has that id
   */
  public async getGpuStatus(gpuId: string): Promise<IGpuStatus | null> {
    const gpus = await this.detectGpus();
    const gpu = gpus.find((g) => g.id === gpuId);

    return gpu ? this.queryGpuStatus(gpu) : null;
  }

  /**
   * Get real-time status for all GPUs
   * @returns Map from GPU id to status, in detection order
   */
  public async getAllGpuStatus(): Promise<Map<string, IGpuStatus>> {
    const gpus = await this.detectGpus();
    // Query each GPU directly (no per-GPU re-detection) and in parallel;
    // the vendor status methods never reject.
    const results = await Promise.all(gpus.map((gpu) => this.queryGpuStatus(gpu)));

    const statuses = new Map<string, IGpuStatus>();
    gpus.forEach((gpu, i) => {
      statuses.set(gpu.id, results[i]);
    });

    return statuses;
  }

  /**
   * Dispatch a status query to the vendor-specific implementation.
   * Unknown vendors get a zeroed status carrying only the known VRAM size.
   */
  private async queryGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    switch (gpu.vendor) {
      case 'nvidia':
        return this.getNvidiaGpuStatus(gpu);
      case 'amd':
        return this.getAmdGpuStatus(gpu);
      case 'intel':
        return this.getIntelGpuStatus(gpu);
      default:
        return this.emptyStatus(gpu);
    }
  }

  /**
   * Zeroed status used when live metrics are unavailable.
   */
  private emptyStatus(gpu: IGpuInfo): IGpuStatus {
    return {
      id: gpu.id,
      utilization: 0,
      memoryUsed: 0,
      memoryTotal: gpu.vram,
      memoryPercent: 0,
      temperature: 0,
      powerUsage: 0,
      powerLimit: 0,
      lastUpdate: Date.now(),
    };
  }

  /**
   * Get NVIDIA GPU status using nvidia-smi
   * @returns Live metrics, or a zeroed status when nvidia-smi fails
   */
  private async getNvidiaGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,clocks.gr,clocks.mem --format=csv,noheader,nounits -i ${gpu.index}`,
        { timeout: 5000 },
      );

      return this.parseNvidiaStatus(gpu, stdout);
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Convert one nvidia-smi CSV status row into an IGpuStatus.
   *
   * Fields that nvidia-smi reports as "[N/A]" (or that are missing) become 0
   * for required metrics and undefined for optional ones, never NaN.
   */
  private parseNvidiaStatus(gpu: IGpuInfo, stdout: string): IGpuStatus {
    const [firstLine = ''] = stdout.trim().split('\n');
    const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] =
      firstLine.split(',').map((p: string) => p.trim());

    const memoryUsed = this.parseOptionalInt(memUsed) ?? 0;
    const memoryTotal = this.parseOptionalInt(memTotal) || gpu.vram;

    return {
      id: gpu.id,
      utilization: this.parseOptionalInt(utilization) ?? 0,
      memoryUsed,
      memoryTotal,
      memoryPercent: memoryTotal > 0 ? Math.round((memoryUsed / memoryTotal) * 100) : 0,
      temperature: this.parseOptionalInt(temp) ?? 0,
      powerUsage: this.parseOptionalFloat(power) ?? 0,
      powerLimit: this.parseOptionalFloat(powerLimit) ?? 0,
      fanSpeed: this.parseOptionalInt(fan),
      gpuClock: this.parseOptionalInt(gpuClock),
      memoryClock: this.parseOptionalInt(memClock),
      lastUpdate: Date.now(),
    };
  }

  /**
   * Get AMD GPU status using rocm-smi
   * @returns Live metrics, or a zeroed status when rocm-smi fails
   */
  private async getAmdGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      // --showmeminfo vram adds exact used/total byte counters alongside the percentage.
      const { stdout } = await execAsync(
        `rocm-smi -d ${gpu.index} --showuse --showmemuse --showmeminfo vram --showtemp --showpower --json 2>/dev/null`,
        { timeout: 5000 },
      );

      const data = JSON.parse(stdout);
      const cardData = data[`card${gpu.index}`] || {};

      return this.parseAmdStatus(gpu, cardData);
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Convert one rocm-smi card record into an IGpuStatus.
   *
   * Memory usage comes from the VRAM byte counters when present; otherwise it
   * is derived from the reported percentage and the total VRAM.
   */
  private parseAmdStatus(gpu: IGpuInfo, cardData: Record<string, unknown>): IGpuStatus {
    // First non-empty value among the candidate keys, as a string.
    const field = (...keys: string[]): string | undefined => {
      for (const key of keys) {
        const value = cardData[key];
        if (value !== undefined && value !== null && value !== '') {
          return String(value);
        }
      }
      return undefined;
    };

    const totalRaw = field('VRAM Total Memory (B)');
    const usedRaw = field('VRAM Total Used Memory (B)');
    const memoryTotal = (totalRaw ? this.parseMemory(totalRaw) : 0) || gpu.vram;
    // NOTE(review): newer rocm-smi releases appear to label this value
    // "GPU Memory Allocated (VRAM%)" — confirm against real output.
    const reportedPercent =
      this.parseOptionalInt(field('GPU memory use (%)', 'GPU Memory Allocated (VRAM%)')) ?? 0;

    let memoryUsed: number;
    let memoryPercent: number;
    if (usedRaw !== undefined) {
      memoryUsed = this.parseMemory(usedRaw);
      memoryPercent = memoryTotal > 0
        ? Math.round((memoryUsed / memoryTotal) * 100)
        : reportedPercent;
    } else {
      memoryPercent = reportedPercent;
      memoryUsed = Math.round((memoryTotal * reportedPercent) / 100);
    }

    return {
      id: gpu.id,
      utilization: this.parseOptionalInt(field('GPU use (%)')) ?? 0,
      memoryUsed,
      memoryTotal,
      memoryPercent,
      temperature: this.parseOptionalFloat(field('Temperature (Sensor edge) (C)')) ?? 0,
      powerUsage: this.parseOptionalFloat(field('Average Graphics Package Power (W)')) ?? 0,
      powerLimit: this.parseOptionalFloat(field('Max Graphics Package Power (W)')) ?? 0,
      lastUpdate: Date.now(),
    };
  }

  /**
   * Get Intel GPU status using xpu-smi
   * @returns Live metrics, or a zeroed status when xpu-smi fails
   */
  private async getIntelGpuStatus(gpu: IGpuInfo): Promise<IGpuStatus> {
    try {
      const { stdout } = await execAsync(
        `xpu-smi stats -d ${gpu.index} --json 2>/dev/null`,
        { timeout: 5000 },
      );

      const data = JSON.parse(stdout);
      // NOTE(review): assumes `device_level` is an object with gpu_utilization,
      // memory_used (bytes), memory_utilization, gpu_temperature and power
      // fields — confirm against real xpu-smi output.
      const stats = data.device_level || {};
      const metric = (value: unknown): number => this.parseOptionalFloat(String(value ?? '')) ?? 0;

      return {
        id: gpu.id,
        utilization: Math.round(metric(stats.gpu_utilization)),
        memoryUsed: Math.round(metric(stats.memory_used) / (1024 * 1024)),
        memoryTotal: gpu.vram,
        memoryPercent: Math.round(metric(stats.memory_utilization)),
        temperature: metric(stats.gpu_temperature),
        powerUsage: metric(stats.power),
        powerLimit: 0, // Intel doesn't expose this easily
        lastUpdate: Date.now(),
      };
    } catch {
      return this.emptyStatus(gpu);
    }
  }

  /**
   * Parse a base-10 integer, returning undefined instead of NaN.
   */
  private parseOptionalInt(value: string | undefined): number | undefined {
    const parsed = parseInt(value ?? '', 10);
    return Number.isNaN(parsed) ? undefined : parsed;
  }

  /**
   * Parse a floating-point number, returning undefined instead of NaN.
   */
  private parseOptionalFloat(value: string | undefined): number | undefined {
    const parsed = parseFloat(value ?? '');
    return Number.isNaN(parsed) ? undefined : parsed;
  }

  /**
   * Helper to extract PCI slot from full bus ID
   * @returns The trailing "bus:device.function" part, or the input unchanged when it has no such suffix
   */
  private extractPciSlot(pciId: string): string {
    // Input: "00000000:01:00.0" -> Output: "01:00.0"
    const match = pciId.match(/([0-9a-f]+:[0-9a-f]+\.[0-9a-f]+)$/i);
    return match ? match[1] : pciId;
  }

  /**
   * Helper to parse memory values with units
   *
   * Accepts a number optionally followed by B, KB, MB, GB or TB (or the
   * KiB/MiB/GiB/TiB spellings); all units are treated as powers of 1024.
   * A bare number is taken to be bytes.
   * @returns The value in MB, rounded to the nearest integer; 0 when no number is found
   */
  private parseMemory(value: string): number {
    const match = value.match(/(\d+(?:\.\d+)?)\s*([KMGT]?i?B)?/i);
    if (!match) return 0;

    // Multipliers that convert each unit to MB.
    const toMb: Record<string, number> = {
      B: 1 / (1024 * 1024),
      KB: 1 / 1024,
      MB: 1,
      GB: 1024,
      TB: 1024 * 1024,
    };
    const unit = (match[2] || 'B').toUpperCase().replace('I', '');

    return Math.round(parseFloat(match[1]) * (toMb[unit] ?? toMb['B']));
  }

  /**
   * Build the sysfs device directory name for a PCI address.
   * Addresses without a domain ("03:00.0") get the default "0000:" domain;
   * addresses that already include one ("0000:03:00.0") are left as they are.
   */
  private resolveSysfsPciAddress(pciBusId: string): string {
    const address = pciBusId.trim().toLowerCase();
    const hasDomain = address.split(':').length >= 3;
    return hasDomain ? address : `0000:${address}`;
  }

  /**
   * Get AMD VRAM from sysfs (async)
   * @param pciBusId PCI address as printed by lspci, with or without a domain
   * @returns Total VRAM in MB, or 0 when the sysfs entry is missing or unreadable
   */
  private async getAmdVramFromSysfs(pciBusId: string): Promise<number> {
    const sysfsPath =
      `/sys/bus/pci/devices/${this.resolveSysfsPciAddress(pciBusId)}/mem_info_vram_total`;

    try {
      const content = await fs.promises.readFile(sysfsPath, 'utf8');
      const bytes = parseInt(content.trim(), 10);
      return Number.isNaN(bytes) ? 0 : Math.round(bytes / (1024 * 1024));
    } catch {
      // sysfs not available
      return 0;
    }
  }

  /**
   * Get ROCm version
   * @returns The first version-like number found, or undefined when it cannot be determined
   */
  private async getRocmVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await execAsync(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000 },
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Get oneAPI version
   * @returns The first version-like number found, or undefined when it cannot be determined
   */
  private async getOneApiVersion(): Promise<string | undefined> {
    try {
      // NOTE(review): `source` is not available in every /bin/sh; when it fails the
      // `||` branch is used instead — confirm this yields a version on target systems.
      const { stdout } = await execAsync(
        'source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION',
        { timeout: 5000 },
      );
      const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
      return match ? match[1] : undefined;
    } catch {
      return undefined;
    }
  }
}