// modelgrid/ts/cluster/coordinator.ts

import type {
  IClusterDesiredDeployment,
  IClusterEnsureResponse,
  IClusterNodeHeartbeat,
  IClusterNodeResources,
  IClusterStatusResponse,
  TClusterNodeSchedulerState,
} from '../interfaces/cluster.ts';
import { ContainerManager } from '../containers/container-manager.ts';
import { GpuDetector } from '../hardware/gpu-detector.ts';
import { logger } from '../logger.ts';
import { ModelRegistry } from '../models/registry.ts';
import { ModelLoader } from '../models/loader.ts';
import { CLUSTER } from '../constants.ts';
import { filterOutUsedGpus, summarizeGpuTopologyGroups } from './placement.ts';
import { ClusterManager } from './cluster-manager.ts';

/**
 * Glue between the local node (containers, GPUs, model loader) and the
 * cluster state held by `ClusterManager`.
 *
 * It builds and publishes this node's heartbeat, records desired replica
 * counts, and makes sure a requested model is running somewhere: locally,
 * on a node picked by the cluster manager, or via the control plane.
 *
 * Remote calls are best-effort: network and HTTP failures are logged with
 * `logger.warn` and reported to the caller as `null` (or swallowed for
 * heartbeats) instead of being thrown.
 */
export class ClusterCoordinator {
  private readonly clusterManager: ClusterManager;
  private readonly containerManager: ContainerManager;
  private readonly modelRegistry: ModelRegistry;
  private readonly modelLoader: ModelLoader;
  private readonly gpuDetector: GpuDetector;

  constructor(
    clusterManager: ClusterManager,
    containerManager: ContainerManager,
    modelRegistry: ModelRegistry,
    modelLoader: ModelLoader,
  ) {
    this.clusterManager = clusterManager;
    this.containerManager = containerManager;
    this.modelRegistry = modelRegistry;
    this.modelLoader = modelLoader;
    this.gpuDetector = new GpuDetector();
  }

  /**
   * Collects GPU, container and model information and assembles the
   * heartbeat describing this node.
   *
   * @param endpoint URL under which this node is reachable; it is reported
   *   both as the node endpoint and as the endpoint of every deployment.
   * @returns A heartbeat stamped with the current time. `healthy` is always
   *   reported as `true` and every deployment is reported with engine `vllm`.
   */
  public async buildLocalHeartbeat(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const [gpus, statuses, models] = await Promise.all([
      this.gpuDetector.detectGpus(),
      this.containerManager.getAllStatus(),
      this.containerManager.getAllAvailableModels(),
    ]);

    const deploymentCount = Array.from(statuses.values()).filter((status) => status.running).length;
    // Only containers whose status says "running" occupy GPUs for the resource summary.
    const runningContainers = this.containerManager.getAllContainers().filter((container) => {
      const status = statuses.get(container.getConfig().id);
      return status?.running === true;
    });
    const resources = await this.buildResourceSummary(
      gpus,
      deploymentCount,
      models,
      runningContainers,
    );

    const localConfig = this.clusterManager.getConfig();

    return {
      nodeName: localConfig.nodeName,
      role: localConfig.role,
      endpoint,
      healthy: true,
      resources,
      deployments: Array.from(models.entries()).map(([modelId, endpoints]) => ({
        modelId,
        engine: 'vllm' as const,
        endpoint,
        // A model counts as healthy as soon as one of its endpoints is healthy.
        healthy: endpoints.some((entry) => entry.healthy),
        containerId: endpoints[0]?.containerId,
      })),
      lastSeenAt: Date.now(),
    };
  }

  /**
   * Builds a fresh heartbeat and stores it as the local node's state in the
   * cluster manager.
   *
   * @param endpoint URL under which this node is reachable.
   * @returns The heartbeat that was stored.
   */
  public async syncLocalState(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const heartbeat = await this.buildLocalHeartbeat(endpoint);
    this.clusterManager.updateLocalNode(heartbeat);
    return heartbeat;
  }

  /**
   * Refreshes the local node state and posts it to the control plane.
   *
   * Does nothing when clustering is disabled, when either the advertised
   * endpoint or the control plane URL is missing, or when this node is the
   * control plane itself (URLs compared ignoring trailing slashes).
   * Transport errors and non-2xx answers are logged, never thrown; errors
   * raised while building the local heartbeat still propagate.
   */
  public async sendHeartbeat(): Promise<void> {
    if (!this.clusterManager.isEnabled()) {
      return;
    }

    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    if (!endpoint || !controlPlaneUrl) {
      return;
    }

    if (ClusterCoordinator.isSameBaseUrl(controlPlaneUrl, endpoint)) {
      return;
    }

    const heartbeat = await this.syncLocalState(endpoint);

    try {
      const response = await fetch(
        ClusterCoordinator.joinUrl(controlPlaneUrl, '/_cluster/nodes/heartbeat'),
        {
          method: 'POST',
          headers: this.buildClusterHeaders(),
          body: JSON.stringify(heartbeat),
        },
      );

      // fetch only rejects on transport errors; a 401/500 must be surfaced explicitly.
      if (!response.ok) {
        logger.warn(`Cluster heartbeat rejected by control plane: HTTP ${response.status}`);
      }

      // The answer is not needed; release the body so the connection is not kept busy.
      await ClusterCoordinator.discardBody(response);
    } catch (error) {
      logger.warn(`Cluster heartbeat failed: ${ClusterCoordinator.describeError(error)}`);
    }
  }

  /** Stores a heartbeat received from a node in the cluster manager. */
  public acceptHeartbeat(heartbeat: IClusterNodeHeartbeat): void {
    this.clusterManager.upsertNode(heartbeat);
  }

  /** Returns the cluster status as reported by the cluster manager. */
  public getStatus(): IClusterStatusResponse {
    return this.clusterManager.getStatus();
  }

  /** Returns all desired deployments known to the cluster manager. */
  public getDesiredDeployments(): IClusterDesiredDeployment[] {
    return this.clusterManager.getDesiredDeployments();
  }

  /** Returns the configured name of this node. */
  public getLocalNodeName(): string {
    return this.clusterManager.getConfig().nodeName;
  }

  /** Returns the cluster shared secret, if one is configured. */
  public getSharedSecret(): string | undefined {
    return this.clusterManager.getSharedSecret();
  }

  /**
   * Sets the scheduler state of a node.
   *
   * @returns The value returned by the cluster manager for the update.
   */
  public setNodeSchedulerState(
    nodeName: string,
    schedulerState: TClusterNodeSchedulerState,
  ): TClusterNodeSchedulerState {
    return this.clusterManager.setNodeSchedulerState(nodeName, schedulerState);
  }

  /**
   * Sets how many replicas of a model the cluster should run.
   *
   * @param modelName Name looked up in the model registry.
   * @param desiredReplicas Target count; zero or a negative value removes the
   *   desired deployment instead of storing it.
   * @returns The resulting desired deployment (with `desiredReplicas: 0` when
   *   it was removed), or `null` when the registry does not know the model.
   */
  public async setDesiredReplicas(
    modelName: string,
    desiredReplicas: number,
  ): Promise<IClusterDesiredDeployment | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    if (desiredReplicas <= 0) {
      this.clusterManager.removeDesiredDeployment(model.id);
      return {
        modelId: model.id,
        desiredReplicas: 0,
        updatedAt: Date.now(),
      };
    }

    return this.clusterManager.upsertDesiredDeployment(model.id, desiredReplicas);
  }

  /**
   * Removes the desired deployment of a model.
   *
   * @returns `false` when the registry does not know the model, otherwise the
   *   result reported by the cluster manager.
   */
  public async clearDesiredDeployment(modelName: string): Promise<boolean> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return false;
    }

    return this.clusterManager.removeDesiredDeployment(model.id);
  }

  /**
   * Tells whether deployments should be attempted on this node first: always
   * when clustering is disabled, otherwise when this node is the control
   * plane or no control plane URL is configured.
   */
  public shouldDeployLocallyFirst(): boolean {
    if (!this.clusterManager.isEnabled()) {
      return true;
    }

    return this.clusterManager.isControlPlane() || !this.clusterManager.getControlPlaneUrl();
  }

  /** Tells whether this node may change cluster state (standalone node or control plane). */
  public canManageClusterState(): boolean {
    return !this.clusterManager.isEnabled() || this.clusterManager.isControlPlane();
  }

  /**
   * Looks up where a model is currently served, without deploying anything.
   *
   * @returns The location with `created: false`, or `null` when the model is
   *   unknown to the registry or the cluster manager has no location for it.
   */
  public async resolveModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    const location = this.clusterManager.resolveModel(model.id);
    if (!location) {
      return null;
    }

    return {
      model: model.id,
      location,
      created: false,
    };
  }

  /**
   * Makes sure a model is running somewhere and returns its location.
   *
   * The desired deployment is recorded first. An already active location is
   * reused. Otherwise the model is deployed locally (clustering disabled or
   * this node is not the control plane) or on the node picked by the cluster
   * manager, which may be this node or a remote one.
   *
   * @returns The location, or `null` when the model is unknown, no node can
   *   be picked, or the deployment fails.
   */
  public async ensureModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);

    const existing = this.clusterManager.getActiveModelLocations(model.id)[0];
    if (existing) {
      return {
        model: model.id,
        location: existing,
        created: false,
      };
    }

    if (!this.clusterManager.isEnabled() || !this.clusterManager.isControlPlane()) {
      return this.deployModelLocally(model.id);
    }

    const targetNode = this.clusterManager.pickNodeForModel(model);
    if (!targetNode) {
      return null;
    }

    if (targetNode.nodeName === this.clusterManager.getConfig().nodeName) {
      return this.deployModelLocally(model.id);
    }

    return this.requestRemoteDeployment(targetNode.endpoint, model.id);
  }

  /**
   * Asks the control plane to ensure a model, falling back to `ensureModel`
   * when no control plane URL is configured or this node is the control
   * plane (URLs compared ignoring trailing slashes).
   *
   * @returns The control plane's answer, or `null` when the request fails or
   *   is answered with a non-2xx status (both are logged).
   */
  public async ensureModelViaControlPlane(
    modelName: string,
  ): Promise<IClusterEnsureResponse | null> {
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    const localEndpoint = this.clusterManager.getAdvertisedEndpoint();

    if (!controlPlaneUrl || ClusterCoordinator.isSameBaseUrl(controlPlaneUrl, localEndpoint)) {
      return this.ensureModel(modelName);
    }

    try {
      const response = await fetch(
        ClusterCoordinator.joinUrl(controlPlaneUrl, '/_cluster/models/ensure'),
        {
          method: 'POST',
          headers: this.buildClusterHeaders(),
          body: JSON.stringify({ model: modelName }),
        },
      );

      if (!response.ok) {
        logger.warn(
          `Control plane could not ensure model ${modelName}: HTTP ${response.status}`,
        );
        await ClusterCoordinator.discardBody(response);
        return null;
      }

      return await response.json() as IClusterEnsureResponse;
    } catch (error) {
      logger.warn(
        `Control plane ensure request for model ${modelName} failed: ${
          ClusterCoordinator.describeError(error)
        }`,
      );
      return null;
    }
  }

  /**
   * Loads a model on this node and reports where it is served.
   *
   * If the registry knows the model, its desired deployment is recorded
   * before loading.
   *
   * @returns The resolved location (`created` is `false` when the loader
   *   reports the model as already loaded), or `null` when loading fails or
   *   the model cannot be resolved afterwards.
   */
  public async deployModelLocally(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
    }

    const result = await this.modelLoader.loadModel(modelName);
    if (!result.success) {
      return null;
    }

    return this.finalizeLocalDeployment(result.model, !result.alreadyLoaded);
  }

  /**
   * Starts missing replicas until every desired deployment has as many
   * active locations as requested.
   *
   * Runs only on a standalone node or on the control plane. For each model
   * the loop stops at the first replica that cannot be placed or deployed;
   * remaining replicas are left for a later call.
   */
  public async reconcileDesiredReplicas(): Promise<void> {
    if (this.clusterManager.isEnabled() && !this.clusterManager.isControlPlane()) {
      return;
    }

    const desiredDeployments = this.clusterManager.getDesiredDeployments();
    for (const desiredDeployment of desiredDeployments) {
      if (desiredDeployment.desiredReplicas <= 0) {
        continue;
      }

      const model = await this.modelRegistry.getModel(desiredDeployment.modelId);
      if (!model) {
        continue;
      }

      const existingLocations = this.clusterManager.getActiveModelLocations(model.id);
      const missingReplicas = desiredDeployment.desiredReplicas - existingLocations.length;
      if (missingReplicas <= 0) {
        continue;
      }

      const localNodeName = this.clusterManager.getConfig().nodeName;

      for (let index = 0; index < missingReplicas; index++) {
        const targetNode = this.clusterManager.pickNodeForModel(model);
        if (!targetNode) {
          break;
        }

        // New replicas are numbered after the ones that are already active.
        const replicaOrdinal = existingLocations.length + index;
        const result = targetNode.nodeName === localNodeName
          ? await this.deployReplicaLocally(model.id, replicaOrdinal)
          : await this.requestRemoteDeployment(targetNode.endpoint, model.id, replicaOrdinal);

        if (!result) {
          break;
        }
      }
    }
  }

  /**
   * Deploys one replica of a model on this node.
   *
   * If the registry knows the model, the desired replica count is raised to
   * at least `replicaOrdinal + 1` so the new replica is covered by it.
   *
   * @param replicaOrdinal Zero-based replica number passed to the loader.
   * @returns The resolved location, or `null` when the deployment fails or
   *   the model cannot be resolved afterwards.
   */
  public async deployReplicaLocally(
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(
        model.id,
        Math.max((replicaOrdinal ?? 0) + 1, model.launchDefaults?.replicas || 1),
      );
    }

    const result = await this.modelLoader.deployReplica(modelName, replicaOrdinal);
    if (!result.success) {
      return null;
    }

    return this.finalizeLocalDeployment(result.model, !result.alreadyLoaded);
  }

  /**
   * Shared tail of the local deployment paths: republishes the local node
   * state (when an advertised endpoint exists) and resolves the model.
   *
   * @param loadedModel Model identifier reported by the loader.
   * @param created Whether the deployment was newly created by this call.
   */
  private async finalizeLocalDeployment(
    loadedModel: string,
    created: boolean,
  ): Promise<IClusterEnsureResponse | null> {
    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    if (endpoint) {
      await this.syncLocalState(endpoint);
    }

    const resolved = await this.resolveModel(loadedModel);
    if (!resolved) {
      return null;
    }

    return {
      ...resolved,
      created,
    };
  }

  /**
   * Asks another node to deploy a model (optionally a specific replica).
   *
   * @returns The node's answer, or `null` when the request fails or is
   *   answered with a non-2xx status (both are logged).
   */
  private async requestRemoteDeployment(
    nodeEndpoint: string,
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    try {
      const response = await fetch(
        ClusterCoordinator.joinUrl(nodeEndpoint, '/_cluster/deployments'),
        {
          method: 'POST',
          headers: this.buildClusterHeaders(),
          body: JSON.stringify({ model: modelName, replicaOrdinal }),
        },
      );

      if (!response.ok) {
        logger.warn(
          `Remote deployment of ${modelName} on ${nodeEndpoint} was rejected: HTTP ${response.status}`,
        );
        await ClusterCoordinator.discardBody(response);
        return null;
      }

      return await response.json() as IClusterEnsureResponse;
    } catch (error) {
      logger.warn(
        `Remote deployment of ${modelName} on ${nodeEndpoint} failed: ${
          ClusterCoordinator.describeError(error)
        }`,
      );
      return null;
    }
  }

  /**
   * Summarises GPU capacity for the heartbeat.
   *
   * GPUs assigned to running containers are treated as unavailable. VRAM
   * values are divided by 1024 before being reported as GB.
   * NOTE(review): this presumes `gpu.vram` is reported in MiB — confirm
   * against the GPU detector.
   *
   * @param _models Currently unused.
   */
  private async buildResourceSummary(
    gpus: Awaited<ReturnType<GpuDetector['detectGpus']>>,
    deploymentCount: number,
    _models: Awaited<ReturnType<ContainerManager['getAllAvailableModels']>>,
    runningContainers: ReturnType<ContainerManager['getAllContainers']>,
  ): Promise<IClusterNodeResources> {
    const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
    const usedGpuIds = runningContainers.flatMap((container) => container.getConfig().gpuIds);
    const availableGpus = filterOutUsedGpus(gpus, usedGpuIds);
    const topologyGroups = summarizeGpuTopologyGroups(availableGpus);
    const availableVramGb = Math.round(
      availableGpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024,
    );

    // Math.max() of an empty list is -Infinity, hence the explicit 0 fallbacks.
    const maxSingleGpuVramGb = availableGpus.length > 0
      ? Math.max(...availableGpus.map((gpu) => Math.round(gpu.vram / 1024)))
      : 0;
    const largestGpuGroupCount = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.gpuCount))
      : 0;
    const largestGpuGroupVramGb = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.totalVramGb))
      : 0;

    return {
      gpuCount: gpus.length,
      totalVramGb,
      availableVramGb,
      maxSingleGpuVramGb,
      largestGpuGroupCount,
      largestGpuGroupVramGb,
      deploymentCount,
      topologyGroups,
    };
  }

  /** Builds the JSON request headers, adding the shared secret when one is configured. */
  private buildClusterHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    const sharedSecret = this.clusterManager.getSharedSecret();
    if (sharedSecret) {
      headers[CLUSTER.AUTH_HEADER_NAME] = sharedSecret;
    }

    return headers;
  }

  /**
   * Records a desired deployment for a model without ever lowering an
   * already stored replica count; the stored count is at least 1.
   */
  private rememberDesiredDeployment(modelId: string, minimumReplicas: number): void {
    const existing = this.clusterManager.getDesiredDeployment(modelId);
    const desiredReplicas = Math.max(existing?.desiredReplicas || 0, minimumReplicas, 1);
    this.clusterManager.upsertDesiredDeployment(modelId, desiredReplicas);
  }

  /** Removes trailing slashes so base URLs can be compared and joined reliably. */
  private static trimTrailingSlashes(url: string): string {
    return url.replace(/\/+$/, '');
  }

  /** Joins a base URL and an absolute path without producing a double slash. */
  private static joinUrl(baseUrl: string, path: string): string {
    return `${ClusterCoordinator.trimTrailingSlashes(baseUrl)}${path}`;
  }

  /** Compares two base URLs, ignoring trailing slashes; a missing URL never matches. */
  private static isSameBaseUrl(first: string, second: string | null | undefined): boolean {
    if (second === null || second === undefined) {
      return false;
    }

    return ClusterCoordinator.trimTrailingSlashes(first) ===
      ClusterCoordinator.trimTrailingSlashes(second);
  }

  /** Cancels an unread response body; failures while doing so are irrelevant to callers. */
  private static async discardBody(response: Response): Promise<void> {
    try {
      await response.body?.cancel();
    } catch {
      // Best-effort cleanup only: the outcome of the request has already been decided.
    }
  }

  /** Turns an unknown thrown value into a log-friendly message. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}