feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
@@ -0,0 +1,456 @@
+import os from 'node:os';
+import * as fs from 'node:fs/promises';
+import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
+import type {
+  IClusterConfig,
+  IClusterDesiredDeployment,
+  IClusterGpuTopologyGroup,
+  IClusterModelLocation,
+  IClusterNodeHeartbeat,
+  IClusterNodeStatus,
+  IClusterStatusResponse,
+  TClusterNodeSchedulerState,
+} from '../interfaces/cluster.ts';
+import { CLUSTER, PATHS } from '../constants.ts';
+
+/**
+ * Tracks cluster membership and control-plane intent for the local process.
+ *
+ * Three pieces of state are held in memory: the latest heartbeat of every
+ * known node, the desired deployments keyed by model id, and a scheduler
+ * state per node name. Node heartbeats are persisted to `cluster-state.json`
+ * and the control state to `cluster-control-state.json`, both inside
+ * `PATHS.DATA_DIR`.
+ */
+export class ClusterManager {
+  private config: IClusterConfig = {
+    enabled: false,
+    nodeName: os.hostname(),
+    role: 'standalone',
+    bindHost: CLUSTER.DEFAULT_BIND_HOST,
+    gossipPort: CLUSTER.DEFAULT_GOSSIP_PORT,
+    heartbeatIntervalMs: CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS,
+    seedNodes: [],
+  };
+  private localNode: IClusterNodeHeartbeat | null = null;
+  private knownNodes = new Map<string, IClusterNodeHeartbeat>();
+  private desiredDeployments = new Map<string, IClusterDesiredDeployment>();
+  private nodeSchedulerStates = new Map<string, TClusterNodeSchedulerState>();
+  // True while a write of the respective file is queued but has not started yet.
+  private persistQueued = false;
+  private controlPersistQueued = false;
+  // Tail of the write chain per file. Every write is appended here so that two
+  // writeFile calls never target the same path at the same time.
+  private persistChain: Promise<void> = Promise.resolve();
+  private controlPersistChain: Promise<void> = Promise.resolve();
+
+  /**
+   * Loads persisted node heartbeats and control state from disk.
+   *
+   * Each file is read independently. Any failure while reading or parsing a
+   * file (including the file not existing) is swallowed and leaves the
+   * corresponding in-memory state untouched.
+   */
+  public async initialize(): Promise<void> {
+    try {
+      const stateContent = await fs.readFile(this.getStateFilePath(), 'utf-8');
+      const data = JSON.parse(stateContent) as { nodes?: IClusterNodeHeartbeat[] };
+
+      for (const node of data.nodes || []) {
+        this.knownNodes.set(node.nodeName, node);
+        // The match uses the node name configured at the time initialize() runs.
+        if (node.nodeName === this.config.nodeName) {
+          this.localNode = node;
+        }
+      }
+
+      this.pruneStaleNodes();
+    } catch {
+      // No persisted cluster state yet.
+    }
+
+    try {
+      const controlStateContent = await fs.readFile(this.getControlStateFilePath(), 'utf-8');
+      const data = JSON.parse(controlStateContent) as {
+        desiredDeployments?: IClusterDesiredDeployment[];
+        nodeSchedulerStates?: Record<string, TClusterNodeSchedulerState>;
+      };
+
+      for (const deployment of data.desiredDeployments || []) {
+        this.desiredDeployments.set(deployment.modelId, deployment);
+      }
+
+      for (const [nodeName, schedulerState] of Object.entries(data.nodeSchedulerStates || {})) {
+        this.nodeSchedulerStates.set(nodeName, schedulerState);
+      }
+    } catch {
+      // No persisted control state yet.
+    }
+  }
+
+  /**
+   * Replaces the active configuration.
+   *
+   * A falsy `heartbeatIntervalMs` falls back to the default interval and a
+   * missing `seedNodes` becomes an empty list.
+   */
+  public configure(config: IClusterConfig): void {
+    this.config = {
+      ...config,
+      heartbeatIntervalMs: config.heartbeatIntervalMs || CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS,
+      seedNodes: config.seedNodes || [],
+    };
+  }
+
+  /** Returns the active configuration object (not a copy). */
+  public getConfig(): IClusterConfig {
+    return this.config;
+  }
+
+  /** Whether cluster mode is switched on in the configuration. */
+  public isEnabled(): boolean {
+    return this.config.enabled;
+  }
+
+  /** True when cluster mode is enabled and this node's role is `control-plane`. */
+  public isControlPlane(): boolean {
+    return this.config.enabled && this.config.role === 'control-plane';
+  }
+
+  /** True when cluster mode is enabled and this node's role is `worker`. */
+  public isWorker(): boolean {
+    return this.config.enabled && this.config.role === 'worker';
+  }
+
+  /** Returns `standalone` when cluster mode is disabled, otherwise the configured role. */
+  public getModeLabel(): string {
+    if (!this.config.enabled) {
+      return 'standalone';
+    }
+
+    return this.config.role;
+  }
+
+  /** Configured heartbeat interval, or the default when it is unset or zero. */
+  public getHeartbeatIntervalMs(): number {
+    return this.config.heartbeatIntervalMs || CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS;
+  }
+
+  /**
+   * Endpoint this node advertises: the endpoint of the last local heartbeat if
+   * one is recorded, otherwise the configured `advertiseUrl`.
+   */
+  public getAdvertisedEndpoint(): string | undefined {
+    return this.localNode?.endpoint || this.config.advertiseUrl;
+  }
+
+  /** Configured control-plane URL, if any. */
+  public getControlPlaneUrl(): string | undefined {
+    return this.config.controlPlaneUrl;
+  }
+
+  /** Configured shared secret; an empty value is reported as `undefined`. */
+  public getSharedSecret(): string | undefined {
+    return this.config.sharedSecret || undefined;
+  }
+
+  /** Records a heartbeat as the local node's current state and queues a state write. */
+  public updateLocalNode(heartbeat: IClusterNodeHeartbeat): void {
+    this.localNode = heartbeat;
+    this.knownNodes.set(heartbeat.nodeName, heartbeat);
+    this.schedulePersist();
+  }
+
+  /** Inserts or replaces a node's heartbeat by node name and queues a state write. */
+  public upsertNode(heartbeat: IClusterNodeHeartbeat): void {
+    this.knownNodes.set(heartbeat.nodeName, heartbeat);
+    this.schedulePersist();
+  }
+
+  /** Builds a status summary for the local node from the configuration; `healthy` is always true. */
+  public getLocalNodeStatus(): IClusterNodeStatus {
+    return {
+      nodeName: this.config.nodeName,
+      role: this.config.role,
+      endpoint: this.getAdvertisedEndpoint(),
+      healthy: true,
+      schedulerState: this.getNodeSchedulerState(this.config.nodeName),
+    };
+  }
+
+  /** Last heartbeat recorded for the local node, or null when none is known. */
+  public getLocalNode(): IClusterNodeHeartbeat | null {
+    return this.localNode;
+  }
+
+  /** Looks up a node by name and returns it with its scheduler state attached, or null. */
+  public getNode(nodeName: string): IClusterNodeHeartbeat | null {
+    const node = this.knownNodes.get(nodeName);
+    if (!node) {
+      return null;
+    }
+
+    return this.decorateNode(node);
+  }
+
+  /**
+   * Drops every node other than the local one whose `lastSeenAt` is older than
+   * `CLUSTER.NODE_STALE_AFTER_MS`, queuing a state write when anything is removed.
+   */
+  public pruneStaleNodes(): void {
+    const now = Date.now();
+    for (const [nodeName, node] of this.knownNodes) {
+      if (nodeName === this.config.nodeName) {
+        continue;
+      }
+
+      if (now - node.lastSeenAt > CLUSTER.NODE_STALE_AFTER_MS) {
+        this.knownNodes.delete(nodeName);
+        this.schedulePersist();
+      }
+    }
+  }
+
+  /**
+   * Returns all non-stale nodes with scheduler state attached. The local node
+   * comes first; the remaining nodes are ordered by name.
+   */
+  public getAllNodes(): IClusterNodeHeartbeat[] {
+    this.pruneStaleNodes();
+    return Array.from(this.knownNodes.values()).map((node) => this.decorateNode(node)).sort(
+      (left, right) => {
+        if (left.nodeName === this.config.nodeName) {
+          return -1;
+        }
+        if (right.nodeName === this.config.nodeName) {
+          return 1;
+        }
+        return left.nodeName.localeCompare(right.nodeName);
+      },
+    );
+  }
+
+  /** Subset of `getAllNodes()` whose heartbeat reports `healthy`. */
+  public getHealthyNodes(): IClusterNodeHeartbeat[] {
+    return this.getAllNodes().filter((node) => node.healthy);
+  }
+
+  /** Scheduler state stored for the node; nodes without an entry count as `active`. */
+  public getNodeSchedulerState(nodeName: string): TClusterNodeSchedulerState {
+    return this.nodeSchedulerStates.get(nodeName) || 'active';
+  }
+
+  /** Stores a node's scheduler state, queues a control-state write, and echoes the state back. */
+  public setNodeSchedulerState(
+    nodeName: string,
+    schedulerState: TClusterNodeSchedulerState,
+  ): TClusterNodeSchedulerState {
+    this.nodeSchedulerStates.set(nodeName, schedulerState);
+    this.scheduleControlPersist();
+    return schedulerState;
+  }
+
+  /** All desired deployments, ordered by model id. */
+  public getDesiredDeployments(): IClusterDesiredDeployment[] {
+    return Array.from(this.desiredDeployments.values()).sort((left, right) =>
+      left.modelId.localeCompare(right.modelId)
+    );
+  }
+
+  /** Desired deployment for a model id, or null when none is recorded. */
+  public getDesiredDeployment(modelId: string): IClusterDesiredDeployment | null {
+    return this.desiredDeployments.get(modelId) || null;
+  }
+
+  /**
+   * Creates or replaces the desired deployment for a model, stamping it with
+   * the current time, and queues a control-state write.
+   */
+  public upsertDesiredDeployment(
+    modelId: string,
+    desiredReplicas: number,
+  ): IClusterDesiredDeployment {
+    const deployment: IClusterDesiredDeployment = {
+      modelId,
+      desiredReplicas,
+      updatedAt: Date.now(),
+    };
+    this.desiredDeployments.set(modelId, deployment);
+    this.scheduleControlPersist();
+    return deployment;
+  }
+
+  /**
+   * Removes the desired deployment for a model.
+   *
+   * @returns true when an entry existed (a control-state write is then queued).
+   */
+  public removeDesiredDeployment(modelId: string): boolean {
+    const removed = this.desiredDeployments.delete(modelId);
+    if (removed) {
+      this.scheduleControlPersist();
+    }
+    return removed;
+  }
+
+  /** Lists healthy deployments of a model across healthy nodes, regardless of scheduler state. */
+  public getModelLocations(modelId: string): IClusterModelLocation[] {
+    const locations: IClusterModelLocation[] = [];
+
+    for (const node of this.getHealthyNodes()) {
+      for (const deployment of node.deployments) {
+        if (deployment.modelId !== modelId || !deployment.healthy) {
+          continue;
+        }
+
+        locations.push({
+          modelId,
+          nodeName: node.nodeName,
+          endpoint: deployment.endpoint,
+          healthy: deployment.healthy,
+          engine: deployment.engine,
+          containerId: deployment.containerId,
+        });
+      }
+    }
+
+    return locations;
+  }
+
+  /** Like `getModelLocations`, restricted to nodes whose scheduler state is `active`. */
+  public getActiveModelLocations(modelId: string): IClusterModelLocation[] {
+    return this.getModelLocations(modelId).filter((location) =>
+      this.getNodeSchedulerState(location.nodeName) === 'active'
+    );
+  }
+
+  /**
+   * Picks the preferred location for a model, or null when there is none.
+   *
+   * Locations are ranked by scheduler state (active, cordoned, draining), then
+   * the local node is preferred, then node name breaks ties.
+   */
+  public resolveModel(modelId: string): IClusterModelLocation | null {
+    const locations = this.getModelLocations(modelId);
+    if (locations.length === 0) {
+      return null;
+    }
+
+    locations.sort((left, right) => {
+      const schedulerPreference = this.compareSchedulerState(
+        this.getNodeSchedulerState(left.nodeName),
+        this.getNodeSchedulerState(right.nodeName),
+      );
+      if (schedulerPreference !== 0) {
+        return schedulerPreference;
+      }
+
+      if (left.nodeName === this.config.nodeName) {
+        return -1;
+      }
+      if (right.nodeName === this.config.nodeName) {
+        return 1;
+      }
+      return left.nodeName.localeCompare(right.nodeName);
+    });
+
+    return locations[0];
+  }
+
+  /**
+   * Chooses a node on which to deploy a model, or null when no node qualifies.
+   *
+   * A node qualifies when it is healthy, not excluded, not a foreign
+   * standalone node, has an `active` scheduler state, reports enough available
+   * VRAM, and has at least one topology group meeting the GPU count and VRAM
+   * requirements. Among qualifying nodes the local node wins; otherwise more
+   * available VRAM, then a largest GPU group closer to the preferred tensor
+   * parallel size, then fewer deployments.
+   *
+   * @param model Catalog entry whose requirements and launch defaults drive the choice.
+   * @param excludedNodeNames Node names that must not be selected.
+   */
+  public pickNodeForModel(
+    model: IModelCatalogEntry,
+    excludedNodeNames: string[] = [],
+  ): IClusterNodeHeartbeat | null {
+    const requiredVram = model.requirements.minVramGb;
+    const minGpuCount = model.requirements.minGpuCount || 1;
+    const preferredTensorParallel = model.launchDefaults?.tensorParallelSize || minGpuCount;
+
+    const eligible = this.getHealthyNodes().filter((node) => {
+      if (excludedNodeNames.includes(node.nodeName)) {
+        return false;
+      }
+
+      if (node.role === 'standalone' && node.nodeName !== this.config.nodeName) {
+        return false;
+      }
+
+      if (node.schedulerState && node.schedulerState !== 'active') {
+        return false;
+      }
+
+      return node.resources.availableVramGb >= requiredVram &&
+        this.hasEligibleTopologyGroup(node.resources.topologyGroups, requiredVram, minGpuCount);
+    });
+
+    if (eligible.length === 0) {
+      return null;
+    }
+
+    eligible.sort((left, right) => {
+      if (left.nodeName === this.config.nodeName) {
+        return -1;
+      }
+      if (right.nodeName === this.config.nodeName) {
+        return 1;
+      }
+
+      if (right.resources.availableVramGb !== left.resources.availableVramGb) {
+        return right.resources.availableVramGb - left.resources.availableVramGb;
+      }
+
+      const leftTopologyDelta = Math.abs(
+        left.resources.largestGpuGroupCount - preferredTensorParallel,
+      );
+      const rightTopologyDelta = Math.abs(
+        right.resources.largestGpuGroupCount - preferredTensorParallel,
+      );
+      if (leftTopologyDelta !== rightTopologyDelta) {
+        return leftTopologyDelta - rightTopologyDelta;
+      }
+
+      return left.resources.deploymentCount - right.resources.deploymentCount;
+    });
+
+    return eligible[0];
+  }
+
+  /**
+   * Snapshot of the cluster: the local node, all nodes, every deployment
+   * reported by healthy nodes grouped by model id (unhealthy deployments are
+   * included here), and the desired deployments.
+   */
+  public getStatus(): IClusterStatusResponse {
+    const models: Record<string, IClusterModelLocation[]> = {};
+    for (const node of this.getHealthyNodes()) {
+      for (const deployment of node.deployments) {
+        if (!models[deployment.modelId]) {
+          models[deployment.modelId] = [];
+        }
+
+        models[deployment.modelId].push({
+          modelId: deployment.modelId,
+          nodeName: node.nodeName,
+          endpoint: deployment.endpoint,
+          healthy: deployment.healthy,
+          engine: deployment.engine,
+          containerId: deployment.containerId,
+        });
+      }
+    }
+
+    return {
+      localNode: this.localNode ? this.decorateNode(this.localNode) : null,
+      nodes: this.getAllNodes(),
+      models,
+      desiredDeployments: this.getDesiredDeployments(),
+    };
+  }
+
+  /** True when at least one group has enough GPUs and enough total VRAM. */
+  private hasEligibleTopologyGroup(
+    groups: IClusterGpuTopologyGroup[],
+    requiredVramGb: number,
+    minGpuCount: number,
+  ): boolean {
+    return groups.some((group) =>
+      group.gpuCount >= minGpuCount && group.totalVramGb >= requiredVramGb
+    );
+  }
+
+  private getStateFilePath(): string {
+    return `${PATHS.DATA_DIR}/cluster-state.json`;
+  }
+
+  private getControlStateFilePath(): string {
+    return `${PATHS.DATA_DIR}/cluster-control-state.json`;
+  }
+
+  /**
+   * Queues a write of the node state file.
+   *
+   * Calls made before the queued write starts are coalesced into it. The write
+   * is appended to `persistChain`, so it only begins after any earlier write of
+   * the same file has settled; concurrent `writeFile` calls on one path are
+   * not safe.
+   */
+  private schedulePersist(): void {
+    if (this.persistQueued) {
+      return;
+    }
+
+    this.persistQueued = true;
+    this.persistChain = this.persistChain.then(() => {
+      // Reset before writing so that changes arriving while this write is in
+      // flight queue exactly one follow-up write.
+      this.persistQueued = false;
+      return this.persistState();
+    });
+  }
+
+  /** Same queuing scheme as `schedulePersist`, for the control state file. */
+  private scheduleControlPersist(): void {
+    if (this.controlPersistQueued) {
+      return;
+    }
+
+    this.controlPersistQueued = true;
+    this.controlPersistChain = this.controlPersistChain.then(() => {
+      this.controlPersistQueued = false;
+      return this.persistControlState();
+    });
+  }
+
+  /** Writes all known node heartbeats to the state file; failures are ignored. */
+  private async persistState(): Promise<void> {
+    try {
+      await fs.mkdir(PATHS.DATA_DIR, { recursive: true });
+      await fs.writeFile(
+        this.getStateFilePath(),
+        JSON.stringify({ nodes: Array.from(this.knownNodes.values()) }, null, 2),
+      );
+    } catch {
+      // Persistence failure should not break the control plane.
+    }
+  }
+
+  /** Writes desired deployments and scheduler states to the control state file; failures are ignored. */
+  private async persistControlState(): Promise<void> {
+    try {
+      await fs.mkdir(PATHS.DATA_DIR, { recursive: true });
+      await fs.writeFile(
+        this.getControlStateFilePath(),
+        JSON.stringify(
+          {
+            desiredDeployments: this.getDesiredDeployments(),
+            nodeSchedulerStates: Object.fromEntries(this.nodeSchedulerStates.entries()),
+          },
+          null,
+          2,
+        ),
+      );
+    } catch {
+      // Persistence failure should not break the control plane.
+    }
+  }
+
+  /** Returns a copy of the heartbeat with the currently stored scheduler state attached. */
+  private decorateNode(node: IClusterNodeHeartbeat): IClusterNodeHeartbeat {
+    return {
+      ...node,
+      schedulerState: this.getNodeSchedulerState(node.nodeName),
+    };
+  }
+
+  /**
+   * Orders scheduler states as active < cordoned < draining.
+   *
+   * A state outside that list (possible because persisted states are loaded
+   * without validation) ranks last instead of ahead of `active`.
+   */
+  private compareSchedulerState(
+    left: TClusterNodeSchedulerState,
+    right: TClusterNodeSchedulerState,
+  ): number {
+    const order: TClusterNodeSchedulerState[] = ['active', 'cordoned', 'draining'];
+    const rank = (state: TClusterNodeSchedulerState): number => {
+      const position = order.indexOf(state);
+      return position === -1 ? order.length : position;
+    };
+    return rank(left) - rank(right);
+  }
+}
@@ -0,0 +1,438 @@
+import type {
+  IClusterDesiredDeployment,
+  IClusterEnsureResponse,
+  IClusterNodeHeartbeat,
+  IClusterNodeResources,
+  IClusterStatusResponse,
+  TClusterNodeSchedulerState,
+} from '../interfaces/cluster.ts';
+import { ContainerManager } from '../containers/container-manager.ts';
+import { GpuDetector } from '../hardware/gpu-detector.ts';
+import { logger } from '../logger.ts';
+import { ModelRegistry } from '../models/registry.ts';
+import { ModelLoader } from '../models/loader.ts';
+import { CLUSTER } from '../constants.ts';
+import { filterOutUsedGpus, summarizeGpuTopologyGroups } from './placement.ts';
+import { ClusterManager } from './cluster-manager.ts';
+
+/**
+ * Glues the cluster state held by `ClusterManager` to the local container
+ * manager, model registry and model loader: it builds and sends heartbeats,
+ * resolves where a model is served, and triggers local or remote deployments.
+ */
+export class ClusterCoordinator {
+  private clusterManager: ClusterManager;
+  private containerManager: ContainerManager;
+  private modelRegistry: ModelRegistry;
+  private modelLoader: ModelLoader;
+  private gpuDetector: GpuDetector;
+
+  constructor(
+    clusterManager: ClusterManager,
+    containerManager: ContainerManager,
+    modelRegistry: ModelRegistry,
+    modelLoader: ModelLoader,
+  ) {
+    this.clusterManager = clusterManager;
+    this.containerManager = containerManager;
+    this.modelRegistry = modelRegistry;
+    this.modelLoader = modelLoader;
+    this.gpuDetector = new GpuDetector();
+  }
+
+  /**
+   * Builds a heartbeat describing this node: detected GPU resources and one
+   * deployment entry per available model.
+   *
+   * Every deployment entry advertises the node-level `endpoint` passed in and
+   * is reported with engine `vllm`; it is healthy when any of the model's
+   * endpoints is healthy.
+   *
+   * @param endpoint URL under which this node is reachable.
+   */
+  public async buildLocalHeartbeat(endpoint: string): Promise<IClusterNodeHeartbeat> {
+    const [gpus, statuses, models] = await Promise.all([
+      this.gpuDetector.detectGpus(),
+      this.containerManager.getAllStatus(),
+      this.containerManager.getAllAvailableModels(),
+    ]);
+
+    const deploymentCount = Array.from(statuses.values()).filter((status) => status.running).length;
+    const runningContainers = this.containerManager.getAllContainers().filter((container) => {
+      const status = statuses.get(container.getConfig().id);
+      return status?.running === true;
+    });
+    const resources = await this.buildResourceSummary(
+      gpus,
+      deploymentCount,
+      models,
+      runningContainers,
+    );
+
+    return {
+      nodeName: this.clusterManager.getConfig().nodeName,
+      role: this.clusterManager.getConfig().role,
+      endpoint,
+      healthy: true,
+      resources,
+      deployments: Array.from(models.entries()).map(([modelId, endpoints]) => ({
+        modelId,
+        engine: 'vllm' as const,
+        endpoint,
+        healthy: endpoints.some((entry) => entry.healthy),
+        containerId: endpoints[0]?.containerId,
+      })),
+      lastSeenAt: Date.now(),
+    };
+  }
+
+  /** Builds a fresh local heartbeat, stores it in the cluster manager, and returns it. */
+  public async syncLocalState(endpoint: string): Promise<IClusterNodeHeartbeat> {
+    const heartbeat = await this.buildLocalHeartbeat(endpoint);
+    this.clusterManager.updateLocalNode(heartbeat);
+    return heartbeat;
+  }
+
+  /**
+   * Refreshes the local state and posts it to the control plane.
+   *
+   * Does nothing when clustering is disabled, when the advertised endpoint or
+   * control-plane URL is missing, or when both are the same URL. Transport
+   * errors and non-2xx replies are logged as warnings and not rethrown.
+   */
+  public async sendHeartbeat(): Promise<void> {
+    if (!this.clusterManager.isEnabled()) {
+      return;
+    }
+
+    const endpoint = this.clusterManager.getAdvertisedEndpoint();
+    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
+    if (!endpoint || !controlPlaneUrl) {
+      return;
+    }
+
+    if (controlPlaneUrl === endpoint) {
+      return;
+    }
+
+    const heartbeat = await this.syncLocalState(endpoint);
+
+    try {
+      const response = await fetch(`${controlPlaneUrl}/_cluster/nodes/heartbeat`, {
+        method: 'POST',
+        headers: this.buildClusterHeaders(),
+        body: JSON.stringify(heartbeat),
+      });
+
+      // The reply payload is not used; cancel the body so it does not stay open.
+      await response.body?.cancel();
+
+      if (!response.ok) {
+        // fetch only rejects on transport errors, so a rejected heartbeat
+        // (for example a bad shared secret) has to be detected here.
+        logger.warn(`Cluster heartbeat failed: HTTP ${response.status}`);
+      }
+    } catch (error) {
+      logger.warn(
+        `Cluster heartbeat failed: ${error instanceof Error ? error.message : String(error)}`,
+      );
+    }
+  }
+
+  /** Stores a heartbeat received from another node. */
+  public acceptHeartbeat(heartbeat: IClusterNodeHeartbeat): void {
+    this.clusterManager.upsertNode(heartbeat);
+  }
+
+  public getStatus(): IClusterStatusResponse {
+    return this.clusterManager.getStatus();
+  }
+
+  public getDesiredDeployments(): IClusterDesiredDeployment[] {
+    return this.clusterManager.getDesiredDeployments();
+  }
+
+  public getLocalNodeName(): string {
+    return this.clusterManager.getConfig().nodeName;
+  }
+
+  public getSharedSecret(): string | undefined {
+    return this.clusterManager.getSharedSecret();
+  }
+
+  public setNodeSchedulerState(
+    nodeName: string,
+    schedulerState: TClusterNodeSchedulerState,
+  ): TClusterNodeSchedulerState {
+    return this.clusterManager.setNodeSchedulerState(nodeName, schedulerState);
+  }
+
+  /**
+   * Sets the desired replica count for a model.
+   *
+   * @returns null when the model is unknown to the registry. For a count of
+   *   zero or less the stored entry is removed and a synthetic entry with
+   *   `desiredReplicas: 0` is returned; otherwise the stored entry is returned.
+   */
+  public async setDesiredReplicas(
+    modelName: string,
+    desiredReplicas: number,
+  ): Promise<IClusterDesiredDeployment | null> {
+    const model = await this.modelRegistry.getModel(modelName);
+    if (!model) {
+      return null;
+    }
+
+    if (desiredReplicas <= 0) {
+      this.clusterManager.removeDesiredDeployment(model.id);
+      return {
+        modelId: model.id,
+        desiredReplicas: 0,
+        updatedAt: Date.now(),
+      };
+    }
+
+    // Non-positive counts returned above, so no clamp is needed here.
+    return this.clusterManager.upsertDesiredDeployment(model.id, desiredReplicas);
+  }
+
+  /** Removes the desired deployment of a model; false when the model or the entry does not exist. */
+  public async clearDesiredDeployment(modelName: string): Promise<boolean> {
+    const model = await this.modelRegistry.getModel(modelName);
+    if (!model) {
+      return false;
+    }
+
+    return this.clusterManager.removeDesiredDeployment(model.id);
+  }
+
+  /** True when clustering is off, this node is the control plane, or no control-plane URL is set. */
+  public shouldDeployLocallyFirst(): boolean {
+    if (!this.clusterManager.isEnabled()) {
+      return true;
+    }
+
+    return this.clusterManager.isControlPlane() || !this.clusterManager.getControlPlaneUrl();
+  }
+
+  /** True when clustering is off or this node is the control plane. */
+  public canManageClusterState(): boolean {
+    return !this.clusterManager.isEnabled() || this.clusterManager.isControlPlane();
+  }
+
+  /**
+   * Looks up where a model is currently served.
+   *
+   * @returns null when the model is unknown or has no location; otherwise the
+   *   preferred location with `created: false`.
+   */
+  public async resolveModel(modelName: string): Promise<IClusterEnsureResponse | null> {
+    const model = await this.modelRegistry.getModel(modelName);
+    if (!model) {
+      return null;
+    }
+
+    const location = this.clusterManager.resolveModel(model.id);
+    if (!location) {
+      return null;
+    }
+
+    return {
+      model: model.id,
+      location,
+      created: false,
+    };
+  }
+
+  /**
+   * Makes sure a model is served somewhere and returns its location.
+   *
+   * Records the model as desired, reuses an existing location on an active
+   * node when there is one, and otherwise deploys: locally when this node is
+   * not an enabled control plane, or on the node chosen by the cluster manager
+   * (local or remote) when it is.
+   *
+   * @returns null when the model is unknown, no node qualifies, or deployment fails.
+   */
+  public async ensureModel(modelName: string): Promise<IClusterEnsureResponse | null> {
+    const model = await this.modelRegistry.getModel(modelName);
+    if (!model) {
+      return null;
+    }
+
+    this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
+
+    const existing = this.clusterManager.getActiveModelLocations(model.id)[0];
+    if (existing) {
+      return {
+        model: model.id,
+        location: existing,
+        created: false,
+      };
+    }
+
+    if (!this.clusterManager.isEnabled() || !this.clusterManager.isControlPlane()) {
+      return this.deployModelLocally(model.id);
+    }
+
+    const targetNode = this.clusterManager.pickNodeForModel(model);
+    if (!targetNode) {
+      return null;
+    }
+
+    if (targetNode.nodeName === this.clusterManager.getConfig().nodeName) {
+      return this.deployModelLocally(model.id);
+    }
+
+    return this.requestRemoteDeployment(targetNode.endpoint, model.id);
+  }
+
+  /**
+   * Asks the control plane to ensure a model; falls back to `ensureModel` when
+   * no control-plane URL is set or it equals this node's advertised endpoint.
+   *
+   * @returns the control plane's reply, or null when the request fails.
+   */
+  public async ensureModelViaControlPlane(
+    modelName: string,
+  ): Promise<IClusterEnsureResponse | null> {
+    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
+    const localEndpoint = this.clusterManager.getAdvertisedEndpoint();
+
+    if (!controlPlaneUrl || controlPlaneUrl === localEndpoint) {
+      return this.ensureModel(modelName);
+    }
+
+    return this.postClusterJson<IClusterEnsureResponse>(
+      `${controlPlaneUrl}/_cluster/models/ensure`,
+      { model: modelName },
+      'Cluster ensure request',
+    );
+  }
+
+  /**
+   * Loads a model on this node, refreshes the local heartbeat, and resolves
+   * the model's location.
+   *
+   * @returns null when loading fails or the model cannot be resolved afterwards.
+   */
+  public async deployModelLocally(modelName: string): Promise<IClusterEnsureResponse | null> {
+    const model = await this.modelRegistry.getModel(modelName);
+    if (model) {
+      this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
+    }
+
+    const result = await this.modelLoader.loadModel(modelName);
+    if (!result.success) {
+      return null;
+    }
+
+    return this.publishLocalDeployment(result.model, !result.alreadyLoaded);
+  }
+
+  /**
+   * Deploys missing replicas for every desired deployment.
+   *
+   * Runs only when clustering is off or this node is the control plane. For
+   * each model the number of locations on active nodes is compared with the
+   * desired count; deployment for a model stops at the first failure or when
+   * no node qualifies.
+   */
+  public async reconcileDesiredReplicas(): Promise<void> {
+    if (this.clusterManager.isEnabled() && !this.clusterManager.isControlPlane()) {
+      return;
+    }
+
+    const desiredDeployments = this.clusterManager.getDesiredDeployments();
+    for (const desiredDeployment of desiredDeployments) {
+      if (desiredDeployment.desiredReplicas <= 0) {
+        continue;
+      }
+
+      const model = await this.modelRegistry.getModel(desiredDeployment.modelId);
+      if (!model) {
+        continue;
+      }
+
+      const existingLocations = this.clusterManager.getActiveModelLocations(model.id);
+      const missingReplicas = desiredDeployment.desiredReplicas - existingLocations.length;
+      if (missingReplicas <= 0) {
+        continue;
+      }
+
+      for (let index = 0; index < missingReplicas; index++) {
+        // NOTE(review): no excludedNodeNames are passed, so nodes that already
+        // host this model stay candidates — confirm this is intended.
+        const targetNode = this.clusterManager.pickNodeForModel(model);
+        if (!targetNode) {
+          break;
+        }
+
+        const replicaOrdinal = existingLocations.length + index;
+        const result = targetNode.nodeName === this.clusterManager.getConfig().nodeName
+          ? await this.deployReplicaLocally(model.id, replicaOrdinal)
+          : await this.requestRemoteDeployment(targetNode.endpoint, model.id, replicaOrdinal);
+
+        if (!result) {
+          break;
+        }
+      }
+    }
+  }
+
+  /**
+   * Deploys one replica of a model on this node, refreshes the local
+   * heartbeat, and resolves the model's location.
+   *
+   * The desired replica count is raised to at least `replicaOrdinal + 1`.
+   *
+   * @returns null when deployment fails or the model cannot be resolved afterwards.
+   */
+  public async deployReplicaLocally(
+    modelName: string,
+    replicaOrdinal?: number,
+  ): Promise<IClusterEnsureResponse | null> {
+    const model = await this.modelRegistry.getModel(modelName);
+    if (model) {
+      this.rememberDesiredDeployment(
+        model.id,
+        Math.max((replicaOrdinal ?? 0) + 1, model.launchDefaults?.replicas || 1),
+      );
+    }
+
+    const result = await this.modelLoader.deployReplica(modelName, replicaOrdinal);
+    if (!result.success) {
+      return null;
+    }
+
+    return this.publishLocalDeployment(result.model, !result.alreadyLoaded);
+  }
+
+  /** Refreshes the local heartbeat (when an endpoint is known) and resolves the deployed model. */
+  private async publishLocalDeployment(
+    modelName: string,
+    created: boolean,
+  ): Promise<IClusterEnsureResponse | null> {
+    const endpoint = this.clusterManager.getAdvertisedEndpoint();
+    if (endpoint) {
+      await this.syncLocalState(endpoint);
+    }
+
+    const resolved = await this.resolveModel(modelName);
+    if (!resolved) {
+      return null;
+    }
+
+    return {
+      ...resolved,
+      created,
+    };
+  }
+
+  /** Asks another node to deploy a model (optionally a specific replica ordinal). */
+  private async requestRemoteDeployment(
+    nodeEndpoint: string,
+    modelName: string,
+    replicaOrdinal?: number,
+  ): Promise<IClusterEnsureResponse | null> {
+    return this.postClusterJson<IClusterEnsureResponse>(
+      `${nodeEndpoint}/_cluster/deployments`,
+      { model: modelName, replicaOrdinal },
+      'Cluster remote deployment request',
+    );
+  }
+
+  /**
+   * POSTs a JSON payload with the cluster headers and parses the JSON reply.
+   *
+   * @param action Label used in the warning logged on failure.
+   * @returns null on a transport error, a non-2xx status, or an unparsable
+   *   reply; each case is logged as a warning.
+   */
+  private async postClusterJson<T>(
+    url: string,
+    payload: unknown,
+    action: string,
+  ): Promise<T | null> {
+    try {
+      const response = await fetch(url, {
+        method: 'POST',
+        headers: this.buildClusterHeaders(),
+        body: JSON.stringify(payload),
+      });
+
+      if (!response.ok) {
+        // The error body is not read; cancel it so it does not stay open.
+        await response.body?.cancel();
+        logger.warn(`${action} failed: HTTP ${response.status}`);
+        return null;
+      }
+
+      return await response.json() as T;
+    } catch (error) {
+      logger.warn(
+        `${action} failed: ${error instanceof Error ? error.message : String(error)}`,
+      );
+      return null;
+    }
+  }
+
+  /**
+   * Summarizes GPU capacity for the heartbeat.
+   *
+   * GPUs assigned to running containers are treated as unavailable. VRAM sums
+   * are divided by 1024 and rounded (presumably MiB to GiB — confirm against
+   * the GPU detector).
+   */
+  private async buildResourceSummary(
+    gpus: Awaited<ReturnType<GpuDetector['detectGpus']>>,
+    deploymentCount: number,
+    _models: Awaited<ReturnType<ContainerManager['getAllAvailableModels']>>,
+    runningContainers: ReturnType<ContainerManager['getAllContainers']>,
+  ): Promise<IClusterNodeResources> {
+    const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
+    const usedGpuIds = runningContainers.flatMap((container) => container.getConfig().gpuIds);
+    const availableGpus = filterOutUsedGpus(gpus, usedGpuIds);
+    const topologyGroups = summarizeGpuTopologyGroups(availableGpus);
+    const availableVramGb = Math.round(
+      availableGpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024,
+    );
+
+    const maxSingleGpuVramGb = availableGpus.length > 0
+      ? Math.max(...availableGpus.map((gpu) => Math.round(gpu.vram / 1024)))
+      : 0;
+    const largestGpuGroupCount = topologyGroups.length > 0
+      ? Math.max(...topologyGroups.map((group) => group.gpuCount))
+      : 0;
+    const largestGpuGroupVramGb = topologyGroups.length > 0
+      ? Math.max(...topologyGroups.map((group) => group.totalVramGb))
+      : 0;
+
+    return {
+      gpuCount: gpus.length,
+      totalVramGb,
+      availableVramGb,
+      maxSingleGpuVramGb,
+      largestGpuGroupCount,
+      largestGpuGroupVramGb,
+      deploymentCount,
+      topologyGroups,
+    };
+  }
+
+  /** JSON content type plus the shared-secret header when a secret is configured. */
+  private buildClusterHeaders(): Record<string, string> {
+    const headers: Record<string, string> = {
+      'Content-Type': 'application/json',
+    };
+
+    const sharedSecret = this.clusterManager.getSharedSecret();
+    if (sharedSecret) {
+      headers[CLUSTER.AUTH_HEADER_NAME] = sharedSecret;
+    }
+
+    return headers;
+  }
+
+  /** Raises (never lowers) the desired replica count of a model to at least `minimumReplicas` and 1. */
+  private rememberDesiredDeployment(modelId: string, minimumReplicas: number): void {
+    const existing = this.clusterManager.getDesiredDeployment(modelId);
+    const desiredReplicas = Math.max(existing?.desiredReplicas || 0, minimumReplicas, 1);
+    this.clusterManager.upsertDesiredDeployment(modelId, desiredReplicas);
+  }
+}
@@ -0,0 +1,2 @@
+export { ClusterManager } from './cluster-manager.ts';
+export { ClusterCoordinator } from './coordinator.ts';
@@ -0,0 +1,114 @@
+import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
+import type { IGpuInfo, TGpuVendor } from '../interfaces/gpu.ts';
+import type { IClusterGpuTopologyGroup } from '../interfaces/cluster.ts';
+
+/**
+ * Extracts the PCI bus number from a GPU's bus id (or slot, when the bus id
+ * is empty). The bus is the two-hex-digit field before a colon, optionally
+ * preceded by a four-hex-digit domain. Falls back to the GPU's index when the
+ * address does not match that shape.
+ */
+function parsePciBusNumber(gpu: IGpuInfo): number {
+  const address = gpu.pciBusId || gpu.pciSlot;
+  const busHex = address.match(/(?:[0-9a-f]{4}:)?([0-9a-f]{2}):/i)?.[1];
+
+  return busHex === undefined ? gpu.index : parseInt(busHex, 16);
+}
+
+/**
+ * Groups GPUs into topology groups.
+ *
+ * GPUs are ordered by vendor and then by PCI bus number. A GPU joins the
+ * preceding group when it has the same vendor and its bus number is at most
+ * one above the last bus number in that group; otherwise it starts a new
+ * group. Per-GPU VRAM is divided by 1024 and rounded before it is summed.
+ */
+export function buildGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
+  // Resolve each bus number once up front, then order by vendor and bus.
+  const ordered = gpus
+    .map((gpu) => ({ gpu, bus: parsePciBusNumber(gpu) }))
+    .sort((a, b) =>
+      a.gpu.vendor !== b.gpu.vendor ? a.gpu.vendor.localeCompare(b.gpu.vendor) : a.bus - b.bus
+    );
+
+  const result: IClusterGpuTopologyGroup[] = [];
+
+  for (const { gpu, bus } of ordered) {
+    const vramGb = Math.round(gpu.vram / 1024);
+    const tail = result[result.length - 1];
+    const tailBus = tail?.busNumbers[tail.busNumbers.length - 1];
+
+    if (tail && tail.vendor === gpu.vendor && tailBus !== undefined && bus - tailBus <= 1) {
+      // Adjacent to the current group: extend it and keep its totals in step.
+      tail.gpuIds.push(gpu.id);
+      tail.busNumbers.push(bus);
+      tail.totalVramGb += vramGb;
+      tail.maxSingleGpuVramGb = Math.max(tail.maxSingleGpuVramGb, vramGb);
+      tail.gpuCount = tail.gpuIds.length;
+    } else {
+      result.push({
+        id: `${gpu.vendor}-${result.length + 1}`,
+        vendor: gpu.vendor,
+        gpuIds: [gpu.id],
+        gpuCount: 1,
+        totalVramGb: vramGb,
+        maxSingleGpuVramGb: vramGb,
+        busNumbers: [bus],
+      });
+    }
+  }
+
+  return result;
+}
+
+/** Returns the topology groups for the given GPUs; delegates to `buildGpuTopologyGroups`. */
+export function summarizeGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
+  const summary = buildGpuTopologyGroups(gpus);
+  return summary;
+}
+
+/**
+ * Chooses the GPUs on which to place a model.
+ *
+ * A topology group is eligible when it has at least the required GPU count
+ * and total VRAM. Eligible groups are ranked by how close their GPU count is
+ * to the preferred tensor parallel size, then by the smallest VRAM surplus,
+ * then by group id. The placement uses the first GPUs of the best group, up
+ * to the preferred tensor parallel size.
+ *
+ * @returns null when no group is eligible.
+ */
+export function selectPlacementForModel(
+  model: IModelCatalogEntry,
+  gpus: IGpuInfo[],
+): { gpuIds: string[]; tensorParallelSize: number; topologyGroupId: string } | null {
+  const { minVramGb } = model.requirements;
+  const minGpuCount = model.requirements.minGpuCount || 1;
+  const preferredTensorParallel = model.launchDefaults?.tensorParallelSize || minGpuCount;
+
+  const countDistance = (group: IClusterGpuTopologyGroup): number =>
+    Math.abs(group.gpuCount - preferredTensorParallel);
+  const vramSurplus = (group: IClusterGpuTopologyGroup): number => group.totalVramGb - minVramGb;
+
+  const ranked = buildGpuTopologyGroups(gpus)
+    .filter((group) => group.gpuCount >= minGpuCount && group.totalVramGb >= minVramGb)
+    .sort((a, b) =>
+      countDistance(a) - countDistance(b) ||
+      vramSurplus(a) - vramSurplus(b) ||
+      a.id.localeCompare(b.id)
+    );
+
+  const best = ranked[0];
+  if (best === undefined) {
+    return null;
+  }
+
+  // NOTE(review): eligibility is checked against the whole group's VRAM, but
+  // only the first `tensorParallelSize` GPUs are returned — confirm the subset
+  // is expected to satisfy the VRAM requirement.
+  const tensorParallelSize = Math.min(preferredTensorParallel, best.gpuCount);
+
+  return {
+    gpuIds: best.gpuIds.slice(0, tensorParallelSize),
+    tensorParallelSize,
+    topologyGroupId: best.id,
+  };
+}
+
+/** Returns the GPUs whose id is not listed in `usedGpuIds`, preserving input order. */
+export function filterOutUsedGpus(gpus: IGpuInfo[], usedGpuIds: string[]): IGpuInfo[] {
+  const inUse = new Set(usedGpuIds);
+  const free: IGpuInfo[] = [];
+
+  for (const gpu of gpus) {
+    if (!inUse.has(gpu.id)) {
+      free.push(gpu);
+    }
+  }
+
+  return free;
+}