439 lines
13 KiB
TypeScript
439 lines
13 KiB
TypeScript
import type {
|
|
IClusterDesiredDeployment,
|
|
IClusterEnsureResponse,
|
|
IClusterNodeHeartbeat,
|
|
IClusterNodeResources,
|
|
IClusterStatusResponse,
|
|
TClusterNodeSchedulerState,
|
|
} from '../interfaces/cluster.ts';
|
|
import { ContainerManager } from '../containers/container-manager.ts';
|
|
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
|
import { logger } from '../logger.ts';
|
|
import { ModelRegistry } from '../models/registry.ts';
|
|
import { ModelLoader } from '../models/loader.ts';
|
|
import { CLUSTER } from '../constants.ts';
|
|
import { filterOutUsedGpus, summarizeGpuTopologyGroups } from './placement.ts';
|
|
import { ClusterManager } from './cluster-manager.ts';
|
|
|
|
/**
 * Coordinates cluster-level model placement for the local node.
 *
 * Sits between the cluster manager's bookkeeping (nodes, desired deployments,
 * model locations) and the local container/model machinery: it builds and
 * publishes this node's heartbeat, records desired replica counts, and deploys
 * models either locally or by asking another node over HTTP.
 */
export class ClusterCoordinator {
  private readonly clusterManager: ClusterManager;
  private readonly containerManager: ContainerManager;
  private readonly modelRegistry: ModelRegistry;
  private readonly modelLoader: ModelLoader;
  private readonly gpuDetector: GpuDetector;

  /**
   * @param clusterManager Source of cluster configuration and node/deployment state.
   * @param containerManager Provides container status and per-model endpoints.
   * @param modelRegistry Resolves model names to registry entries.
   * @param modelLoader Loads models / deploys replicas on this node.
   */
  constructor(
    clusterManager: ClusterManager,
    containerManager: ContainerManager,
    modelRegistry: ModelRegistry,
    modelLoader: ModelLoader,
  ) {
    this.clusterManager = clusterManager;
    this.containerManager = containerManager;
    this.modelRegistry = modelRegistry;
    this.modelLoader = modelLoader;
    this.gpuDetector = new GpuDetector();
  }

  /**
   * Assembles the heartbeat payload describing this node.
   *
   * GPU inventory, container statuses and available models are fetched in
   * parallel. `healthy` is always reported as `true`, and every deployment is
   * labelled with engine `'vllm'` and the node-level `endpoint`.
   *
   * @param endpoint Endpoint under which this node is advertised.
   * @returns The heartbeat, stamped with the current time.
   */
  public async buildLocalHeartbeat(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const [gpus, statuses, models] = await Promise.all([
      this.gpuDetector.detectGpus(),
      this.containerManager.getAllStatus(),
      this.containerManager.getAllAvailableModels(),
    ]);

    const deploymentCount = Array.from(statuses.values()).filter((status) => status.running).length;
    // Only containers whose status is explicitly running count as GPU consumers.
    const runningContainers = this.containerManager.getAllContainers().filter(
      (container) => statuses.get(container.getConfig().id)?.running === true,
    );
    const resources = await this.buildResourceSummary(
      gpus,
      deploymentCount,
      models,
      runningContainers,
    );

    return {
      nodeName: this.clusterManager.getConfig().nodeName,
      role: this.clusterManager.getConfig().role,
      endpoint,
      healthy: true,
      resources,
      deployments: Array.from(models.entries()).map(([modelId, endpoints]) => ({
        modelId,
        engine: 'vllm' as const,
        endpoint,
        // A model is healthy if at least one of its endpoints is.
        healthy: endpoints.some((entry) => entry.healthy),
        containerId: endpoints[0]?.containerId,
      })),
      lastSeenAt: Date.now(),
    };
  }

  /**
   * Builds a fresh heartbeat and stores it as the local node's state in the
   * cluster manager.
   *
   * @param endpoint Endpoint under which this node is advertised.
   * @returns The heartbeat that was stored.
   */
  public async syncLocalState(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const heartbeat = await this.buildLocalHeartbeat(endpoint);
    this.clusterManager.updateLocalNode(heartbeat);
    return heartbeat;
  }

  /**
   * Refreshes local state and posts the heartbeat to the control plane.
   *
   * Does nothing when clustering is disabled, when either the advertised
   * endpoint or the control-plane URL is missing, or when this node *is* the
   * control plane (both URLs are equal). Delivery problems - network errors
   * as well as non-2xx answers - are logged as warnings and never thrown.
   * Errors raised while building the local heartbeat still propagate.
   */
  public async sendHeartbeat(): Promise<void> {
    if (!this.clusterManager.isEnabled()) {
      return;
    }

    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    if (!endpoint || !controlPlaneUrl) {
      return;
    }

    if (controlPlaneUrl === endpoint) {
      return;
    }

    const heartbeat = await this.syncLocalState(endpoint);

    try {
      const response = await fetch(`${controlPlaneUrl}/_cluster/nodes/heartbeat`, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify(heartbeat),
      });

      if (!response.ok) {
        // fetch() only rejects on network failure; a 4xx/5xx must be checked explicitly.
        logger.warn(`Cluster heartbeat rejected by control plane: HTTP ${response.status}`);
      }
      // The payload is not needed; release it so the connection is not held open.
      await this.discardResponseBody(response);
    } catch (error) {
      logger.warn(
        `Cluster heartbeat failed: ${error instanceof Error ? error.message : String(error)}`,
      );
    }
  }

  /** Records a heartbeat received from a node in the cluster manager. */
  public acceptHeartbeat(heartbeat: IClusterNodeHeartbeat): void {
    this.clusterManager.upsertNode(heartbeat);
  }

  /** Returns the cluster status as reported by the cluster manager. */
  public getStatus(): IClusterStatusResponse {
    return this.clusterManager.getStatus();
  }

  /** Returns all desired deployments known to the cluster manager. */
  public getDesiredDeployments(): IClusterDesiredDeployment[] {
    return this.clusterManager.getDesiredDeployments();
  }

  /** Returns the configured name of this node. */
  public getLocalNodeName(): string {
    return this.clusterManager.getConfig().nodeName;
  }

  /** Returns the cluster shared secret, if one is configured. */
  public getSharedSecret(): string | undefined {
    return this.clusterManager.getSharedSecret();
  }

  /**
   * Sets the scheduler state of a node via the cluster manager.
   *
   * @returns Whatever state the cluster manager reports back.
   */
  public setNodeSchedulerState(
    nodeName: string,
    schedulerState: TClusterNodeSchedulerState,
  ): TClusterNodeSchedulerState {
    return this.clusterManager.setNodeSchedulerState(nodeName, schedulerState);
  }

  /**
   * Sets the desired replica count for a model.
   *
   * A count of zero or less removes the desired deployment and returns a
   * synthetic entry with `desiredReplicas: 0`.
   *
   * @returns The resulting desired deployment, or `null` if the model is not
   *   in the registry.
   */
  public async setDesiredReplicas(
    modelName: string,
    desiredReplicas: number,
  ): Promise<IClusterDesiredDeployment | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    if (desiredReplicas <= 0) {
      this.clusterManager.removeDesiredDeployment(model.id);
      return {
        modelId: model.id,
        desiredReplicas: 0,
        updatedAt: Date.now(),
      };
    }

    return this.clusterManager.upsertDesiredDeployment(model.id, Math.max(desiredReplicas, 0));
  }

  /**
   * Removes the desired deployment of a model.
   *
   * @returns `false` if the model is not in the registry; otherwise the
   *   result of the cluster manager's removal.
   */
  public async clearDesiredDeployment(modelName: string): Promise<boolean> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return false;
    }

    return this.clusterManager.removeDesiredDeployment(model.id);
  }

  /**
   * Whether deployments should be attempted on this node first: true when
   * clustering is disabled, when this node is the control plane, or when no
   * control-plane URL is configured.
   */
  public shouldDeployLocallyFirst(): boolean {
    if (!this.clusterManager.isEnabled()) {
      return true;
    }

    return this.clusterManager.isControlPlane() || !this.clusterManager.getControlPlaneUrl();
  }

  /** Whether this node may change cluster state: clustering disabled, or this node is the control plane. */
  public canManageClusterState(): boolean {
    return !this.clusterManager.isEnabled() || this.clusterManager.isControlPlane();
  }

  /**
   * Looks up where a model is currently served, without deploying anything.
   *
   * @returns The location with `created: false`, or `null` if the model is
   *   unknown to the registry or the cluster manager has no location for it.
   */
  public async resolveModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    const location = this.clusterManager.resolveModel(model.id);
    if (!location) {
      return null;
    }

    return {
      model: model.id,
      location,
      created: false,
    };
  }

  /**
   * Makes sure a model is served somewhere and returns its location.
   *
   * The desired deployment is recorded first. If an active location already
   * exists it is returned as-is. Otherwise the model is deployed locally when
   * clustering is disabled or this node is not the control plane; on the
   * control plane a target node is picked and the model is deployed there
   * (locally or through a remote request).
   *
   * @returns The location, or `null` if the model is unknown, no node could
   *   be picked, or the deployment failed.
   */
  public async ensureModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);

    const existing = this.clusterManager.getActiveModelLocations(model.id)[0];
    if (existing) {
      return {
        model: model.id,
        location: existing,
        created: false,
      };
    }

    if (!this.clusterManager.isEnabled() || !this.clusterManager.isControlPlane()) {
      return this.deployModelLocally(model.id);
    }

    const targetNode = this.clusterManager.pickNodeForModel(model);
    if (!targetNode) {
      return null;
    }

    if (targetNode.nodeName === this.clusterManager.getConfig().nodeName) {
      return this.deployModelLocally(model.id);
    }

    return this.requestRemoteDeployment(targetNode.endpoint, model.id);
  }

  /**
   * Asks the control plane to ensure a model; falls back to `ensureModel`
   * when no control-plane URL is configured or this node is the control plane.
   *
   * @returns The control plane's answer, or `null` when the request fails or
   *   is answered with a non-2xx status (both are logged as warnings).
   */
  public async ensureModelViaControlPlane(
    modelName: string,
  ): Promise<IClusterEnsureResponse | null> {
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    const localEndpoint = this.clusterManager.getAdvertisedEndpoint();

    if (!controlPlaneUrl || controlPlaneUrl === localEndpoint) {
      return this.ensureModel(modelName);
    }

    return this.postEnsureRequest(
      `${controlPlaneUrl}/_cluster/models/ensure`,
      { model: modelName },
      'Control-plane ensure request',
    );
  }

  /**
   * Loads a model on this node and reports its resolved location.
   *
   * The desired deployment is only recorded when the registry knows the
   * model; loading is attempted with the given name either way.
   *
   * @returns The location with `created` set to whether the model was newly
   *   loaded, or `null` if loading failed or the model cannot be resolved.
   */
  public async deployModelLocally(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
    }

    const result = await this.modelLoader.loadModel(modelName);
    if (!result.success) {
      return null;
    }

    return this.finalizeLocalDeployment(result);
  }

  /**
   * Deploys missing replicas for every desired deployment.
   *
   * No-op on nodes that have clustering enabled but are not the control
   * plane. Replicas are deployed one at a time; work on a model stops as soon
   * as no node can be picked or a deployment attempt yields `null`.
   */
  public async reconcileDesiredReplicas(): Promise<void> {
    if (this.clusterManager.isEnabled() && !this.clusterManager.isControlPlane()) {
      return;
    }

    const desiredDeployments = this.clusterManager.getDesiredDeployments();
    for (const desiredDeployment of desiredDeployments) {
      if (desiredDeployment.desiredReplicas <= 0) {
        continue;
      }

      const model = await this.modelRegistry.getModel(desiredDeployment.modelId);
      if (!model) {
        continue;
      }

      const existingLocations = this.clusterManager.getActiveModelLocations(model.id);
      const missingReplicas = desiredDeployment.desiredReplicas - existingLocations.length;
      if (missingReplicas <= 0) {
        continue;
      }

      for (let index = 0; index < missingReplicas; index++) {
        const targetNode = this.clusterManager.pickNodeForModel(model);
        if (!targetNode) {
          break;
        }

        // New replicas are numbered after the ones that were active when this pass started.
        const replicaOrdinal = existingLocations.length + index;
        const result = targetNode.nodeName === this.clusterManager.getConfig().nodeName
          ? await this.deployReplicaLocally(model.id, replicaOrdinal)
          : await this.requestRemoteDeployment(targetNode.endpoint, model.id, replicaOrdinal);

        if (!result) {
          break;
        }
      }
    }
  }

  /**
   * Deploys one replica of a model on this node.
   *
   * When the registry knows the model, the desired replica count is raised to
   * cover at least `replicaOrdinal + 1` and the model's launch default.
   *
   * @param replicaOrdinal Zero-based replica index passed on to the loader.
   * @returns The location with `created` set to whether the replica was newly
   *   started, or `null` if deployment failed or the model cannot be resolved.
   */
  public async deployReplicaLocally(
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(
        model.id,
        Math.max((replicaOrdinal ?? 0) + 1, model.launchDefaults?.replicas || 1),
      );
    }

    const result = await this.modelLoader.deployReplica(modelName, replicaOrdinal);
    if (!result.success) {
      return null;
    }

    return this.finalizeLocalDeployment(result);
  }

  /**
   * Shared tail of the local deploy paths: republishes local state (when an
   * advertised endpoint exists) and resolves the loaded model's location.
   */
  private async finalizeLocalDeployment(
    result: { model: string; alreadyLoaded?: boolean },
  ): Promise<IClusterEnsureResponse | null> {
    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    if (endpoint) {
      // Sync first so that resolveModel below can see the new deployment.
      await this.syncLocalState(endpoint);
    }

    const resolved = await this.resolveModel(result.model);
    if (!resolved) {
      return null;
    }

    return {
      ...resolved,
      created: !result.alreadyLoaded,
    };
  }

  /**
   * Asks another node to deploy a model (optionally a specific replica).
   *
   * @returns The remote node's answer, or `null` on failure (logged).
   */
  private async requestRemoteDeployment(
    nodeEndpoint: string,
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    return this.postEnsureRequest(
      `${nodeEndpoint}/_cluster/deployments`,
      { model: modelName, replicaOrdinal },
      'Remote deployment request',
    );
  }

  /**
   * POSTs a JSON payload to a cluster endpoint and parses the answer.
   *
   * Failures never throw: network errors, non-2xx statuses and unparsable
   * bodies are logged as warnings and reported as `null`.
   * NOTE(review): the parsed body is trusted to match IClusterEnsureResponse;
   * it is not validated here.
   *
   * @param action Human-readable label used in the warning message.
   */
  private async postEnsureRequest(
    url: string,
    payload: unknown,
    action: string,
  ): Promise<IClusterEnsureResponse | null> {
    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify(payload),
      });

      if (!response.ok) {
        logger.warn(`${action} to ${url} failed: HTTP ${response.status}`);
        await this.discardResponseBody(response);
        return null;
      }

      return await response.json() as IClusterEnsureResponse;
    } catch (error) {
      logger.warn(
        `${action} to ${url} failed: ${error instanceof Error ? error.message : String(error)}`,
      );
      return null;
    }
  }

  /** Cancels an unread response body so the underlying resources are released. */
  private async discardResponseBody(response: Response): Promise<void> {
    try {
      await response.body?.cancel();
    } catch {
      // Cleanup is best-effort; a failed cancel must not mask the request outcome.
    }
  }

  /**
   * Summarises GPU capacity for the heartbeat.
   *
   * GPUs referenced by running containers are treated as unavailable. VRAM
   * figures are divided by 1024 and rounded - presumably MiB to GiB; confirm
   * against GpuDetector.
   *
   * @param _models Unused; kept so the call signature stays stable.
   */
  private async buildResourceSummary(
    gpus: Awaited<ReturnType<GpuDetector['detectGpus']>>,
    deploymentCount: number,
    _models: Awaited<ReturnType<ContainerManager['getAllAvailableModels']>>,
    runningContainers: ReturnType<ContainerManager['getAllContainers']>,
  ): Promise<IClusterNodeResources> {
    const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
    const usedGpuIds = runningContainers.flatMap((container) => container.getConfig().gpuIds);
    const availableGpus = filterOutUsedGpus(gpus, usedGpuIds);
    const topologyGroups = summarizeGpuTopologyGroups(availableGpus);
    const availableVramGb = Math.round(
      availableGpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024,
    );

    // Math.max() of an empty list is -Infinity, hence the explicit zero fallbacks.
    const maxSingleGpuVramGb = availableGpus.length > 0
      ? Math.max(...availableGpus.map((gpu) => Math.round(gpu.vram / 1024)))
      : 0;
    const largestGpuGroupCount = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.gpuCount))
      : 0;
    const largestGpuGroupVramGb = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.totalVramGb))
      : 0;

    return {
      gpuCount: gpus.length,
      totalVramGb,
      availableVramGb,
      maxSingleGpuVramGb,
      largestGpuGroupCount,
      largestGpuGroupVramGb,
      deploymentCount,
      topologyGroups,
    };
  }

  /** Builds JSON request headers, adding the cluster auth header when a shared secret is set. */
  private buildClusterHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    const sharedSecret = this.clusterManager.getSharedSecret();
    if (sharedSecret) {
      headers[CLUSTER.AUTH_HEADER_NAME] = sharedSecret;
    }

    return headers;
  }

  /**
   * Raises (never lowers) the desired replica count of a model to at least
   * `minimumReplicas`, and at least 1.
   */
  private rememberDesiredDeployment(modelId: string, minimumReplicas: number): void {
    const existing = this.clusterManager.getDesiredDeployment(modelId);
    const desiredReplicas = Math.max(existing?.desiredReplicas || 0, minimumReplicas, 1);
    this.clusterManager.upsertDesiredDeployment(modelId, desiredReplicas);
  }
}
|