feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
@@ -0,0 +1,438 @@
|
||||
import type {
|
||||
IClusterDesiredDeployment,
|
||||
IClusterEnsureResponse,
|
||||
IClusterNodeHeartbeat,
|
||||
IClusterNodeResources,
|
||||
IClusterStatusResponse,
|
||||
TClusterNodeSchedulerState,
|
||||
} from '../interfaces/cluster.ts';
|
||||
import { ContainerManager } from '../containers/container-manager.ts';
|
||||
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { ModelRegistry } from '../models/registry.ts';
|
||||
import { ModelLoader } from '../models/loader.ts';
|
||||
import { CLUSTER } from '../constants.ts';
|
||||
import { filterOutUsedGpus, summarizeGpuTopologyGroups } from './placement.ts';
|
||||
import { ClusterManager } from './cluster-manager.ts';
|
||||
|
||||
/**
 * Glue between the cluster bookkeeping (`ClusterManager`) and the local
 * runtime (`ContainerManager`, `ModelRegistry`, `ModelLoader`, `GpuDetector`).
 *
 * It assembles and publishes heartbeats for this node, records desired replica
 * counts, and deploys models either locally or by calling the `/_cluster/*`
 * HTTP endpoints of another node.
 */
export class ClusterCoordinator {
  private readonly clusterManager: ClusterManager;
  private readonly containerManager: ContainerManager;
  private readonly modelRegistry: ModelRegistry;
  private readonly modelLoader: ModelLoader;
  private readonly gpuDetector: GpuDetector;

  constructor(
    clusterManager: ClusterManager,
    containerManager: ContainerManager,
    modelRegistry: ModelRegistry,
    modelLoader: ModelLoader,
  ) {
    this.clusterManager = clusterManager;
    this.containerManager = containerManager;
    this.modelRegistry = modelRegistry;
    this.modelLoader = modelLoader;
    this.gpuDetector = new GpuDetector();
  }

  /**
   * Assembles this node's heartbeat from the detected GPUs, the container
   * statuses and the models currently served by local containers.
   *
   * The heartbeat is always reported with `healthy: true`, and every
   * deployment entry is tagged with engine `'vllm'` and the node endpoint.
   *
   * @param endpoint Endpoint advertised for this node and its deployments.
   * @returns The heartbeat, stamped with the current time.
   */
  public async buildLocalHeartbeat(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const [gpus, statuses, models] = await Promise.all([
      this.gpuDetector.detectGpus(),
      this.containerManager.getAllStatus(),
      this.containerManager.getAllAvailableModels(),
    ]);

    const deploymentCount = Array.from(statuses.values()).filter((status) => status.running).length;
    const runningContainers = this.containerManager.getAllContainers().filter((container) => {
      const status = statuses.get(container.getConfig().id);
      return status?.running === true;
    });
    const resources = await this.buildResourceSummary(
      gpus,
      deploymentCount,
      models,
      runningContainers,
    );

    const config = this.clusterManager.getConfig();
    return {
      nodeName: config.nodeName,
      role: config.role,
      endpoint,
      healthy: true,
      resources,
      deployments: Array.from(models.entries()).map(([modelId, endpoints]) => ({
        modelId,
        engine: 'vllm' as const,
        endpoint,
        // A model counts as healthy as soon as one serving endpoint is healthy.
        healthy: endpoints.some((entry) => entry.healthy),
        containerId: endpoints[0]?.containerId,
      })),
      lastSeenAt: Date.now(),
    };
  }

  /**
   * Builds a fresh heartbeat and stores it as the local node state in the
   * cluster manager.
   *
   * @param endpoint Endpoint advertised for this node.
   * @returns The heartbeat that was stored.
   */
  public async syncLocalState(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const heartbeat = await this.buildLocalHeartbeat(endpoint);
    this.clusterManager.updateLocalNode(heartbeat);
    return heartbeat;
  }

  /**
   * Refreshes the local node state and POSTs it to the control plane.
   *
   * Does nothing when clustering is disabled, when either the advertised
   * endpoint or the control-plane URL is missing, or when this node is itself
   * the control plane. Transport errors and non-2xx answers are logged as
   * warnings and never thrown; a failure while building the local heartbeat
   * still propagates to the caller.
   */
  public async sendHeartbeat(): Promise<void> {
    if (!this.clusterManager.isEnabled()) {
      return;
    }

    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    if (!endpoint || !controlPlaneUrl) {
      return;
    }

    // The control plane must not heartbeat to itself; compare without trailing
    // slashes so "http://host:1" and "http://host:1/" are the same node.
    if (ClusterCoordinator.isSameEndpoint(controlPlaneUrl, endpoint)) {
      return;
    }

    const heartbeat = await this.syncLocalState(endpoint);

    try {
      const response = await fetch(
        ClusterCoordinator.joinUrl(controlPlaneUrl, '/_cluster/nodes/heartbeat'),
        {
          method: 'POST',
          headers: this.buildClusterHeaders(),
          body: JSON.stringify(heartbeat),
        },
      );

      if (!response.ok) {
        // fetch only rejects on transport errors; an HTTP error status (for
        // example a rejected shared secret) has to be surfaced explicitly.
        logger.warn(`Cluster heartbeat failed: HTTP ${response.status} ${response.statusText}`);
      }

      // The body is not needed; cancel it so the connection can be released.
      await response.body?.cancel();
    } catch (error) {
      logger.warn(
        `Cluster heartbeat failed: ${error instanceof Error ? error.message : String(error)}`,
      );
    }
  }

  /** Records a heartbeat received from a node in the cluster manager. */
  public acceptHeartbeat(heartbeat: IClusterNodeHeartbeat): void {
    this.clusterManager.upsertNode(heartbeat);
  }

  /** Returns the cluster status as reported by the cluster manager. */
  public getStatus(): IClusterStatusResponse {
    return this.clusterManager.getStatus();
  }

  /** Returns the desired deployments known to the cluster manager. */
  public getDesiredDeployments(): IClusterDesiredDeployment[] {
    return this.clusterManager.getDesiredDeployments();
  }

  /** Returns the configured name of the local node. */
  public getLocalNodeName(): string {
    return this.clusterManager.getConfig().nodeName;
  }

  /** Returns the cluster shared secret, if one is configured. */
  public getSharedSecret(): string | undefined {
    return this.clusterManager.getSharedSecret();
  }

  /**
   * Sets the scheduler state of a node.
   *
   * @returns The value returned by the cluster manager for that node.
   */
  public setNodeSchedulerState(
    nodeName: string,
    schedulerState: TClusterNodeSchedulerState,
  ): TClusterNodeSchedulerState {
    return this.clusterManager.setNodeSchedulerState(nodeName, schedulerState);
  }

  /**
   * Sets the desired replica count for a model.
   *
   * Fractional counts are rounded down. A resulting count of zero or less
   * removes the desired deployment and returns a synthetic record with
   * `desiredReplicas: 0`.
   *
   * @param modelName Name resolved through the model registry.
   * @param desiredReplicas Requested number of replicas.
   * @returns The stored (or removed) desired deployment, or `null` when the
   *   model is unknown or `desiredReplicas` is not a finite number; in both
   *   of those cases the stored state is left untouched.
   */
  public async setDesiredReplicas(
    modelName: string,
    desiredReplicas: number,
  ): Promise<IClusterDesiredDeployment | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    // NaN / Infinity would slip past a "<= 0" test and be persisted as a
    // nonsensical replica count, so reject them without touching state.
    if (!Number.isFinite(desiredReplicas)) {
      return null;
    }

    const replicas = Math.floor(desiredReplicas);
    if (replicas <= 0) {
      this.clusterManager.removeDesiredDeployment(model.id);
      return {
        modelId: model.id,
        desiredReplicas: 0,
        updatedAt: Date.now(),
      };
    }

    return this.clusterManager.upsertDesiredDeployment(model.id, replicas);
  }

  /**
   * Removes the desired deployment of a model.
   *
   * @returns `false` when the model is unknown, otherwise the result of the
   *   cluster manager's removal.
   */
  public async clearDesiredDeployment(modelName: string): Promise<boolean> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return false;
    }

    return this.clusterManager.removeDesiredDeployment(model.id);
  }

  /**
   * Returns `true` when clustering is disabled, when this node is the control
   * plane, or when no control-plane URL is configured.
   */
  public shouldDeployLocallyFirst(): boolean {
    if (!this.clusterManager.isEnabled()) {
      return true;
    }

    return this.clusterManager.isControlPlane() || !this.clusterManager.getControlPlaneUrl();
  }

  /** Returns `true` when clustering is disabled or this node is the control plane. */
  public canManageClusterState(): boolean {
    return !this.clusterManager.isEnabled() || this.clusterManager.isControlPlane();
  }

  /**
   * Looks up where a model is currently served, without deploying anything.
   *
   * @returns The location with `created: false`, or `null` when the model is
   *   unknown or the cluster manager cannot resolve a location for it.
   */
  public async resolveModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    const location = this.clusterManager.resolveModel(model.id);
    if (!location) {
      return null;
    }

    return {
      model: model.id,
      location,
      created: false,
    };
  }

  /**
   * Makes sure a model is served somewhere and returns its location.
   *
   * The desired deployment is recorded first. An already active location is
   * reused; otherwise the model is deployed locally (clustering disabled or
   * this node is not the control plane) or on the node picked by the cluster
   * manager, which may be this node or a remote one.
   *
   * @returns The location, or `null` when the model is unknown, no node can
   *   be picked, or the deployment fails.
   */
  public async ensureModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);

    const existing = this.clusterManager.getActiveModelLocations(model.id)[0];
    if (existing) {
      return {
        model: model.id,
        location: existing,
        created: false,
      };
    }

    if (!this.clusterManager.isEnabled() || !this.clusterManager.isControlPlane()) {
      return this.deployModelLocally(model.id);
    }

    const targetNode = this.clusterManager.pickNodeForModel(model);
    if (!targetNode) {
      return null;
    }

    if (targetNode.nodeName === this.getLocalNodeName()) {
      return this.deployModelLocally(model.id);
    }

    return this.requestRemoteDeployment(targetNode.endpoint, model.id);
  }

  /**
   * Asks the control plane to ensure a model, or handles the request locally
   * when no control plane is configured or this node is the control plane.
   *
   * @returns The control plane's answer, or `null` when the request fails
   *   (failures are logged as warnings).
   */
  public async ensureModelViaControlPlane(
    modelName: string,
  ): Promise<IClusterEnsureResponse | null> {
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    const localEndpoint = this.clusterManager.getAdvertisedEndpoint();

    if (!controlPlaneUrl || ClusterCoordinator.isSameEndpoint(controlPlaneUrl, localEndpoint)) {
      return this.ensureModel(modelName);
    }

    return this.postEnsureRequest(controlPlaneUrl, '/_cluster/models/ensure', {
      model: modelName,
    });
  }

  /**
   * Loads a model on this node through the model loader.
   *
   * When the model is known to the registry its desired deployment is
   * recorded first; the load is attempted either way.
   *
   * @returns The resolved location with `created` set to whether the model
   *   was newly loaded, or `null` when loading fails or the model cannot be
   *   resolved afterwards.
   */
  public async deployModelLocally(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
    }

    const result = await this.modelLoader.loadModel(modelName);
    if (!result.success) {
      return null;
    }

    return this.finalizeLocalDeployment(result.model, !result.alreadyLoaded);
  }

  /**
   * Deploys missing replicas so that every desired deployment reaches its
   * desired replica count.
   *
   * Does nothing on a cluster node that is not the control plane. For a given
   * model, deployment stops at the first replica that cannot be placed or
   * fails to deploy; the remaining models are still processed.
   */
  public async reconcileDesiredReplicas(): Promise<void> {
    if (this.clusterManager.isEnabled() && !this.clusterManager.isControlPlane()) {
      return;
    }

    const desiredDeployments = this.clusterManager.getDesiredDeployments();
    for (const desiredDeployment of desiredDeployments) {
      if (desiredDeployment.desiredReplicas <= 0) {
        continue;
      }

      const model = await this.modelRegistry.getModel(desiredDeployment.modelId);
      if (!model) {
        continue;
      }

      const existingLocations = this.clusterManager.getActiveModelLocations(model.id);
      const missingReplicas = desiredDeployment.desiredReplicas - existingLocations.length;
      if (missingReplicas <= 0) {
        continue;
      }

      for (let index = 0; index < missingReplicas; index++) {
        const targetNode = this.clusterManager.pickNodeForModel(model);
        if (!targetNode) {
          break;
        }

        // New replicas are numbered after the ones that already exist.
        const replicaOrdinal = existingLocations.length + index;
        const result = targetNode.nodeName === this.getLocalNodeName()
          ? await this.deployReplicaLocally(model.id, replicaOrdinal)
          : await this.requestRemoteDeployment(targetNode.endpoint, model.id, replicaOrdinal);

        if (!result) {
          break;
        }
      }
    }
  }

  /**
   * Deploys one replica of a model on this node through the model loader.
   *
   * When the model is known to the registry, the desired replica count is
   * raised to at least `replicaOrdinal + 1` (and at least the model's launch
   * default) before deploying.
   *
   * @param modelName Model to deploy.
   * @param replicaOrdinal Zero-based ordinal of the replica, if any.
   * @returns The resolved location with `created` set to whether the replica
   *   was newly loaded, or `null` when deployment fails or the model cannot
   *   be resolved afterwards.
   */
  public async deployReplicaLocally(
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(
        model.id,
        Math.max((replicaOrdinal ?? 0) + 1, model.launchDefaults?.replicas || 1),
      );
    }

    const result = await this.modelLoader.deployReplica(modelName, replicaOrdinal);
    if (!result.success) {
      return null;
    }

    return this.finalizeLocalDeployment(result.model, !result.alreadyLoaded);
  }

  /**
   * Shared tail of the local deployment paths: re-syncs the local node state
   * (only when an endpoint is advertised) and resolves the model's location.
   */
  private async finalizeLocalDeployment(
    loadedModelName: string,
    created: boolean,
  ): Promise<IClusterEnsureResponse | null> {
    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    if (endpoint) {
      await this.syncLocalState(endpoint);
    }

    const resolved = await this.resolveModel(loadedModelName);
    if (!resolved) {
      return null;
    }

    return {
      ...resolved,
      created,
    };
  }

  /** Asks another node to deploy a model (optionally a specific replica). */
  private async requestRemoteDeployment(
    nodeEndpoint: string,
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    return this.postEnsureRequest(nodeEndpoint, '/_cluster/deployments', {
      model: modelName,
      replicaOrdinal,
    });
  }

  /**
   * POSTs a JSON payload to a cluster endpoint and returns the parsed answer.
   *
   * Never throws: transport errors, non-2xx statuses and non-object payloads
   * are logged as warnings and reported as `null`.
   */
  private async postEnsureRequest(
    baseUrl: string,
    path: string,
    payload: Record<string, unknown>,
  ): Promise<IClusterEnsureResponse | null> {
    const url = ClusterCoordinator.joinUrl(baseUrl, path);

    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify(payload),
      });

      if (!response.ok) {
        logger.warn(`Cluster request to ${url} failed: HTTP ${response.status}`);
        // Body is irrelevant on failure; cancel it so the connection is released.
        await response.body?.cancel();
        return null;
      }

      const parsed: unknown = await response.json();
      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
        logger.warn(`Cluster request to ${url} returned an unexpected payload`);
        return null;
      }

      // NOTE(review): only the top-level shape is checked; the fields are
      // trusted to match IClusterEnsureResponse.
      return parsed as IClusterEnsureResponse;
    } catch (error) {
      logger.warn(
        `Cluster request to ${url} failed: ${error instanceof Error ? error.message : String(error)}`,
      );
      return null;
    }
  }

  /**
   * Summarises the node's GPU resources.
   *
   * GPUs referenced by running containers are treated as unavailable. VRAM
   * figures are `gpu.vram / 1024`, rounded (presumably MiB to GiB; confirm
   * against `GpuDetector`). `_models` is accepted but not used.
   */
  private async buildResourceSummary(
    gpus: Awaited<ReturnType<GpuDetector['detectGpus']>>,
    deploymentCount: number,
    _models: Awaited<ReturnType<ContainerManager['getAllAvailableModels']>>,
    runningContainers: ReturnType<ContainerManager['getAllContainers']>,
  ): Promise<IClusterNodeResources> {
    const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
    const usedGpuIds = runningContainers.flatMap((container) => container.getConfig().gpuIds);
    const availableGpus = filterOutUsedGpus(gpus, usedGpuIds);
    const topologyGroups = summarizeGpuTopologyGroups(availableGpus);
    const availableVramGb = Math.round(
      availableGpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024,
    );

    // Math.max() of an empty list is -Infinity, hence the explicit 0 fallbacks.
    const maxSingleGpuVramGb = availableGpus.length > 0
      ? Math.max(...availableGpus.map((gpu) => Math.round(gpu.vram / 1024)))
      : 0;
    const largestGpuGroupCount = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.gpuCount))
      : 0;
    const largestGpuGroupVramGb = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.totalVramGb))
      : 0;

    return {
      gpuCount: gpus.length,
      totalVramGb,
      availableVramGb,
      maxSingleGpuVramGb,
      largestGpuGroupCount,
      largestGpuGroupVramGb,
      deploymentCount,
      topologyGroups,
    };
  }

  /**
   * Builds the headers for node-to-node requests: JSON content type plus the
   * shared secret under `CLUSTER.AUTH_HEADER_NAME` when one is configured.
   */
  private buildClusterHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    const sharedSecret = this.clusterManager.getSharedSecret();
    if (sharedSecret) {
      headers[CLUSTER.AUTH_HEADER_NAME] = sharedSecret;
    }

    return headers;
  }

  /**
   * Raises (never lowers) the desired replica count of a model to at least
   * `minimumReplicas`, and to at least 1.
   */
  private rememberDesiredDeployment(modelId: string, minimumReplicas: number): void {
    const existing = this.clusterManager.getDesiredDeployment(modelId);
    const desiredReplicas = Math.max(existing?.desiredReplicas || 0, minimumReplicas, 1);
    this.clusterManager.upsertDesiredDeployment(modelId, desiredReplicas);
  }

  /** Removes trailing slashes so endpoints can be compared and joined safely. */
  private static stripTrailingSlashes(url: string): string {
    return url.replace(/\/+$/, '');
  }

  /** Joins a base URL and an absolute path without producing a double slash. */
  private static joinUrl(baseUrl: string, path: string): string {
    return `${ClusterCoordinator.stripTrailingSlashes(baseUrl)}${path}`;
  }

  /**
   * Returns `true` when both endpoints are non-empty strings that are equal
   * once trailing slashes are ignored.
   */
  private static isSameEndpoint(left: string, right: string | null | undefined): boolean {
    if (typeof right !== 'string' || right.length === 0) {
      return false;
    }

    return ClusterCoordinator.stripTrailingSlashes(left) ===
      ClusterCoordinator.stripTrailingSlashes(right);
  }
}
|
||||
Reference in New Issue
Block a user