feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
@@ -0,0 +1,456 @@
|
||||
import os from 'node:os';
|
||||
import * as fs from 'node:fs/promises';
|
||||
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
|
||||
import type {
|
||||
IClusterConfig,
|
||||
IClusterDesiredDeployment,
|
||||
IClusterGpuTopologyGroup,
|
||||
IClusterModelLocation,
|
||||
IClusterNodeHeartbeat,
|
||||
IClusterNodeStatus,
|
||||
IClusterStatusResponse,
|
||||
TClusterNodeSchedulerState,
|
||||
} from '../interfaces/cluster.ts';
|
||||
import { CLUSTER, PATHS } from '../constants.ts';
|
||||
|
||||
/**
 * In-memory registry of cluster membership and control-plane intent.
 *
 * Holds the latest heartbeat of every known node, the desired model
 * deployments and the per-node scheduler states. Node heartbeats are mirrored
 * to `cluster-state.json` and the control data to `cluster-control-state.json`
 * under `PATHS.DATA_DIR`; persistence is best-effort and never throws.
 */
export class ClusterManager {
  private config: IClusterConfig = {
    enabled: false,
    nodeName: os.hostname(),
    role: 'standalone',
    bindHost: CLUSTER.DEFAULT_BIND_HOST,
    gossipPort: CLUSTER.DEFAULT_GOSSIP_PORT,
    heartbeatIntervalMs: CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS,
    seedNodes: [],
  };
  private localNode: IClusterNodeHeartbeat | null = null;
  private knownNodes = new Map<string, IClusterNodeHeartbeat>();
  private desiredDeployments = new Map<string, IClusterDesiredDeployment>();
  private nodeSchedulerStates = new Map<string, TClusterNodeSchedulerState>();
  private persistQueued = false;
  private controlPersistQueued = false;
  // Tails of the per-file write queues. Each new write is chained onto the
  // previous one so that two writeFile() calls never target the same file
  // concurrently.
  private stateWriteChain: Promise<void> = Promise.resolve();
  private controlWriteChain: Promise<void> = Promise.resolve();

  /**
   * Loads previously persisted node and control state into memory.
   *
   * Any failure while reading or parsing either file (including the file not
   * existing) is swallowed and treated as "nothing persisted yet". A node whose
   * name equals the currently configured `nodeName` is restored as the local
   * node, and stale nodes are pruned after the node file has been read.
   */
  public async initialize(): Promise<void> {
    try {
      const stateContent = await fs.readFile(this.getStateFilePath(), 'utf-8');
      const data = JSON.parse(stateContent) as { nodes?: IClusterNodeHeartbeat[] };

      for (const node of data.nodes || []) {
        this.knownNodes.set(node.nodeName, node);
        if (node.nodeName === this.config.nodeName) {
          this.localNode = node;
        }
      }

      this.pruneStaleNodes();
    } catch {
      // No persisted cluster state yet.
    }

    try {
      const controlStateContent = await fs.readFile(this.getControlStateFilePath(), 'utf-8');
      const data = JSON.parse(controlStateContent) as {
        desiredDeployments?: IClusterDesiredDeployment[];
        nodeSchedulerStates?: Record<string, TClusterNodeSchedulerState>;
      };

      for (const deployment of data.desiredDeployments || []) {
        this.desiredDeployments.set(deployment.modelId, deployment);
      }

      for (const [nodeName, schedulerState] of Object.entries(data.nodeSchedulerStates || {})) {
        this.nodeSchedulerStates.set(nodeName, schedulerState);
      }
    } catch {
      // No persisted control state yet.
    }
  }

  /**
   * Replaces the active configuration.
   *
   * A falsy `heartbeatIntervalMs` falls back to the default interval and a
   * missing `seedNodes` becomes an empty list.
   */
  public configure(config: IClusterConfig): void {
    this.config = {
      ...config,
      heartbeatIntervalMs: config.heartbeatIntervalMs || CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS,
      seedNodes: config.seedNodes || [],
    };
  }

  /** Returns the active configuration object (not a copy). */
  public getConfig(): IClusterConfig {
    return this.config;
  }

  /** Whether cluster mode is enabled in the configuration. */
  public isEnabled(): boolean {
    return this.config.enabled;
  }

  /** True when cluster mode is enabled and this node's role is `control-plane`. */
  public isControlPlane(): boolean {
    return this.config.enabled && this.config.role === 'control-plane';
  }

  /** True when cluster mode is enabled and this node's role is `worker`. */
  public isWorker(): boolean {
    return this.config.enabled && this.config.role === 'worker';
  }

  /** Returns `'standalone'` when clustering is disabled, otherwise the configured role. */
  public getModeLabel(): string {
    if (!this.config.enabled) {
      return 'standalone';
    }

    return this.config.role;
  }

  /** Returns the configured heartbeat interval, or the default when it is falsy. */
  public getHeartbeatIntervalMs(): number {
    return this.config.heartbeatIntervalMs || CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS;
  }

  /** Endpoint of the last local heartbeat, falling back to the configured `advertiseUrl`. */
  public getAdvertisedEndpoint(): string | undefined {
    return this.localNode?.endpoint || this.config.advertiseUrl;
  }

  /** Returns the configured control-plane URL, if any. */
  public getControlPlaneUrl(): string | undefined {
    return this.config.controlPlaneUrl;
  }

  /** Returns the configured shared secret; an empty value is reported as `undefined`. */
  public getSharedSecret(): string | undefined {
    return this.config.sharedSecret || undefined;
  }

  /** Stores `heartbeat` as the local node, registers it among known nodes and schedules a persist. */
  public updateLocalNode(heartbeat: IClusterNodeHeartbeat): void {
    this.localNode = heartbeat;
    this.knownNodes.set(heartbeat.nodeName, heartbeat);
    this.schedulePersist();
  }

  /** Inserts or replaces the heartbeat stored under `heartbeat.nodeName` and schedules a persist. */
  public upsertNode(heartbeat: IClusterNodeHeartbeat): void {
    this.knownNodes.set(heartbeat.nodeName, heartbeat);
    this.schedulePersist();
  }

  /** Builds a status summary for this node; `healthy` is always reported as `true`. */
  public getLocalNodeStatus(): IClusterNodeStatus {
    return {
      nodeName: this.config.nodeName,
      role: this.config.role,
      endpoint: this.getAdvertisedEndpoint(),
      healthy: true,
      schedulerState: this.getNodeSchedulerState(this.config.nodeName),
    };
  }

  /** Returns the last stored local heartbeat (undecorated), or `null`. */
  public getLocalNode(): IClusterNodeHeartbeat | null {
    return this.localNode;
  }

  /** Returns the known node with that name, decorated with its scheduler state, or `null`. */
  public getNode(nodeName: string): IClusterNodeHeartbeat | null {
    const node = this.knownNodes.get(nodeName);
    if (!node) {
      return null;
    }

    return this.decorateNode(node);
  }

  /**
   * Drops every remote node whose `lastSeenAt` is older than
   * `CLUSTER.NODE_STALE_AFTER_MS`. The entry named like the local node is
   * never removed. A persist is scheduled for each removal.
   */
  public pruneStaleNodes(): void {
    const now = Date.now();
    for (const [nodeName, node] of this.knownNodes) {
      if (nodeName === this.config.nodeName) {
        continue;
      }

      if (now - node.lastSeenAt > CLUSTER.NODE_STALE_AFTER_MS) {
        this.knownNodes.delete(nodeName);
        this.schedulePersist();
      }
    }
  }

  /**
   * Prunes stale nodes, then returns all remaining nodes decorated with their
   * scheduler state. The local node sorts first; the rest are ordered by name.
   */
  public getAllNodes(): IClusterNodeHeartbeat[] {
    this.pruneStaleNodes();
    return Array.from(this.knownNodes.values()).map((node) => this.decorateNode(node)).sort(
      (left, right) => {
        if (left.nodeName === this.config.nodeName) {
          return -1;
        }
        if (right.nodeName === this.config.nodeName) {
          return 1;
        }
        return left.nodeName.localeCompare(right.nodeName);
      },
    );
  }

  /** Returns the nodes from {@link getAllNodes} whose heartbeat reports `healthy`. */
  public getHealthyNodes(): IClusterNodeHeartbeat[] {
    return this.getAllNodes().filter((node) => node.healthy);
  }

  /** Returns the scheduler state recorded for the node, defaulting to `'active'`. */
  public getNodeSchedulerState(nodeName: string): TClusterNodeSchedulerState {
    return this.nodeSchedulerStates.get(nodeName) || 'active';
  }

  /** Records the scheduler state for a node, schedules a control persist and returns the state. */
  public setNodeSchedulerState(
    nodeName: string,
    schedulerState: TClusterNodeSchedulerState,
  ): TClusterNodeSchedulerState {
    this.nodeSchedulerStates.set(nodeName, schedulerState);
    this.scheduleControlPersist();
    return schedulerState;
  }

  /** Returns all desired deployments sorted by model id. */
  public getDesiredDeployments(): IClusterDesiredDeployment[] {
    return Array.from(this.desiredDeployments.values()).sort((left, right) =>
      left.modelId.localeCompare(right.modelId)
    );
  }

  /** Returns the desired deployment for a model id, or `null` when none is recorded. */
  public getDesiredDeployment(modelId: string): IClusterDesiredDeployment | null {
    return this.desiredDeployments.get(modelId) || null;
  }

  /**
   * Creates or replaces the desired deployment of a model, stamping it with
   * the current time, and schedules a control persist.
   */
  public upsertDesiredDeployment(
    modelId: string,
    desiredReplicas: number,
  ): IClusterDesiredDeployment {
    const deployment: IClusterDesiredDeployment = {
      modelId,
      desiredReplicas,
      updatedAt: Date.now(),
    };
    this.desiredDeployments.set(modelId, deployment);
    this.scheduleControlPersist();
    return deployment;
  }

  /** Deletes a desired deployment; returns whether one existed (persisting only in that case). */
  public removeDesiredDeployment(modelId: string): boolean {
    const removed = this.desiredDeployments.delete(modelId);
    if (removed) {
      this.scheduleControlPersist();
    }
    return removed;
  }

  /**
   * Lists every healthy deployment of `modelId` found on healthy nodes,
   * regardless of the node's scheduler state.
   */
  public getModelLocations(modelId: string): IClusterModelLocation[] {
    const locations: IClusterModelLocation[] = [];

    for (const node of this.getHealthyNodes()) {
      for (const deployment of node.deployments) {
        if (deployment.modelId !== modelId || !deployment.healthy) {
          continue;
        }

        locations.push({
          modelId,
          nodeName: node.nodeName,
          endpoint: deployment.endpoint,
          healthy: deployment.healthy,
          engine: deployment.engine,
          containerId: deployment.containerId,
        });
      }
    }

    return locations;
  }

  /** Like {@link getModelLocations}, restricted to nodes whose scheduler state is `'active'`. */
  public getActiveModelLocations(modelId: string): IClusterModelLocation[] {
    return this.getModelLocations(modelId).filter((location) =>
      this.getNodeSchedulerState(location.nodeName) === 'active'
    );
  }

  /**
   * Picks the preferred location serving `modelId`, or `null` when there is none.
   *
   * Preference order: scheduler state (active, then cordoned, then draining),
   * then the local node, then node name.
   */
  public resolveModel(modelId: string): IClusterModelLocation | null {
    const locations = this.getModelLocations(modelId);
    if (locations.length === 0) {
      return null;
    }

    locations.sort((left, right) => {
      const schedulerPreference = this.compareSchedulerState(
        this.getNodeSchedulerState(left.nodeName),
        this.getNodeSchedulerState(right.nodeName),
      );
      if (schedulerPreference !== 0) {
        return schedulerPreference;
      }

      if (left.nodeName === this.config.nodeName) {
        return -1;
      }
      if (right.nodeName === this.config.nodeName) {
        return 1;
      }
      return left.nodeName.localeCompare(right.nodeName);
    });

    return locations[0];
  }

  /**
   * Chooses a node able to host `model`, or `null` when no node qualifies.
   *
   * A node qualifies when it is healthy, not listed in `excludedNodeNames`,
   * not a remote standalone node, has scheduler state `'active'`, reports
   * enough available VRAM and has at least one topology group satisfying the
   * GPU-count and VRAM requirements. Among candidates the local node wins;
   * otherwise more available VRAM, then the largest GPU group closest to the
   * preferred tensor-parallel size, then fewer deployments.
   */
  public pickNodeForModel(
    model: IModelCatalogEntry,
    excludedNodeNames: string[] = [],
  ): IClusterNodeHeartbeat | null {
    const requiredVram = model.requirements.minVramGb;
    const minGpuCount = model.requirements.minGpuCount || 1;
    const preferredTensorParallel = model.launchDefaults?.tensorParallelSize || minGpuCount;

    const eligible = this.getHealthyNodes().filter((node) => {
      if (excludedNodeNames.includes(node.nodeName)) {
        return false;
      }

      if (node.role === 'standalone' && node.nodeName !== this.config.nodeName) {
        return false;
      }

      if (node.schedulerState && node.schedulerState !== 'active') {
        return false;
      }

      return node.resources.availableVramGb >= requiredVram &&
        this.hasEligibleTopologyGroup(node.resources.topologyGroups, requiredVram, minGpuCount);
    });

    if (eligible.length === 0) {
      return null;
    }

    eligible.sort((left, right) => {
      if (left.nodeName === this.config.nodeName) {
        return -1;
      }
      if (right.nodeName === this.config.nodeName) {
        return 1;
      }

      if (right.resources.availableVramGb !== left.resources.availableVramGb) {
        return right.resources.availableVramGb - left.resources.availableVramGb;
      }

      const leftTopologyDelta = Math.abs(
        left.resources.largestGpuGroupCount - preferredTensorParallel,
      );
      const rightTopologyDelta = Math.abs(
        right.resources.largestGpuGroupCount - preferredTensorParallel,
      );
      if (leftTopologyDelta !== rightTopologyDelta) {
        return leftTopologyDelta - rightTopologyDelta;
      }

      return left.resources.deploymentCount - right.resources.deploymentCount;
    });

    return eligible[0];
  }

  /**
   * Builds the cluster status snapshot: the decorated local node, all nodes,
   * every deployment reported by healthy nodes grouped by model id (including
   * unhealthy deployments), and the desired deployments.
   */
  public getStatus(): IClusterStatusResponse {
    const models: Record<string, IClusterModelLocation[]> = {};
    for (const node of this.getHealthyNodes()) {
      for (const deployment of node.deployments) {
        if (!models[deployment.modelId]) {
          models[deployment.modelId] = [];
        }

        models[deployment.modelId].push({
          modelId: deployment.modelId,
          nodeName: node.nodeName,
          endpoint: deployment.endpoint,
          healthy: deployment.healthy,
          engine: deployment.engine,
          containerId: deployment.containerId,
        });
      }
    }

    return {
      localNode: this.localNode ? this.decorateNode(this.localNode) : null,
      nodes: this.getAllNodes(),
      models,
      desiredDeployments: this.getDesiredDeployments(),
    };
  }

  /** True when at least one group has enough GPUs and enough total VRAM. */
  private hasEligibleTopologyGroup(
    groups: IClusterGpuTopologyGroup[],
    requiredVramGb: number,
    minGpuCount: number,
  ): boolean {
    return groups.some((group) =>
      group.gpuCount >= minGpuCount && group.totalVramGb >= requiredVramGb
    );
  }

  /** Path of the JSON file holding the known node heartbeats. */
  private getStateFilePath(): string {
    return `${PATHS.DATA_DIR}/cluster-state.json`;
  }

  /** Path of the JSON file holding desired deployments and scheduler states. */
  private getControlStateFilePath(): string {
    return `${PATHS.DATA_DIR}/cluster-control-state.json`;
  }

  /**
   * Coalesces node-state changes made in the same tick into one write.
   *
   * The write is appended to `stateWriteChain` rather than started directly,
   * so it only begins after any earlier write of the same file has settled.
   */
  private schedulePersist(): void {
    if (this.persistQueued) {
      return;
    }

    this.persistQueued = true;
    queueMicrotask(() => {
      this.persistQueued = false;
      this.stateWriteChain = this.stateWriteChain.then(() => this.persistState());
    });
  }

  /**
   * Coalesces control-state changes made in the same tick into one write,
   * serialized through `controlWriteChain` in the same way as
   * {@link schedulePersist}.
   */
  private scheduleControlPersist(): void {
    if (this.controlPersistQueued) {
      return;
    }

    this.controlPersistQueued = true;
    queueMicrotask(() => {
      this.controlPersistQueued = false;
      this.controlWriteChain = this.controlWriteChain.then(() => this.persistControlState());
    });
  }

  /** Writes the known nodes to disk; every error is swallowed. */
  private async persistState(): Promise<void> {
    try {
      await fs.mkdir(PATHS.DATA_DIR, { recursive: true });
      await fs.writeFile(
        this.getStateFilePath(),
        JSON.stringify({ nodes: Array.from(this.knownNodes.values()) }, null, 2),
      );
    } catch {
      // Persistence failure should not break the control plane.
    }
  }

  /** Writes desired deployments and scheduler states to disk; every error is swallowed. */
  private async persistControlState(): Promise<void> {
    try {
      await fs.mkdir(PATHS.DATA_DIR, { recursive: true });
      await fs.writeFile(
        this.getControlStateFilePath(),
        JSON.stringify(
          {
            desiredDeployments: this.getDesiredDeployments(),
            nodeSchedulerStates: Object.fromEntries(this.nodeSchedulerStates.entries()),
          },
          null,
          2,
        ),
      );
    } catch {
      // Persistence failure should not break the control plane.
    }
  }

  /** Returns a copy of `node` with `schedulerState` set from the control state. */
  private decorateNode(node: IClusterNodeHeartbeat): IClusterNodeHeartbeat {
    return {
      ...node,
      schedulerState: this.getNodeSchedulerState(node.nodeName),
    };
  }

  /** Sort comparator ranking `active` before `cordoned` before `draining`. */
  private compareSchedulerState(
    left: TClusterNodeSchedulerState,
    right: TClusterNodeSchedulerState,
  ): number {
    const order: TClusterNodeSchedulerState[] = ['active', 'cordoned', 'draining'];
    return order.indexOf(left) - order.indexOf(right);
  }
}
|
||||
@@ -0,0 +1,438 @@
|
||||
import type {
|
||||
IClusterDesiredDeployment,
|
||||
IClusterEnsureResponse,
|
||||
IClusterNodeHeartbeat,
|
||||
IClusterNodeResources,
|
||||
IClusterStatusResponse,
|
||||
TClusterNodeSchedulerState,
|
||||
} from '../interfaces/cluster.ts';
|
||||
import { ContainerManager } from '../containers/container-manager.ts';
|
||||
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { ModelRegistry } from '../models/registry.ts';
|
||||
import { ModelLoader } from '../models/loader.ts';
|
||||
import { CLUSTER } from '../constants.ts';
|
||||
import { filterOutUsedGpus, summarizeGpuTopologyGroups } from './placement.ts';
|
||||
import { ClusterManager } from './cluster-manager.ts';
|
||||
|
||||
/**
 * Glue between the cluster registry ({@link ClusterManager}) and the local
 * runtime (containers, model registry/loader, GPU detection).
 *
 * Builds and sends heartbeats, records desired replica counts and deploys
 * models either locally or by calling another node's `/_cluster/*` HTTP API.
 */
export class ClusterCoordinator {
  private clusterManager: ClusterManager;
  private containerManager: ContainerManager;
  private modelRegistry: ModelRegistry;
  private modelLoader: ModelLoader;
  private gpuDetector: GpuDetector;

  constructor(
    clusterManager: ClusterManager,
    containerManager: ContainerManager,
    modelRegistry: ModelRegistry,
    modelLoader: ModelLoader,
  ) {
    this.clusterManager = clusterManager;
    this.containerManager = containerManager;
    this.modelRegistry = modelRegistry;
    this.modelLoader = modelLoader;
    this.gpuDetector = new GpuDetector();
  }

  /**
   * Assembles a heartbeat describing this node.
   *
   * GPUs, container statuses and available models are queried in parallel.
   * Each available model becomes one deployment entry that uses the node
   * `endpoint`, the engine `'vllm'`, is healthy when any of its endpoints is
   * healthy, and carries the container id of its first endpoint.
   */
  public async buildLocalHeartbeat(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const [gpus, statuses, models] = await Promise.all([
      this.gpuDetector.detectGpus(),
      this.containerManager.getAllStatus(),
      this.containerManager.getAllAvailableModels(),
    ]);

    const deploymentCount = Array.from(statuses.values()).filter((status) => status.running).length;
    const runningContainers = this.containerManager.getAllContainers().filter((container) => {
      const status = statuses.get(container.getConfig().id);
      return status?.running === true;
    });
    const resources = await this.buildResourceSummary(
      gpus,
      deploymentCount,
      models,
      runningContainers,
    );

    return {
      nodeName: this.clusterManager.getConfig().nodeName,
      role: this.clusterManager.getConfig().role,
      endpoint,
      healthy: true,
      resources,
      deployments: Array.from(models.entries()).map(([modelId, endpoints]) => ({
        modelId,
        engine: 'vllm' as const,
        endpoint,
        healthy: endpoints.some((entry) => entry.healthy),
        containerId: endpoints[0]?.containerId,
      })),
      lastSeenAt: Date.now(),
    };
  }

  /** Builds a fresh local heartbeat, stores it in the cluster manager and returns it. */
  public async syncLocalState(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const heartbeat = await this.buildLocalHeartbeat(endpoint);
    this.clusterManager.updateLocalNode(heartbeat);
    return heartbeat;
  }

  /**
   * Refreshes local state and posts the heartbeat to the control plane.
   *
   * Does nothing when clustering is disabled, when no advertised endpoint or
   * control-plane URL is known, or when this node is the control plane itself
   * (both URLs equal). Transport errors and non-2xx responses are logged as
   * warnings and never thrown; errors from building the heartbeat still
   * propagate to the caller.
   */
  public async sendHeartbeat(): Promise<void> {
    if (!this.clusterManager.isEnabled()) {
      return;
    }

    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    if (!endpoint || !controlPlaneUrl) {
      return;
    }

    if (controlPlaneUrl === endpoint) {
      return;
    }

    const heartbeat = await this.syncLocalState(endpoint);

    try {
      const response = await fetch(`${controlPlaneUrl}/_cluster/nodes/heartbeat`, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify(heartbeat),
      });

      // fetch() only rejects on transport errors; a rejected heartbeat (for
      // example a bad shared secret) arrives as a non-2xx response.
      if (!response.ok) {
        logger.warn(`Cluster heartbeat rejected by control plane: HTTP ${response.status}`);
      }
    } catch (error) {
      logger.warn(
        `Cluster heartbeat failed: ${this.describeError(error)}`,
      );
    }
  }

  /** Registers a heartbeat received from another node. */
  public acceptHeartbeat(heartbeat: IClusterNodeHeartbeat): void {
    this.clusterManager.upsertNode(heartbeat);
  }

  /** Returns the cluster status snapshot from the cluster manager. */
  public getStatus(): IClusterStatusResponse {
    return this.clusterManager.getStatus();
  }

  /** Returns the desired deployments from the cluster manager. */
  public getDesiredDeployments(): IClusterDesiredDeployment[] {
    return this.clusterManager.getDesiredDeployments();
  }

  /** Returns the configured name of this node. */
  public getLocalNodeName(): string {
    return this.clusterManager.getConfig().nodeName;
  }

  /** Returns the cluster shared secret, if one is configured. */
  public getSharedSecret(): string | undefined {
    return this.clusterManager.getSharedSecret();
  }

  /** Records the scheduler state of a node and returns it. */
  public setNodeSchedulerState(
    nodeName: string,
    schedulerState: TClusterNodeSchedulerState,
  ): TClusterNodeSchedulerState {
    return this.clusterManager.setNodeSchedulerState(nodeName, schedulerState);
  }

  /**
   * Sets the desired replica count of a model.
   *
   * Returns `null` when the model is unknown to the registry. A count of zero
   * or less removes the desired deployment and returns a synthetic record
   * with `desiredReplicas: 0`.
   */
  public async setDesiredReplicas(
    modelName: string,
    desiredReplicas: number,
  ): Promise<IClusterDesiredDeployment | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    if (desiredReplicas <= 0) {
      this.clusterManager.removeDesiredDeployment(model.id);
      return {
        modelId: model.id,
        desiredReplicas: 0,
        updatedAt: Date.now(),
      };
    }

    return this.clusterManager.upsertDesiredDeployment(model.id, desiredReplicas);
  }

  /** Removes the desired deployment of a model; `false` when the model or the record is unknown. */
  public async clearDesiredDeployment(modelName: string): Promise<boolean> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return false;
    }

    return this.clusterManager.removeDesiredDeployment(model.id);
  }

  /**
   * True when clustering is disabled, when this node is the control plane,
   * or when no control-plane URL is configured.
   */
  public shouldDeployLocallyFirst(): boolean {
    if (!this.clusterManager.isEnabled()) {
      return true;
    }

    return this.clusterManager.isControlPlane() || !this.clusterManager.getControlPlaneUrl();
  }

  /** True when clustering is disabled or this node is the control plane. */
  public canManageClusterState(): boolean {
    return !this.clusterManager.isEnabled() || this.clusterManager.isControlPlane();
  }

  /**
   * Looks up where a model is currently served.
   *
   * Returns `null` when the model is unknown or has no location; never
   * triggers a deployment (`created` is always `false`).
   */
  public async resolveModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    const location = this.clusterManager.resolveModel(model.id);
    if (!location) {
      return null;
    }

    return {
      model: model.id,
      location,
      created: false,
    };
  }

  /**
   * Makes sure a model is served somewhere and returns its location.
   *
   * Records the desired deployment first, then reuses an existing location on
   * an active node when there is one. Otherwise a node that is not an enabled
   * control plane deploys locally, while the control plane picks a target
   * node and deploys locally or remotely depending on the pick. Returns
   * `null` when the model is unknown, no node qualifies or deployment fails.
   */
  public async ensureModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }

    this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);

    const existing = this.clusterManager.getActiveModelLocations(model.id)[0];
    if (existing) {
      return {
        model: model.id,
        location: existing,
        created: false,
      };
    }

    if (!this.clusterManager.isEnabled() || !this.clusterManager.isControlPlane()) {
      return this.deployModelLocally(model.id);
    }

    const targetNode = this.clusterManager.pickNodeForModel(model);
    if (!targetNode) {
      return null;
    }

    if (targetNode.nodeName === this.clusterManager.getConfig().nodeName) {
      return this.deployModelLocally(model.id);
    }

    return this.requestRemoteDeployment(targetNode.endpoint, model.id);
  }

  /**
   * Asks the control plane to ensure a model.
   *
   * Falls back to {@link ensureModel} when no control-plane URL is configured
   * or when it equals this node's advertised endpoint. Returns `null` (after
   * logging a warning) on a non-2xx response or a request failure.
   */
  public async ensureModelViaControlPlane(
    modelName: string,
  ): Promise<IClusterEnsureResponse | null> {
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    const localEndpoint = this.clusterManager.getAdvertisedEndpoint();

    if (!controlPlaneUrl || controlPlaneUrl === localEndpoint) {
      return this.ensureModel(modelName);
    }

    try {
      const response = await fetch(`${controlPlaneUrl}/_cluster/models/ensure`, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify({ model: modelName }),
      });

      if (!response.ok) {
        logger.warn(
          `Control plane could not ensure model ${modelName}: HTTP ${response.status}`,
        );
        return null;
      }

      return await response.json() as IClusterEnsureResponse;
    } catch (error) {
      logger.warn(
        `Control plane ensure request for model ${modelName} failed: ${this.describeError(error)}`,
      );
      return null;
    }
  }

  /**
   * Loads a model on this node and returns its resolved location.
   *
   * The desired deployment is recorded before loading when the model is known
   * to the registry. After a successful load the local heartbeat is refreshed
   * (when an advertised endpoint is known) so the new deployment can be
   * resolved. `created` is `false` when the loader reports the model was
   * already loaded.
   */
  public async deployModelLocally(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
    }

    const result = await this.modelLoader.loadModel(modelName);
    if (!result.success) {
      return null;
    }

    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    if (endpoint) {
      await this.syncLocalState(endpoint);
    }

    const resolved = await this.resolveModel(result.model);
    if (!resolved) {
      return null;
    }

    return {
      ...resolved,
      created: !result.alreadyLoaded,
    };
  }

  /**
   * Deploys missing replicas for every desired deployment.
   *
   * Runs only when clustering is disabled or this node is the control plane.
   * Existing replicas are counted as active model locations; for each missing
   * one a node is picked and the replica is deployed locally or remotely.
   * Work on a model stops at the first failed pick or deployment.
   *
   * NOTE(review): `pickNodeForModel` is called without exclusions, so a remote
   * node may be picked repeatedly within one pass before its next heartbeat
   * updates its reported resources — confirm this is intended.
   */
  public async reconcileDesiredReplicas(): Promise<void> {
    if (this.clusterManager.isEnabled() && !this.clusterManager.isControlPlane()) {
      return;
    }

    const desiredDeployments = this.clusterManager.getDesiredDeployments();
    for (const desiredDeployment of desiredDeployments) {
      if (desiredDeployment.desiredReplicas <= 0) {
        continue;
      }

      const model = await this.modelRegistry.getModel(desiredDeployment.modelId);
      if (!model) {
        continue;
      }

      const existingLocations = this.clusterManager.getActiveModelLocations(model.id);
      const missingReplicas = desiredDeployment.desiredReplicas - existingLocations.length;
      if (missingReplicas <= 0) {
        continue;
      }

      for (let index = 0; index < missingReplicas; index++) {
        const targetNode = this.clusterManager.pickNodeForModel(model);
        if (!targetNode) {
          break;
        }

        const replicaOrdinal = existingLocations.length + index;
        const result = targetNode.nodeName === this.clusterManager.getConfig().nodeName
          ? await this.deployReplicaLocally(model.id, replicaOrdinal)
          : await this.requestRemoteDeployment(targetNode.endpoint, model.id, replicaOrdinal);

        if (!result) {
          break;
        }
      }
    }
  }

  /**
   * Deploys one replica of a model on this node.
   *
   * Mirrors {@link deployModelLocally} but goes through
   * `modelLoader.deployReplica`, and raises the remembered replica count to at
   * least `replicaOrdinal + 1`.
   */
  public async deployReplicaLocally(
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(
        model.id,
        Math.max((replicaOrdinal ?? 0) + 1, model.launchDefaults?.replicas || 1),
      );
    }

    const result = await this.modelLoader.deployReplica(modelName, replicaOrdinal);
    if (!result.success) {
      return null;
    }

    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    if (endpoint) {
      await this.syncLocalState(endpoint);
    }

    const resolved = await this.resolveModel(result.model);
    if (!resolved) {
      return null;
    }

    return {
      ...resolved,
      created: !result.alreadyLoaded,
    };
  }

  /**
   * Asks another node to deploy a model via `POST /_cluster/deployments`.
   * Returns `null` (after logging a warning) on a non-2xx response or a
   * request failure.
   */
  private async requestRemoteDeployment(
    nodeEndpoint: string,
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    try {
      const response = await fetch(`${nodeEndpoint}/_cluster/deployments`, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify({ model: modelName, replicaOrdinal }),
      });

      if (!response.ok) {
        logger.warn(
          `Remote deployment of ${modelName} on ${nodeEndpoint} failed: HTTP ${response.status}`,
        );
        return null;
      }

      return await response.json() as IClusterEnsureResponse;
    } catch (error) {
      logger.warn(
        `Remote deployment of ${modelName} on ${nodeEndpoint} failed: ${this.describeError(error)}`,
      );
      return null;
    }
  }

  /**
   * Summarizes GPU resources for the heartbeat.
   *
   * `gpu.vram` is divided by 1024 and rounded to obtain the `*Gb` figures.
   * GPUs whose ids are assigned to running containers count as used;
   * availability and topology groups are computed from the rest. `_models` is
   * not used.
   */
  private async buildResourceSummary(
    gpus: Awaited<ReturnType<GpuDetector['detectGpus']>>,
    deploymentCount: number,
    _models: Awaited<ReturnType<ContainerManager['getAllAvailableModels']>>,
    runningContainers: ReturnType<ContainerManager['getAllContainers']>,
  ): Promise<IClusterNodeResources> {
    const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
    const usedGpuIds = runningContainers.flatMap((container) => container.getConfig().gpuIds);
    const availableGpus = filterOutUsedGpus(gpus, usedGpuIds);
    const topologyGroups = summarizeGpuTopologyGroups(availableGpus);
    const availableVramGb = Math.round(
      availableGpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024,
    );

    const maxSingleGpuVramGb = availableGpus.length > 0
      ? Math.max(...availableGpus.map((gpu) => Math.round(gpu.vram / 1024)))
      : 0;
    const largestGpuGroupCount = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.gpuCount))
      : 0;
    const largestGpuGroupVramGb = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.totalVramGb))
      : 0;

    return {
      gpuCount: gpus.length,
      totalVramGb,
      availableVramGb,
      maxSingleGpuVramGb,
      largestGpuGroupCount,
      largestGpuGroupVramGb,
      deploymentCount,
      topologyGroups,
    };
  }

  /** JSON content-type header plus the shared-secret auth header when a secret is configured. */
  private buildClusterHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    const sharedSecret = this.clusterManager.getSharedSecret();
    if (sharedSecret) {
      headers[CLUSTER.AUTH_HEADER_NAME] = sharedSecret;
    }

    return headers;
  }

  /**
   * Raises (never lowers) the desired replica count of a model to at least
   * `minimumReplicas`, with a floor of one.
   */
  private rememberDesiredDeployment(modelId: string, minimumReplicas: number): void {
    const existing = this.clusterManager.getDesiredDeployment(modelId);
    const desiredReplicas = Math.max(existing?.desiredReplicas || 0, minimumReplicas, 1);
    this.clusterManager.upsertDesiredDeployment(modelId, desiredReplicas);
  }

  /** Renders a caught value for log output. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
|
||||
@@ -0,0 +1,2 @@
|
||||
export { ClusterManager } from './cluster-manager.ts';
|
||||
export { ClusterCoordinator } from './coordinator.ts';
|
||||
@@ -0,0 +1,114 @@
|
||||
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
|
||||
import type { IGpuInfo, TGpuVendor } from '../interfaces/gpu.ts';
|
||||
import type { IClusterGpuTopologyGroup } from '../interfaces/cluster.ts';
|
||||
|
||||
/**
 * Extracts the PCI bus number of a GPU.
 *
 * Reads `gpu.pciBusId`, falling back to `gpu.pciSlot`, and parses the
 * two-hex-digit bus component that precedes a colon (an optional
 * four-hex-digit domain prefix is skipped). When neither field holds a string
 * or the value does not match, `gpu.index` is returned instead.
 */
function parsePciBusNumber(gpu: IGpuInfo): number {
  const source = gpu.pciBusId || gpu.pciSlot;
  // Neither identifier present: fall back to the index instead of throwing
  // on `.match` of a non-string.
  if (typeof source !== 'string') {
    return gpu.index;
  }

  const match = source.match(/(?:[0-9a-f]{4}:)?([0-9a-f]{2}):/i);
  if (!match) {
    return gpu.index;
  }

  return parseInt(match[1], 16);
}
|
||||
|
||||
/**
 * Partitions GPUs into topology groups.
 *
 * GPUs are ordered by vendor, then by PCI bus number. Walking that order, a
 * GPU joins the current group when it has the same vendor and its bus number
 * is at most one above the last bus number already in the group; otherwise it
 * opens a new group. Group ids are `<vendor>-<n>`, where `n` is the 1-based
 * position of the group in the result. VRAM figures are `gpu.vram / 1024`,
 * rounded per GPU.
 */
export function buildGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
  const ordered = gpus
    .map((gpu) => ({ gpu, bus: parsePciBusNumber(gpu) }))
    .sort((a, b) =>
      a.gpu.vendor !== b.gpu.vendor
        ? a.gpu.vendor.localeCompare(b.gpu.vendor)
        : a.bus - b.bus
    );

  const result: IClusterGpuTopologyGroup[] = [];

  ordered.forEach(({ gpu, bus }) => {
    const vramGb = Math.round(gpu.vram / 1024);
    const tail = result[result.length - 1];
    const tailBus = tail?.busNumbers[tail.busNumbers.length - 1];

    const startsNewGroup = !tail ||
      tail.vendor !== gpu.vendor ||
      tailBus === undefined ||
      !(bus - tailBus <= 1);

    if (startsNewGroup) {
      result.push({
        id: `${gpu.vendor}-${result.length + 1}`,
        vendor: gpu.vendor,
        gpuIds: [gpu.id],
        gpuCount: 1,
        totalVramGb: vramGb,
        maxSingleGpuVramGb: vramGb,
        busNumbers: [bus],
      });
      return;
    }

    tail.gpuIds.push(gpu.id);
    tail.busNumbers.push(bus);
    tail.totalVramGb += vramGb;
    tail.maxSingleGpuVramGb = Math.max(tail.maxSingleGpuVramGb, vramGb);
    // Keep the count in step with the member list.
    tail.gpuCount = tail.gpuIds.length;
  });

  return result;
}
|
||||
|
||||
/**
 * Returns the topology groups for the given GPUs.
 *
 * Currently a direct pass-through to `buildGpuTopologyGroups`.
 */
export function summarizeGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
  const topologyGroups = buildGpuTopologyGroups(gpus);
  return topologyGroups;
}
|
||||
|
||||
/**
 * Chooses the GPUs on which to launch a model.
 *
 * A topology group is eligible when it has at least `minGpuCount` GPUs
 * (default 1) and at least `minVramGb` of total VRAM. Eligible groups are
 * ranked by GPU count closest to the preferred tensor-parallel size
 * (`launchDefaults.tensorParallelSize`, defaulting to `minGpuCount`), then by
 * least surplus VRAM, then by group id.
 *
 * From the winning group the first GPUs are taken. The count starts at the
 * preferred tensor-parallel size, is raised to `minGpuCount` when that is
 * larger, and then grows until the selected GPUs together provide
 * `minVramGb`, never exceeding the group size. This keeps the returned
 * placement consistent with the requirements the group was admitted on.
 *
 * @returns the selected GPU ids, the matching tensor-parallel size and the
 *   id of the chosen group, or `null` when no group is eligible.
 */
export function selectPlacementForModel(
  model: IModelCatalogEntry,
  gpus: IGpuInfo[],
): { gpuIds: string[]; tensorParallelSize: number; topologyGroupId: string } | null {
  const requiredVramGb = model.requirements.minVramGb;
  const minGpuCount = model.requirements.minGpuCount || 1;
  const preferredTensorParallel = model.launchDefaults?.tensorParallelSize || minGpuCount;
  const topologyGroups = buildGpuTopologyGroups(gpus);

  const eligibleGroups = topologyGroups.filter((group) =>
    group.gpuCount >= minGpuCount && group.totalVramGb >= requiredVramGb
  );

  if (eligibleGroups.length === 0) {
    return null;
  }

  eligibleGroups.sort((left, right) => {
    const leftCountDelta = Math.abs(left.gpuCount - preferredTensorParallel);
    const rightCountDelta = Math.abs(right.gpuCount - preferredTensorParallel);
    if (leftCountDelta !== rightCountDelta) {
      return leftCountDelta - rightCountDelta;
    }

    const leftVramDelta = left.totalVramGb - requiredVramGb;
    const rightVramDelta = right.totalVramGb - requiredVramGb;
    if (leftVramDelta !== rightVramDelta) {
      return leftVramDelta - rightVramDelta;
    }

    return left.id.localeCompare(right.id);
  });

  const selectedGroup = eligibleGroups[0];

  // Per-GPU VRAM, rounded the same way the group totals are built.
  const vramGbByGpuId = new Map<string, number>();
  for (const gpu of gpus) {
    vramGbByGpuId.set(gpu.id, Math.round(gpu.vram / 1024));
  }
  const vramOfFirstGpus = (count: number): number =>
    selectedGroup.gpuIds
      .slice(0, count)
      .reduce((sum, gpuId) => sum + (vramGbByGpuId.get(gpuId) ?? 0), 0);

  let tensorParallelSize = Math.min(
    Math.max(preferredTensorParallel, minGpuCount),
    selectedGroup.gpuCount,
  );
  // The group as a whole satisfies the VRAM requirement, but a prefix of it
  // may not: widen the selection until it does.
  while (
    tensorParallelSize < selectedGroup.gpuCount &&
    vramOfFirstGpus(tensorParallelSize) < requiredVramGb
  ) {
    tensorParallelSize += 1;
  }

  return {
    gpuIds: selectedGroup.gpuIds.slice(0, tensorParallelSize),
    tensorParallelSize,
    topologyGroupId: selectedGroup.id,
  };
}
|
||||
|
||||
/**
 * Returns the GPUs whose `id` does not appear in `usedGpuIds`, preserving the
 * input order. The input array is not modified.
 */
export function filterOutUsedGpus(gpus: IGpuInfo[], usedGpuIds: string[]): IGpuInfo[] {
  const taken = new Set(usedGpuIds);
  const free: IGpuInfo[] = [];

  for (const gpu of gpus) {
    if (taken.has(gpu.id)) {
      continue;
    }
    free.push(gpu);
  }

  return free;
}
|
||||
Reference in New Issue
Block a user