feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+456
View File
@@ -0,0 +1,456 @@
import os from 'node:os';
import * as fs from 'node:fs/promises';
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
import type {
IClusterConfig,
IClusterDesiredDeployment,
IClusterGpuTopologyGroup,
IClusterModelLocation,
IClusterNodeHeartbeat,
IClusterNodeStatus,
IClusterStatusResponse,
TClusterNodeSchedulerState,
} from '../interfaces/cluster.ts';
import { CLUSTER, PATHS } from '../constants.ts';
/**
 * Holds the cluster view of a single process: the configured identity of the
 * local node, the most recent heartbeat of every known node, the desired
 * deployments and the per-node scheduler states.
 *
 * Node heartbeats are mirrored to `cluster-state.json` and the control data
 * (desired deployments + scheduler states) to `cluster-control-state.json`,
 * both below `PATHS.DATA_DIR`. Persistence is best effort: a failed write is
 * ignored so it can never break request handling.
 */
export class ClusterManager {
  private config: IClusterConfig = {
    enabled: false,
    nodeName: os.hostname(),
    role: 'standalone',
    bindHost: CLUSTER.DEFAULT_BIND_HOST,
    gossipPort: CLUSTER.DEFAULT_GOSSIP_PORT,
    heartbeatIntervalMs: CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS,
    seedNodes: [],
  };
  private localNode: IClusterNodeHeartbeat | null = null;
  private knownNodes = new Map<string, IClusterNodeHeartbeat>();
  private desiredDeployments = new Map<string, IClusterDesiredDeployment>();
  private nodeSchedulerStates = new Map<string, TClusterNodeSchedulerState>();
  private persistQueued = false;
  private controlPersistQueued = false;
  // Tail of the pending state-file writes. Every write is appended here so
  // that two writes never run at the same time (overlapping writeFile calls
  // on one path are unsafe and can leave a corrupted JSON file behind).
  private writeChain: Promise<void> = Promise.resolve();

  /**
   * Loads persisted node and control state from disk, if present.
   * A missing or unreadable file leaves the corresponding in-memory state
   * untouched; nothing is thrown.
   */
  public async initialize(): Promise<void> {
    try {
      const stateContent = await fs.readFile(this.getStateFilePath(), 'utf-8');
      const data = JSON.parse(stateContent) as { nodes?: IClusterNodeHeartbeat[] };
      for (const node of data.nodes || []) {
        this.knownNodes.set(node.nodeName, node);
        if (node.nodeName === this.config.nodeName) {
          this.localNode = node;
        }
      }
      this.pruneStaleNodes();
    } catch {
      // No persisted cluster state yet.
    }

    try {
      const controlStateContent = await fs.readFile(this.getControlStateFilePath(), 'utf-8');
      const data = JSON.parse(controlStateContent) as {
        desiredDeployments?: IClusterDesiredDeployment[];
        nodeSchedulerStates?: Record<string, TClusterNodeSchedulerState>;
      };
      for (const deployment of data.desiredDeployments || []) {
        this.desiredDeployments.set(deployment.modelId, deployment);
      }
      for (const [nodeName, schedulerState] of Object.entries(data.nodeSchedulerStates || {})) {
        this.nodeSchedulerStates.set(nodeName, schedulerState);
      }
    } catch {
      // No persisted control state yet.
    }
  }

  /** Replaces the active configuration, filling in heartbeat interval and seed node defaults. */
  public configure(config: IClusterConfig): void {
    this.config = {
      ...config,
      heartbeatIntervalMs: config.heartbeatIntervalMs || CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS,
      seedNodes: config.seedNodes || [],
    };
  }

  /** Returns the active configuration object. */
  public getConfig(): IClusterConfig {
    return this.config;
  }

  /** True when cluster mode is switched on. */
  public isEnabled(): boolean {
    return this.config.enabled;
  }

  /** True when cluster mode is on and this node has the `control-plane` role. */
  public isControlPlane(): boolean {
    return this.config.enabled && this.config.role === 'control-plane';
  }

  /** True when cluster mode is on and this node has the `worker` role. */
  public isWorker(): boolean {
    return this.config.enabled && this.config.role === 'worker';
  }

  /** Returns `'standalone'` while cluster mode is off, otherwise the configured role. */
  public getModeLabel(): string {
    if (!this.config.enabled) {
      return 'standalone';
    }
    return this.config.role;
  }

  /** Returns the configured heartbeat interval, or the default when it is unset or 0. */
  public getHeartbeatIntervalMs(): number {
    return this.config.heartbeatIntervalMs || CLUSTER.DEFAULT_HEARTBEAT_INTERVAL_MS;
  }

  /** Endpoint of the last local heartbeat, falling back to the configured advertise URL. */
  public getAdvertisedEndpoint(): string | undefined {
    return this.localNode?.endpoint || this.config.advertiseUrl;
  }

  /** Returns the configured control plane URL, if any. */
  public getControlPlaneUrl(): string | undefined {
    return this.config.controlPlaneUrl;
  }

  /** Returns the configured shared secret; an empty secret is reported as `undefined`. */
  public getSharedSecret(): string | undefined {
    return this.config.sharedSecret || undefined;
  }

  /** Records the heartbeat of the local node and queues a state write. */
  public updateLocalNode(heartbeat: IClusterNodeHeartbeat): void {
    this.localNode = heartbeat;
    this.knownNodes.set(heartbeat.nodeName, heartbeat);
    this.schedulePersist();
  }

  /** Records (or replaces) the heartbeat of a node and queues a state write. */
  public upsertNode(heartbeat: IClusterNodeHeartbeat): void {
    this.knownNodes.set(heartbeat.nodeName, heartbeat);
    this.schedulePersist();
  }

  /** Builds a status record for the local node from the configuration; always reported healthy. */
  public getLocalNodeStatus(): IClusterNodeStatus {
    return {
      nodeName: this.config.nodeName,
      role: this.config.role,
      endpoint: this.getAdvertisedEndpoint(),
      healthy: true,
      schedulerState: this.getNodeSchedulerState(this.config.nodeName),
    };
  }

  /** Returns the last heartbeat recorded for the local node, or `null`. */
  public getLocalNode(): IClusterNodeHeartbeat | null {
    return this.localNode;
  }

  /** Returns the heartbeat of `nodeName` with its scheduler state attached, or `null` if unknown. */
  public getNode(nodeName: string): IClusterNodeHeartbeat | null {
    const node = this.knownNodes.get(nodeName);
    if (!node) {
      return null;
    }
    return this.decorateNode(node);
  }

  /**
   * Drops every remote node whose last heartbeat is older than
   * `CLUSTER.NODE_STALE_AFTER_MS`. The local node is never pruned.
   */
  public pruneStaleNodes(): void {
    const now = Date.now();
    for (const [nodeName, node] of this.knownNodes) {
      if (nodeName === this.config.nodeName) {
        continue;
      }
      if (now - node.lastSeenAt > CLUSTER.NODE_STALE_AFTER_MS) {
        this.knownNodes.delete(nodeName);
        this.schedulePersist();
      }
    }
  }

  /**
   * Returns all non-stale nodes with their scheduler state attached.
   * The local node comes first, the rest is ordered by node name.
   */
  public getAllNodes(): IClusterNodeHeartbeat[] {
    this.pruneStaleNodes();
    return Array.from(this.knownNodes.values()).map((node) => this.decorateNode(node)).sort(
      (left, right) => {
        if (left.nodeName === this.config.nodeName) {
          return -1;
        }
        if (right.nodeName === this.config.nodeName) {
          return 1;
        }
        return left.nodeName.localeCompare(right.nodeName);
      },
    );
  }

  /** Returns the nodes from {@link getAllNodes} whose heartbeat reports `healthy`. */
  public getHealthyNodes(): IClusterNodeHeartbeat[] {
    return this.getAllNodes().filter((node) => node.healthy);
  }

  /** Returns the scheduler state stored for `nodeName`; nodes without an entry are `'active'`. */
  public getNodeSchedulerState(nodeName: string): TClusterNodeSchedulerState {
    return this.nodeSchedulerStates.get(nodeName) || 'active';
  }

  /** Stores the scheduler state of a node, queues a control-state write and returns the state. */
  public setNodeSchedulerState(
    nodeName: string,
    schedulerState: TClusterNodeSchedulerState,
  ): TClusterNodeSchedulerState {
    this.nodeSchedulerStates.set(nodeName, schedulerState);
    this.scheduleControlPersist();
    return schedulerState;
  }

  /** Returns all desired deployments ordered by model id. */
  public getDesiredDeployments(): IClusterDesiredDeployment[] {
    return Array.from(this.desiredDeployments.values()).sort((left, right) =>
      left.modelId.localeCompare(right.modelId)
    );
  }

  /** Returns the desired deployment of `modelId`, or `null` when none is recorded. */
  public getDesiredDeployment(modelId: string): IClusterDesiredDeployment | null {
    return this.desiredDeployments.get(modelId) || null;
  }

  /** Creates or replaces the desired deployment of `modelId` and queues a control-state write. */
  public upsertDesiredDeployment(
    modelId: string,
    desiredReplicas: number,
  ): IClusterDesiredDeployment {
    const deployment: IClusterDesiredDeployment = {
      modelId,
      desiredReplicas,
      updatedAt: Date.now(),
    };
    this.desiredDeployments.set(modelId, deployment);
    this.scheduleControlPersist();
    return deployment;
  }

  /** Removes the desired deployment of `modelId`; returns whether an entry existed. */
  public removeDesiredDeployment(modelId: string): boolean {
    const removed = this.desiredDeployments.delete(modelId);
    if (removed) {
      this.scheduleControlPersist();
    }
    return removed;
  }

  /** Returns one location per healthy node that reports a healthy deployment of `modelId`. */
  public getModelLocations(modelId: string): IClusterModelLocation[] {
    const locations: IClusterModelLocation[] = [];
    for (const node of this.getHealthyNodes()) {
      for (const deployment of node.deployments) {
        if (deployment.modelId !== modelId || !deployment.healthy) {
          continue;
        }
        locations.push(this.toModelLocation(node, deployment));
      }
    }
    return locations;
  }

  /** Like {@link getModelLocations}, restricted to nodes whose scheduler state is `'active'`. */
  public getActiveModelLocations(modelId: string): IClusterModelLocation[] {
    return this.getModelLocations(modelId).filter((location) =>
      this.getNodeSchedulerState(location.nodeName) === 'active'
    );
  }

  /**
   * Picks the location that should serve `modelId`, or `null` when no healthy
   * deployment exists. Preference order: scheduler state (active, cordoned,
   * draining, anything else), then the local node, then node name.
   */
  public resolveModel(modelId: string): IClusterModelLocation | null {
    const locations = this.getModelLocations(modelId);
    if (locations.length === 0) {
      return null;
    }
    locations.sort((left, right) => {
      const schedulerPreference = this.compareSchedulerState(
        this.getNodeSchedulerState(left.nodeName),
        this.getNodeSchedulerState(right.nodeName),
      );
      if (schedulerPreference !== 0) {
        return schedulerPreference;
      }
      if (left.nodeName === this.config.nodeName) {
        return -1;
      }
      if (right.nodeName === this.config.nodeName) {
        return 1;
      }
      return left.nodeName.localeCompare(right.nodeName);
    });
    return locations[0];
  }

  /**
   * Chooses the node a new deployment of `model` should be placed on.
   *
   * A node qualifies when it is healthy, not listed in `excludedNodeNames`,
   * schedulable (`'active'`), not a foreign standalone node, and has both
   * enough free VRAM and one topology group that satisfies the model's GPU
   * count and VRAM requirement. Among the candidates the local node wins,
   * then the most free VRAM, then the GPU group size closest to the preferred
   * tensor parallel size, then the fewest running deployments.
   *
   * @returns the chosen node, or `null` when no node qualifies.
   */
  public pickNodeForModel(
    model: IModelCatalogEntry,
    excludedNodeNames: string[] = [],
  ): IClusterNodeHeartbeat | null {
    const requiredVram = model.requirements.minVramGb;
    const minGpuCount = model.requirements.minGpuCount || 1;
    const preferredTensorParallel = model.launchDefaults?.tensorParallelSize || minGpuCount;
    const eligible = this.getHealthyNodes().filter((node) => {
      if (excludedNodeNames.includes(node.nodeName)) {
        return false;
      }
      if (node.role === 'standalone' && node.nodeName !== this.config.nodeName) {
        return false;
      }
      if (node.schedulerState && node.schedulerState !== 'active') {
        return false;
      }
      return node.resources.availableVramGb >= requiredVram &&
        this.hasEligibleTopologyGroup(node.resources.topologyGroups, requiredVram, minGpuCount);
    });
    if (eligible.length === 0) {
      return null;
    }
    eligible.sort((left, right) => {
      if (left.nodeName === this.config.nodeName) {
        return -1;
      }
      if (right.nodeName === this.config.nodeName) {
        return 1;
      }
      if (right.resources.availableVramGb !== left.resources.availableVramGb) {
        return right.resources.availableVramGb - left.resources.availableVramGb;
      }
      const leftTopologyDelta = Math.abs(
        left.resources.largestGpuGroupCount - preferredTensorParallel,
      );
      const rightTopologyDelta = Math.abs(
        right.resources.largestGpuGroupCount - preferredTensorParallel,
      );
      if (leftTopologyDelta !== rightTopologyDelta) {
        return leftTopologyDelta - rightTopologyDelta;
      }
      return left.resources.deploymentCount - right.resources.deploymentCount;
    });
    return eligible[0];
  }

  /**
   * Returns a snapshot of the cluster: the local node, all known nodes, every
   * deployment reported by a healthy node grouped by model id, and the
   * desired deployments.
   */
  public getStatus(): IClusterStatusResponse {
    // Prune, decorate and sort the node list once and reuse it below.
    const nodes = this.getAllNodes();
    const models: Record<string, IClusterModelLocation[]> = {};
    for (const node of nodes) {
      if (!node.healthy) {
        continue;
      }
      for (const deployment of node.deployments) {
        if (!models[deployment.modelId]) {
          models[deployment.modelId] = [];
        }
        models[deployment.modelId].push(this.toModelLocation(node, deployment));
      }
    }
    return {
      localNode: this.localNode ? this.decorateNode(this.localNode) : null,
      nodes,
      models,
      desiredDeployments: this.getDesiredDeployments(),
    };
  }

  /** Converts one deployment entry of a node heartbeat into a model location. */
  private toModelLocation(
    node: IClusterNodeHeartbeat,
    deployment: IClusterNodeHeartbeat['deployments'][number],
  ): IClusterModelLocation {
    return {
      modelId: deployment.modelId,
      nodeName: node.nodeName,
      endpoint: deployment.endpoint,
      healthy: deployment.healthy,
      engine: deployment.engine,
      containerId: deployment.containerId,
    };
  }

  /** True when at least one group has enough GPUs and enough total VRAM. */
  private hasEligibleTopologyGroup(
    groups: IClusterGpuTopologyGroup[],
    requiredVramGb: number,
    minGpuCount: number,
  ): boolean {
    return groups.some((group) =>
      group.gpuCount >= minGpuCount && group.totalVramGb >= requiredVramGb
    );
  }

  private getStateFilePath(): string {
    return `${PATHS.DATA_DIR}/cluster-state.json`;
  }

  private getControlStateFilePath(): string {
    return `${PATHS.DATA_DIR}/cluster-control-state.json`;
  }

  /** Queues one node-state write per microtask turn, however many updates happen in it. */
  private schedulePersist(): void {
    if (this.persistQueued) {
      return;
    }
    this.persistQueued = true;
    queueMicrotask(() => {
      this.persistQueued = false;
      void this.persistState();
    });
  }

  /** Queues one control-state write per microtask turn, however many updates happen in it. */
  private scheduleControlPersist(): void {
    if (this.controlPersistQueued) {
      return;
    }
    this.controlPersistQueued = true;
    queueMicrotask(() => {
      this.controlPersistQueued = false;
      void this.persistControlState();
    });
  }

  /** Writes the known node heartbeats to the node state file. */
  private persistState(): Promise<void> {
    return this.enqueueWrite(
      this.getStateFilePath(),
      () => JSON.stringify({ nodes: Array.from(this.knownNodes.values()) }, null, 2),
    );
  }

  /** Writes desired deployments and scheduler states to the control state file. */
  private persistControlState(): Promise<void> {
    return this.enqueueWrite(
      this.getControlStateFilePath(),
      () =>
        JSON.stringify(
          {
            desiredDeployments: this.getDesiredDeployments(),
            nodeSchedulerStates: Object.fromEntries(this.nodeSchedulerStates.entries()),
          },
          null,
          2,
        ),
    );
  }

  /**
   * Appends a file write to the write chain.
   *
   * `serialize` runs only when the write actually starts, so a write that had
   * to wait behind an earlier one still stores the newest state. The returned
   * promise never rejects.
   */
  private enqueueWrite(filePath: string, serialize: () => string): Promise<void> {
    const write = async (): Promise<void> => {
      try {
        await fs.mkdir(PATHS.DATA_DIR, { recursive: true });
        await fs.writeFile(filePath, serialize());
      } catch {
        // Persistence failure should not break the control plane.
      }
    };
    this.writeChain = this.writeChain.then(write);
    return this.writeChain;
  }

  /** Returns a copy of the heartbeat with the locally stored scheduler state attached. */
  private decorateNode(node: IClusterNodeHeartbeat): IClusterNodeHeartbeat {
    return {
      ...node,
      schedulerState: this.getNodeSchedulerState(node.nodeName),
    };
  }

  /**
   * Orders scheduler states as active < cordoned < draining. A state outside
   * that list (for example from a hand-edited state file) sorts last instead
   * of ahead of `'active'`.
   */
  private compareSchedulerState(
    left: TClusterNodeSchedulerState,
    right: TClusterNodeSchedulerState,
  ): number {
    const order: TClusterNodeSchedulerState[] = ['active', 'cordoned', 'draining'];
    const rank = (state: TClusterNodeSchedulerState): number => {
      const position = order.indexOf(state);
      return position === -1 ? order.length : position;
    };
    return rank(left) - rank(right);
  }
}
+438
View File
@@ -0,0 +1,438 @@
import type {
IClusterDesiredDeployment,
IClusterEnsureResponse,
IClusterNodeHeartbeat,
IClusterNodeResources,
IClusterStatusResponse,
TClusterNodeSchedulerState,
} from '../interfaces/cluster.ts';
import { ContainerManager } from '../containers/container-manager.ts';
import { GpuDetector } from '../hardware/gpu-detector.ts';
import { logger } from '../logger.ts';
import { ModelRegistry } from '../models/registry.ts';
import { ModelLoader } from '../models/loader.ts';
import { CLUSTER } from '../constants.ts';
import { filterOutUsedGpus, summarizeGpuTopologyGroups } from './placement.ts';
import { ClusterManager } from './cluster-manager.ts';
/**
 * Connects the cluster view held by `ClusterManager` with the local
 * container, registry and loader services: it builds and sends heartbeats,
 * deploys models locally or asks other nodes to do so, and reconciles the
 * desired replica counts.
 */
export class ClusterCoordinator {
  private clusterManager: ClusterManager;
  private containerManager: ContainerManager;
  private modelRegistry: ModelRegistry;
  private modelLoader: ModelLoader;
  private gpuDetector: GpuDetector;

  constructor(
    clusterManager: ClusterManager,
    containerManager: ContainerManager,
    modelRegistry: ModelRegistry,
    modelLoader: ModelLoader,
  ) {
    this.clusterManager = clusterManager;
    this.containerManager = containerManager;
    this.modelRegistry = modelRegistry;
    this.modelLoader = modelLoader;
    this.gpuDetector = new GpuDetector();
  }

  /**
   * Collects GPUs, container statuses and available models and turns them
   * into a heartbeat for the local node.
   *
   * @param endpoint URL under which this node is reachable; it is also used
   *   as the endpoint of every reported deployment.
   */
  public async buildLocalHeartbeat(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const [gpus, statuses, models] = await Promise.all([
      this.gpuDetector.detectGpus(),
      this.containerManager.getAllStatus(),
      this.containerManager.getAllAvailableModels(),
    ]);
    const deploymentCount = Array.from(statuses.values()).filter((status) => status.running).length;
    const runningContainers = this.containerManager.getAllContainers().filter((container) => {
      const status = statuses.get(container.getConfig().id);
      return status?.running === true;
    });
    const resources = await this.buildResourceSummary(
      gpus,
      deploymentCount,
      models,
      runningContainers,
    );
    return {
      nodeName: this.clusterManager.getConfig().nodeName,
      role: this.clusterManager.getConfig().role,
      endpoint,
      healthy: true,
      resources,
      deployments: Array.from(models.entries()).map(([modelId, endpoints]) => ({
        modelId,
        engine: 'vllm' as const,
        endpoint,
        // A model counts as healthy as soon as one of its endpoints is healthy.
        healthy: endpoints.some((entry) => entry.healthy),
        containerId: endpoints[0]?.containerId,
      })),
      lastSeenAt: Date.now(),
    };
  }

  /** Builds a fresh local heartbeat, stores it in the cluster manager and returns it. */
  public async syncLocalState(endpoint: string): Promise<IClusterNodeHeartbeat> {
    const heartbeat = await this.buildLocalHeartbeat(endpoint);
    this.clusterManager.updateLocalNode(heartbeat);
    return heartbeat;
  }

  /**
   * Refreshes the local state and posts it to the control plane.
   *
   * Does nothing when cluster mode is off, when the advertised endpoint or
   * the control plane URL is missing, or when this node is the control plane
   * itself. A transport error or a non-2xx answer is logged as a warning and
   * not thrown.
   */
  public async sendHeartbeat(): Promise<void> {
    if (!this.clusterManager.isEnabled()) {
      return;
    }
    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    if (!endpoint || !controlPlaneUrl) {
      return;
    }
    if (controlPlaneUrl === endpoint) {
      return;
    }
    const heartbeat = await this.syncLocalState(endpoint);
    try {
      const response = await fetch(`${controlPlaneUrl}/_cluster/nodes/heartbeat`, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify(heartbeat),
      });
      // fetch only rejects on transport errors; a rejected heartbeat (for
      // example a wrong shared secret) arrives here as a non-2xx response.
      if (!response.ok) {
        logger.warn(`Cluster heartbeat rejected by control plane: HTTP ${response.status}`);
      }
    } catch (error) {
      logger.warn(
        `Cluster heartbeat failed: ${error instanceof Error ? error.message : String(error)}`,
      );
    }
  }

  /** Stores a heartbeat received from another node. */
  public acceptHeartbeat(heartbeat: IClusterNodeHeartbeat): void {
    this.clusterManager.upsertNode(heartbeat);
  }

  /** Returns the cluster status snapshot of the cluster manager. */
  public getStatus(): IClusterStatusResponse {
    return this.clusterManager.getStatus();
  }

  /** Returns the desired deployments known to the cluster manager. */
  public getDesiredDeployments(): IClusterDesiredDeployment[] {
    return this.clusterManager.getDesiredDeployments();
  }

  /** Returns the configured name of the local node. */
  public getLocalNodeName(): string {
    return this.clusterManager.getConfig().nodeName;
  }

  /** Returns the configured shared secret, if any. */
  public getSharedSecret(): string | undefined {
    return this.clusterManager.getSharedSecret();
  }

  /** Sets the scheduler state of a node and returns the stored state. */
  public setNodeSchedulerState(
    nodeName: string,
    schedulerState: TClusterNodeSchedulerState,
  ): TClusterNodeSchedulerState {
    return this.clusterManager.setNodeSchedulerState(nodeName, schedulerState);
  }

  /**
   * Sets how many replicas of a model the cluster should run.
   *
   * The count is rounded down to a whole number: a fractional target can
   * never be met, so reconciliation would start another replica on every
   * pass. A resulting count of zero or less removes the desired deployment.
   *
   * @returns the stored deployment (with `desiredReplicas: 0` after a
   *   removal), or `null` when the model is not in the registry.
   */
  public async setDesiredReplicas(
    modelName: string,
    desiredReplicas: number,
  ): Promise<IClusterDesiredDeployment | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }
    const wholeReplicas = Math.floor(desiredReplicas);
    if (wholeReplicas <= 0) {
      this.clusterManager.removeDesiredDeployment(model.id);
      return {
        modelId: model.id,
        desiredReplicas: 0,
        updatedAt: Date.now(),
      };
    }
    return this.clusterManager.upsertDesiredDeployment(model.id, wholeReplicas);
  }

  /** Removes the desired deployment of a model; `false` when the model or the entry is unknown. */
  public async clearDesiredDeployment(modelName: string): Promise<boolean> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return false;
    }
    return this.clusterManager.removeDesiredDeployment(model.id);
  }

  /** True when cluster mode is off, this node is the control plane, or no control plane URL is set. */
  public shouldDeployLocallyFirst(): boolean {
    if (!this.clusterManager.isEnabled()) {
      return true;
    }
    return this.clusterManager.isControlPlane() || !this.clusterManager.getControlPlaneUrl();
  }

  /** True when cluster mode is off or this node is the control plane. */
  public canManageClusterState(): boolean {
    return !this.clusterManager.isEnabled() || this.clusterManager.isControlPlane();
  }

  /**
   * Looks up where a model is currently served.
   *
   * @returns the preferred location with `created: false`, or `null` when
   *   the model is unknown or not deployed anywhere.
   */
  public async resolveModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }
    const location = this.clusterManager.resolveModel(model.id);
    if (!location) {
      return null;
    }
    return {
      model: model.id,
      location,
      created: false,
    };
  }

  /**
   * Makes sure a model is served somewhere and returns its location.
   *
   * An existing deployment on an active node is reused. Otherwise the model
   * is deployed locally (cluster mode off, or this node is not the control
   * plane) or on the node chosen by the cluster manager.
   *
   * @returns `null` when the model is unknown, no node qualifies, or the
   *   deployment fails.
   */
  public async ensureModel(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (!model) {
      return null;
    }
    this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
    const existing = this.clusterManager.getActiveModelLocations(model.id)[0];
    if (existing) {
      return {
        model: model.id,
        location: existing,
        created: false,
      };
    }
    if (!this.clusterManager.isEnabled() || !this.clusterManager.isControlPlane()) {
      return this.deployModelLocally(model.id);
    }
    const targetNode = this.clusterManager.pickNodeForModel(model);
    if (!targetNode) {
      return null;
    }
    if (targetNode.nodeName === this.clusterManager.getConfig().nodeName) {
      return this.deployModelLocally(model.id);
    }
    return this.requestRemoteDeployment(targetNode.endpoint, model.id);
  }

  /**
   * Asks the control plane to ensure a model. Falls back to
   * {@link ensureModel} when no control plane URL is configured or when it
   * equals the local endpoint.
   *
   * @returns `null` when the request fails or is answered with a non-2xx status.
   */
  public async ensureModelViaControlPlane(
    modelName: string,
  ): Promise<IClusterEnsureResponse | null> {
    const controlPlaneUrl = this.clusterManager.getControlPlaneUrl();
    const localEndpoint = this.clusterManager.getAdvertisedEndpoint();
    if (!controlPlaneUrl || controlPlaneUrl === localEndpoint) {
      return this.ensureModel(modelName);
    }
    return this.postForEnsureResponse(`${controlPlaneUrl}/_cluster/models/ensure`, {
      model: modelName,
    });
  }

  /**
   * Loads a model on this node, refreshes the local heartbeat and returns
   * the resulting location. `created` is false when the loader reports the
   * model as already loaded.
   *
   * @returns `null` when loading fails or the model cannot be resolved afterwards.
   */
  public async deployModelLocally(modelName: string): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(model.id, model.launchDefaults?.replicas || 1);
    }
    const result = await this.modelLoader.loadModel(modelName);
    if (!result.success) {
      return null;
    }
    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    if (endpoint) {
      await this.syncLocalState(endpoint);
    }
    const resolved = await this.resolveModel(result.model);
    if (!resolved) {
      return null;
    }
    return {
      ...resolved,
      created: !result.alreadyLoaded,
    };
  }

  /**
   * Starts missing replicas for every desired deployment. Runs only when
   * cluster mode is off or this node is the control plane.
   *
   * When a deployment attempt on a node fails, that node is excluded and the
   * next eligible node is tried, so one broken node does not keep a model
   * below its replica count while other nodes have capacity. Work on a model
   * stops once no eligible node is left.
   */
  public async reconcileDesiredReplicas(): Promise<void> {
    if (this.clusterManager.isEnabled() && !this.clusterManager.isControlPlane()) {
      return;
    }
    const localNodeName = this.clusterManager.getConfig().nodeName;
    for (const desiredDeployment of this.clusterManager.getDesiredDeployments()) {
      if (desiredDeployment.desiredReplicas <= 0) {
        continue;
      }
      const model = await this.modelRegistry.getModel(desiredDeployment.modelId);
      if (!model) {
        continue;
      }
      const existingLocations = this.clusterManager.getActiveModelLocations(model.id);
      const missingReplicas = desiredDeployment.desiredReplicas - existingLocations.length;

      // Every iteration either places a replica or excludes one more node,
      // so the loop ends after at most (missing replicas + node count) rounds.
      const failedNodeNames: string[] = [];
      let placedReplicas = 0;
      while (placedReplicas < missingReplicas) {
        const targetNode = this.clusterManager.pickNodeForModel(model, failedNodeNames);
        if (!targetNode) {
          break;
        }
        const replicaOrdinal = existingLocations.length + placedReplicas;
        const result = targetNode.nodeName === localNodeName
          ? await this.deployReplicaLocally(model.id, replicaOrdinal)
          : await this.requestRemoteDeployment(targetNode.endpoint, model.id, replicaOrdinal);
        if (result) {
          placedReplicas += 1;
        } else {
          failedNodeNames.push(targetNode.nodeName);
        }
      }
    }
  }

  /**
   * Starts one replica of a model on this node, refreshes the local
   * heartbeat and returns the resulting location.
   *
   * @param replicaOrdinal zero-based replica index handed to the loader; the
   *   remembered replica count is raised to at least `replicaOrdinal + 1`.
   * @returns `null` when the loader fails or the model cannot be resolved afterwards.
   */
  public async deployReplicaLocally(
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    const model = await this.modelRegistry.getModel(modelName);
    if (model) {
      this.rememberDesiredDeployment(
        model.id,
        Math.max((replicaOrdinal ?? 0) + 1, model.launchDefaults?.replicas || 1),
      );
    }
    const result = await this.modelLoader.deployReplica(modelName, replicaOrdinal);
    if (!result.success) {
      return null;
    }
    const endpoint = this.clusterManager.getAdvertisedEndpoint();
    if (endpoint) {
      await this.syncLocalState(endpoint);
    }
    const resolved = await this.resolveModel(result.model);
    if (!resolved) {
      return null;
    }
    return {
      ...resolved,
      created: !result.alreadyLoaded,
    };
  }

  /** Asks another node to deploy a model (optionally a specific replica). */
  private async requestRemoteDeployment(
    nodeEndpoint: string,
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IClusterEnsureResponse | null> {
    return this.postForEnsureResponse(`${nodeEndpoint}/_cluster/deployments`, {
      model: modelName,
      replicaOrdinal,
    });
  }

  /**
   * POSTs a JSON payload with the cluster headers and parses the answer.
   * Returns `null` on a transport error, a non-2xx status or an unparsable body.
   */
  private async postForEnsureResponse(
    url: string,
    payload: Record<string, unknown>,
  ): Promise<IClusterEnsureResponse | null> {
    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: this.buildClusterHeaders(),
        body: JSON.stringify(payload),
      });
      if (!response.ok) {
        return null;
      }
      return await response.json() as IClusterEnsureResponse;
    } catch {
      return null;
    }
  }

  /**
   * Summarizes the GPU resources of this node. GPUs assigned to a running
   * container are treated as unavailable. `gpu.vram` is divided by 1024 to
   * obtain the reported GB values.
   */
  private async buildResourceSummary(
    gpus: Awaited<ReturnType<GpuDetector['detectGpus']>>,
    deploymentCount: number,
    _models: Awaited<ReturnType<ContainerManager['getAllAvailableModels']>>,
    runningContainers: ReturnType<ContainerManager['getAllContainers']>,
  ): Promise<IClusterNodeResources> {
    const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
    const usedGpuIds = runningContainers.flatMap((container) => container.getConfig().gpuIds);
    const availableGpus = filterOutUsedGpus(gpus, usedGpuIds);
    const topologyGroups = summarizeGpuTopologyGroups(availableGpus);
    const availableVramGb = Math.round(
      availableGpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024,
    );
    // Math.max() of an empty list is -Infinity, hence the explicit zero fallbacks.
    const maxSingleGpuVramGb = availableGpus.length > 0
      ? Math.max(...availableGpus.map((gpu) => Math.round(gpu.vram / 1024)))
      : 0;
    const largestGpuGroupCount = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.gpuCount))
      : 0;
    const largestGpuGroupVramGb = topologyGroups.length > 0
      ? Math.max(...topologyGroups.map((group) => group.totalVramGb))
      : 0;
    return {
      gpuCount: gpus.length,
      totalVramGb,
      availableVramGb,
      maxSingleGpuVramGb,
      largestGpuGroupCount,
      largestGpuGroupVramGb,
      deploymentCount,
      topologyGroups,
    };
  }

  /** Headers for node-to-node requests; the shared secret is added when one is configured. */
  private buildClusterHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };
    const sharedSecret = this.clusterManager.getSharedSecret();
    if (sharedSecret) {
      headers[CLUSTER.AUTH_HEADER_NAME] = sharedSecret;
    }
    return headers;
  }

  /** Raises the desired replica count of a model to at least `minimumReplicas` (and at least 1). */
  private rememberDesiredDeployment(modelId: string, minimumReplicas: number): void {
    const existing = this.clusterManager.getDesiredDeployment(modelId);
    const desiredReplicas = Math.max(existing?.desiredReplicas || 0, minimumReplicas, 1);
    this.clusterManager.upsertDesiredDeployment(modelId, desiredReplicas);
  }
}
+2
View File
@@ -0,0 +1,2 @@
export { ClusterManager } from './cluster-manager.ts';
export { ClusterCoordinator } from './coordinator.ts';
+114
View File
@@ -0,0 +1,114 @@
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
import type { IGpuInfo, TGpuVendor } from '../interfaces/gpu.ts';
import type { IClusterGpuTopologyGroup } from '../interfaces/cluster.ts';
/**
 * Extracts the PCI bus number of a GPU from its bus id (or slot), accepting
 * forms such as `01:00.0`, `0000:01:00.0` and `00000000:01:00.0`.
 *
 * @returns the bus number parsed from its hexadecimal notation, or
 *   `gpu.index` when the GPU carries no PCI address or the address has an
 *   unexpected format.
 */
function parsePciBusNumber(gpu: IGpuInfo): number {
  // Fall back to an empty string so a GPU without any PCI information takes
  // the index fallback below instead of throwing on `undefined.match`.
  const source = gpu.pciBusId || gpu.pciSlot || '';
  const match = source.match(/(?:[0-9a-f]{4}:)?([0-9a-f]{2}):/i);
  if (!match) {
    return gpu.index;
  }
  return parseInt(match[1], 16);
}
/**
 * Partitions GPUs into topology groups.
 *
 * GPUs are ordered by vendor and PCI bus number; a GPU joins the group in
 * front of it when the vendor matches and its bus number is at most one
 * above the last bus number of that group. Otherwise it opens a new group
 * whose id is `<vendor>-<running group number>`. VRAM figures are the
 * per-GPU `vram / 1024` values, rounded individually before summing.
 */
export function buildGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
  const ordered = gpus.slice();
  ordered.sort((a, b) =>
    a.vendor !== b.vendor
      ? a.vendor.localeCompare(b.vendor)
      : parsePciBusNumber(a) - parsePciBusNumber(b)
  );

  const result: IClusterGpuTopologyGroup[] = [];
  ordered.forEach((gpu) => {
    const bus = parsePciBusNumber(gpu);
    const vramGb = Math.round(gpu.vram / 1024);
    const tail = result.length > 0 ? result[result.length - 1] : undefined;
    const tailBus = tail ? tail.busNumbers[tail.busNumbers.length - 1] : undefined;

    if (tail && tail.vendor === gpu.vendor && tailBus !== undefined && bus - tailBus <= 1) {
      // Neighbouring bus of the same vendor: extend the current group.
      tail.gpuIds.push(gpu.id);
      tail.busNumbers.push(bus);
      tail.totalVramGb += vramGb;
      tail.maxSingleGpuVramGb = Math.max(tail.maxSingleGpuVramGb, vramGb);
      tail.gpuCount = tail.gpuIds.length;
    } else {
      result.push({
        id: `${gpu.vendor}-${result.length + 1}`,
        vendor: gpu.vendor,
        gpuIds: [gpu.id],
        gpuCount: 1,
        totalVramGb: vramGb,
        maxSingleGpuVramGb: vramGb,
        busNumbers: [bus],
      });
    }
  });
  return result;
}
/**
 * Returns the topology groups of the given GPUs. This is an alias of
 * `buildGpuTopologyGroups` and hands its result back unchanged.
 */
export function summarizeGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
  const groups = buildGpuTopologyGroups(gpus);
  return groups;
}
/**
 * Chooses the GPUs a model should be launched on.
 *
 * A topology group is eligible when it has at least `minGpuCount` GPUs and
 * at least `minVramGb` of total VRAM. Eligible groups are ranked by how
 * close their GPU count is to the preferred tensor parallel size, then by
 * the least surplus VRAM, then by group id.
 *
 * Inside the chosen group the selection starts with the preferred tensor
 * parallel size (never fewer than `minGpuCount`) and grows GPU by GPU until
 * the selected GPUs together provide `minVramGb`. Eligibility is judged on
 * the VRAM of the whole group, so without the growth step a model could be
 * handed fewer GPUs than it needs to fit.
 *
 * NOTE(review): growth is one GPU at a time; if the engine only accepts
 * certain tensor parallel sizes, the grown size may need rounding — confirm.
 *
 * @returns the GPU ids, the tensor parallel size (equal to the number of
 *   ids) and the id of the chosen group, or `null` when no group is eligible.
 */
export function selectPlacementForModel(
  model: IModelCatalogEntry,
  gpus: IGpuInfo[],
): { gpuIds: string[]; tensorParallelSize: number; topologyGroupId: string } | null {
  const requiredVramGb = model.requirements.minVramGb;
  const minGpuCount = model.requirements.minGpuCount || 1;
  const preferredTensorParallel = model.launchDefaults?.tensorParallelSize || minGpuCount;

  const eligibleGroups = buildGpuTopologyGroups(gpus).filter((group) =>
    group.gpuCount >= minGpuCount && group.totalVramGb >= requiredVramGb
  );
  if (eligibleGroups.length === 0) {
    return null;
  }

  eligibleGroups.sort((left, right) => {
    const leftCountDelta = Math.abs(left.gpuCount - preferredTensorParallel);
    const rightCountDelta = Math.abs(right.gpuCount - preferredTensorParallel);
    if (leftCountDelta !== rightCountDelta) {
      return leftCountDelta - rightCountDelta;
    }
    const leftVramDelta = left.totalVramGb - requiredVramGb;
    const rightVramDelta = right.totalVramGb - requiredVramGb;
    if (leftVramDelta !== rightVramDelta) {
      return leftVramDelta - rightVramDelta;
    }
    return left.id.localeCompare(right.id);
  });
  const selectedGroup = eligibleGroups[0];

  // Per-GPU VRAM in GB, rounded the same way the group totals are built.
  const vramGbByGpuId = new Map<string, number>();
  for (const gpu of gpus) {
    vramGbByGpuId.set(gpu.id, Math.round(gpu.vram / 1024));
  }
  const vramOf = (gpuId: string): number => vramGbByGpuId.get(gpuId) ?? 0;

  const groupGpuIds = selectedGroup.gpuIds;
  let tensorParallelSize = Math.min(
    Math.max(preferredTensorParallel, minGpuCount),
    groupGpuIds.length,
  );
  let selectedVramGb = 0;
  for (let index = 0; index < tensorParallelSize; index++) {
    selectedVramGb += vramOf(groupGpuIds[index]);
  }
  // Add GPUs of the same group until the selection can hold the model.
  while (selectedVramGb < requiredVramGb && tensorParallelSize < groupGpuIds.length) {
    selectedVramGb += vramOf(groupGpuIds[tensorParallelSize]);
    tensorParallelSize += 1;
  }

  return {
    gpuIds: groupGpuIds.slice(0, tensorParallelSize),
    tensorParallelSize,
    topologyGroupId: selectedGroup.id,
  };
}
/**
 * Returns the GPUs whose id does not appear in `usedGpuIds`, in their
 * original order. Neither input array is modified.
 */
export function filterOutUsedGpus(gpus: IGpuInfo[], usedGpuIds: string[]): IGpuInfo[] {
  const taken = new Set(usedGpuIds);
  const free: IGpuInfo[] = [];
  for (const gpu of gpus) {
    if (taken.has(gpu.id)) {
      continue;
    }
    free.push(gpu);
  }
  return free;
}