feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
@@ -0,0 +1,114 @@
|
||||
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
|
||||
import type { IGpuInfo, TGpuVendor } from '../interfaces/gpu.ts';
|
||||
import type { IClusterGpuTopologyGroup } from '../interfaces/cluster.ts';
|
||||
|
||||
/**
 * Extracts the PCI bus number from a GPU's PCI address.
 *
 * Reads `gpu.pciBusId`, falling back to `gpu.pciSlot`, and searches for a
 * two-hex-digit bus field followed by a colon, optionally preceded by a
 * four-hex-digit domain and colon (for example `0000:01:00.0` or `01:00.0`).
 *
 * @param gpu - GPU whose PCI address should be parsed.
 * @returns The bus field parsed as a hexadecimal number, or `gpu.index` when
 *   no address is present or the address does not match the pattern.
 */
function parsePciBusNumber(gpu: IGpuInfo): number {
  // Default to an empty string so a GPU that carries neither address field
  // takes the index fallback below instead of throwing on `.match`.
  const source = gpu.pciBusId || gpu.pciSlot || '';
  const match = source.match(/(?:[0-9a-f]{4}:)?([0-9a-f]{2}):/i);
  if (!match) {
    return gpu.index;
  }

  return parseInt(match[1], 16);
}
|
||||
|
||||
/**
 * Partitions GPUs into topology groups of same-vendor GPUs that sit on
 * neighbouring PCI buses.
 *
 * GPUs are ordered by vendor and then by PCI bus number (see
 * `parsePciBusNumber`). Walking that order, a GPU joins the most recent group
 * when the vendor matches and its bus number is at most one higher than the
 * bus number of the last GPU added to that group; otherwise it starts a new
 * group. The input array is not mutated.
 *
 * VRAM figures are `gpu.vram / 1024`, rounded per GPU, and accumulated into the
 * `*Gb` fields (presumably `gpu.vram` is in MB - confirm against `IGpuInfo`).
 *
 * @param gpus - GPUs to group.
 * @returns The groups in creation order. Group ids are
 *   `<vendor>-<1-based position in the returned array>`.
 */
export function buildGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
  // Parse every PCI address exactly once; the comparator and the grouping
  // loop both reuse the cached bus number.
  const entries = gpus.map((gpu) => ({ gpu, busNumber: parsePciBusNumber(gpu) }));

  entries.sort((left, right) => {
    if (left.gpu.vendor !== right.gpu.vendor) {
      return left.gpu.vendor.localeCompare(right.gpu.vendor);
    }

    return left.busNumber - right.busNumber;
  });

  const groups: IClusterGpuTopologyGroup[] = [];

  for (const { gpu, busNumber } of entries) {
    const vramGb = Math.round(gpu.vram / 1024);
    const previousGroup = groups[groups.length - 1];
    const previousBus = previousGroup?.busNumbers[previousGroup.busNumbers.length - 1];

    // Entries are sorted, so within one vendor the difference is never
    // negative; "<= 1" therefore means "same or directly following bus".
    const belongsToPreviousGroup = previousGroup &&
      previousGroup.vendor === gpu.vendor &&
      previousBus !== undefined &&
      busNumber - previousBus <= 1;

    if (belongsToPreviousGroup) {
      previousGroup.gpuIds.push(gpu.id);
      previousGroup.busNumbers.push(busNumber);
      previousGroup.gpuCount = previousGroup.gpuIds.length;
      previousGroup.totalVramGb += vramGb;
      previousGroup.maxSingleGpuVramGb = Math.max(previousGroup.maxSingleGpuVramGb, vramGb);
      continue;
    }

    groups.push({
      id: `${gpu.vendor}-${groups.length + 1}`,
      vendor: gpu.vendor,
      gpuIds: [gpu.id],
      gpuCount: 1,
      totalVramGb: vramGb,
      maxSingleGpuVramGb: vramGb,
      busNumbers: [busNumber],
    });
  }

  return groups;
}
|
||||
|
||||
/**
 * Returns the topology groups for the given GPUs.
 *
 * Currently a direct alias of `buildGpuTopologyGroups`; the result is passed
 * through unchanged.
 *
 * @param gpus - GPUs to summarize.
 * @returns The groups produced by `buildGpuTopologyGroups(gpus)`.
 */
export function summarizeGpuTopologyGroups(gpus: IGpuInfo[]): IClusterGpuTopologyGroup[] {
  return buildGpuTopologyGroups(gpus);
}
|
||||
|
||||
/**
 * Chooses the GPUs a model should be launched on.
 *
 * A topology group is eligible when it has at least
 * `model.requirements.minGpuCount` GPUs (default 1) and at least
 * `model.requirements.minVramGb` of total VRAM. Eligible groups are ranked by:
 *   1. how close their GPU count is to the preferred tensor-parallel size
 *      (`model.launchDefaults.tensorParallelSize`, defaulting to the minimum
 *      GPU count),
 *   2. the smallest VRAM surplus over the requirement,
 *   3. group id, as a deterministic tie-break.
 *
 * From the best group the first N GPUs are taken, where N starts at
 * `min(preferred size, group size)` and is increased within the group until
 * the taken GPUs together satisfy `minVramGb`.
 *
 * @param model - Catalog entry supplying requirements and launch defaults.
 * @param gpus - Candidate GPUs.
 * @returns The selected GPU ids, the tensor-parallel size (equal to the number
 *   of returned ids) and the id of the chosen group, or `null` when no group
 *   is eligible.
 */
export function selectPlacementForModel(
  model: IModelCatalogEntry,
  gpus: IGpuInfo[],
): { gpuIds: string[]; tensorParallelSize: number; topologyGroupId: string } | null {
  const minGpuCount = model.requirements.minGpuCount || 1;
  const minVramGb = model.requirements.minVramGb;
  const preferredTensorParallel = model.launchDefaults?.tensorParallelSize || minGpuCount;
  const topologyGroups = buildGpuTopologyGroups(gpus);

  const eligibleGroups = topologyGroups.filter((group) =>
    group.gpuCount >= minGpuCount && group.totalVramGb >= minVramGb
  );

  if (eligibleGroups.length === 0) {
    return null;
  }

  eligibleGroups.sort((left, right) => {
    const leftCountDelta = Math.abs(left.gpuCount - preferredTensorParallel);
    const rightCountDelta = Math.abs(right.gpuCount - preferredTensorParallel);
    if (leftCountDelta !== rightCountDelta) {
      return leftCountDelta - rightCountDelta;
    }

    const leftVramDelta = left.totalVramGb - minVramGb;
    const rightVramDelta = right.totalVramGb - minVramGb;
    if (leftVramDelta !== rightVramDelta) {
      return leftVramDelta - rightVramDelta;
    }

    return left.id.localeCompare(right.id);
  });

  const selectedGroup = eligibleGroups[0];

  // Per-GPU VRAM, rounded the same way the topology groups accumulate it, in
  // the order of the selected group's GPU ids.
  const vramGbById = new Map<string, number>();
  for (const gpu of gpus) {
    vramGbById.set(gpu.id, Math.round(gpu.vram / 1024));
  }
  const groupVramGb = selectedGroup.gpuIds.map((id) => vramGbById.get(id) ?? 0);

  let tensorParallelSize = Math.min(preferredTensorParallel, selectedGroup.gpuCount);
  let coveredVramGb = 0;
  for (let i = 0; i < tensorParallelSize; i++) {
    coveredVramGb += groupVramGb[i] ?? 0;
  }

  // Eligibility was judged on the whole group's VRAM, but only a prefix of the
  // group is handed out. Extend the prefix until it actually covers the
  // requirement; the group total guarantees this is reachable.
  while (coveredVramGb < minVramGb && tensorParallelSize < selectedGroup.gpuCount) {
    coveredVramGb += groupVramGb[tensorParallelSize] ?? 0;
    tensorParallelSize += 1;
  }

  return {
    gpuIds: selectedGroup.gpuIds.slice(0, tensorParallelSize),
    tensorParallelSize,
    topologyGroupId: selectedGroup.id,
  };
}
|
||||
|
||||
/**
 * Returns the GPUs whose id is not listed in `usedGpuIds`.
 *
 * @param gpus - Candidate GPUs; the array is not mutated.
 * @param usedGpuIds - Ids of GPUs to exclude.
 * @returns A new array with the remaining GPUs in their original order.
 */
export function filterOutUsedGpus(gpus: IGpuInfo[], usedGpuIds: string[]): IGpuInfo[] {
  // A Set keeps each membership check constant-time.
  const usedSet = new Set(usedGpuIds);
  return gpus.filter((gpu) => !usedSet.has(gpu.id));
}
|
||||
Reference in New Issue
Block a user