feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+10
-5
@@ -141,8 +141,8 @@ const containerColumns: ITableColumn[] = [
|
||||
];
|
||||
|
||||
const containerData = [
|
||||
{ id: 'ollama-1', type: 'ollama', status: 'Running', gpu: 'gpu-0', models: '3' },
|
||||
{ id: 'vllm-1', type: 'vllm', status: 'Running', gpu: 'gpu-1', models: '1' },
|
||||
{ id: 'vllm-qwen', type: 'vllm', status: 'Running', gpu: 'gpu-0', models: '1' },
|
||||
{ id: 'vllm-llama', type: 'vllm', status: 'Running', gpu: 'gpu-1', models: '1' },
|
||||
];
|
||||
|
||||
logger.logTable(containerColumns, containerData, 'AI Containers');
|
||||
@@ -183,9 +183,14 @@ const modelColumns: ITableColumn[] = [
|
||||
];
|
||||
|
||||
const modelData = [
|
||||
{ name: 'llama3:8b', container: 'ollama-1', size: '4.7 GB', status: 'Loaded' },
|
||||
{ name: 'mistral:7b', container: 'ollama-1', size: '4.1 GB', status: 'Loaded' },
|
||||
{ name: 'llama3:70b', container: 'vllm-1', size: '40 GB', status: 'Loaded' },
|
||||
{ name: 'Qwen/Qwen2.5-7B-Instruct', container: 'vllm-qwen', size: '15 GB', status: 'Loaded' },
|
||||
{
|
||||
name: 'meta-llama/Llama-3.1-8B-Instruct',
|
||||
container: 'vllm-llama',
|
||||
size: '16 GB',
|
||||
status: 'Loaded',
|
||||
},
|
||||
{ name: 'BAAI/bge-m3', container: 'vllm-embed', size: '5 GB', status: 'Loaded' },
|
||||
];
|
||||
|
||||
logger.logTable(modelColumns, modelData, 'Loaded Models');
|
||||
|
||||
+415
-38
@@ -1,5 +1,7 @@
|
||||
import { assert, assertEquals, assertExists } from 'jsr:@std/assert@^1.0.0';
|
||||
import { shortId } from '../ts/helpers/shortid.ts';
|
||||
import { ClusterManager } from '../ts/cluster/cluster-manager.ts';
|
||||
import { buildGpuTopologyGroups } from '../ts/cluster/placement.ts';
|
||||
|
||||
// =============================================================================
|
||||
// UNIT TESTS - ModelGrid Core Components
|
||||
@@ -54,11 +56,22 @@ Deno.test('IModelGridConfig: valid config structure', () => {
|
||||
},
|
||||
containers: [],
|
||||
models: {
|
||||
greenlistUrl: 'https://example.com/greenlit.json',
|
||||
autoPull: true,
|
||||
defaultContainer: 'ollama',
|
||||
registryUrl: 'https://example.com/models.json',
|
||||
autoDeploy: true,
|
||||
defaultEngine: 'vllm' as const,
|
||||
autoLoad: [],
|
||||
},
|
||||
cluster: {
|
||||
enabled: false,
|
||||
nodeName: 'modelgrid-local',
|
||||
role: 'standalone' as const,
|
||||
bindHost: '0.0.0.0',
|
||||
gossipPort: 7946,
|
||||
sharedSecret: '',
|
||||
advertiseUrl: 'http://127.0.0.1:8080',
|
||||
heartbeatIntervalMs: 5000,
|
||||
seedNodes: [],
|
||||
},
|
||||
checkInterval: 30000,
|
||||
};
|
||||
|
||||
@@ -92,13 +105,13 @@ Deno.test('IGpuInfo: valid GPU info structure', () => {
|
||||
|
||||
Deno.test('IContainerConfig: valid container config structure', () => {
|
||||
const container = {
|
||||
id: 'ollama-1',
|
||||
type: 'ollama' as const,
|
||||
name: 'Ollama Container',
|
||||
image: 'ollama/ollama:latest',
|
||||
id: 'vllm-1',
|
||||
type: 'vllm' as const,
|
||||
name: 'vLLM Deployment',
|
||||
image: 'vllm/vllm-openai:latest',
|
||||
gpuIds: ['gpu-0'],
|
||||
port: 11434,
|
||||
models: ['llama3:8b'],
|
||||
port: 8000,
|
||||
models: ['meta-llama/Llama-3.1-8B-Instruct'],
|
||||
};
|
||||
|
||||
assertExists(container.id);
|
||||
@@ -110,53 +123,53 @@ Deno.test('IContainerConfig: valid container config structure', () => {
|
||||
});
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Greenlit Model Tests
|
||||
// Model Catalog Tests
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
Deno.test('Greenlit model validation: valid model passes', () => {
|
||||
const greenlist = {
|
||||
Deno.test('Catalog model validation: valid model passes', () => {
|
||||
const catalog = {
|
||||
version: '1.0',
|
||||
models: [
|
||||
{ name: 'llama3:8b', container: 'ollama', minVram: 8 },
|
||||
{ name: 'mistral:7b', container: 'ollama', minVram: 8 },
|
||||
{ id: 'meta-llama/Llama-3.1-8B-Instruct', engine: 'vllm', requirements: { minVramGb: 18 } },
|
||||
{ id: 'Qwen/Qwen2.5-7B-Instruct', engine: 'vllm', requirements: { minVramGb: 16 } },
|
||||
],
|
||||
};
|
||||
|
||||
const requestedModel = 'llama3:8b';
|
||||
const requestedModel = 'meta-llama/Llama-3.1-8B-Instruct';
|
||||
const availableVram = 24; // GB
|
||||
|
||||
const model = greenlist.models.find((m) => m.name === requestedModel);
|
||||
assertExists(model, 'Model should be in greenlist');
|
||||
assert(availableVram >= model.minVram, 'Should have enough VRAM');
|
||||
const model = catalog.models.find((m) => m.id === requestedModel);
|
||||
assertExists(model, 'Model should be in catalog');
|
||||
assert(availableVram >= model.requirements.minVramGb, 'Should have enough VRAM');
|
||||
});
|
||||
|
||||
Deno.test('Greenlit model validation: insufficient VRAM fails', () => {
|
||||
const greenlist = {
|
||||
Deno.test('Catalog model validation: insufficient VRAM fails', () => {
|
||||
const catalog = {
|
||||
version: '1.0',
|
||||
models: [
|
||||
{ name: 'llama3:70b', container: 'vllm', minVram: 48 },
|
||||
{ id: 'meta-llama/Llama-3.1-70B-Instruct', engine: 'vllm', requirements: { minVramGb: 48 } },
|
||||
],
|
||||
};
|
||||
|
||||
const requestedModel = 'llama3:70b';
|
||||
const requestedModel = 'meta-llama/Llama-3.1-70B-Instruct';
|
||||
const availableVram = 24; // GB
|
||||
|
||||
const model = greenlist.models.find((m) => m.name === requestedModel);
|
||||
assertExists(model, 'Model should be in greenlist');
|
||||
assert(availableVram < model.minVram, 'Should NOT have enough VRAM');
|
||||
const model = catalog.models.find((m) => m.id === requestedModel);
|
||||
assertExists(model, 'Model should be in catalog');
|
||||
assert(availableVram < model.requirements.minVramGb, 'Should NOT have enough VRAM');
|
||||
});
|
||||
|
||||
Deno.test('Greenlit model validation: unlisted model rejected', () => {
|
||||
const greenlist = {
|
||||
Deno.test('Catalog model validation: unlisted model rejected', () => {
|
||||
const catalog = {
|
||||
version: '1.0',
|
||||
models: [
|
||||
{ name: 'llama3:8b', container: 'ollama', minVram: 8 },
|
||||
{ id: 'meta-llama/Llama-3.1-8B-Instruct', engine: 'vllm', requirements: { minVramGb: 18 } },
|
||||
],
|
||||
};
|
||||
|
||||
const requestedModel = 'some-random-model:latest';
|
||||
const model = greenlist.models.find((m) => m.name === requestedModel);
|
||||
assertEquals(model, undefined, 'Model should NOT be in greenlist');
|
||||
const model = catalog.models.find((m) => m.id === requestedModel);
|
||||
assertEquals(model, undefined, 'Model should NOT be in catalog');
|
||||
});
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
@@ -223,16 +236,16 @@ Deno.test('Embedding request: array input passes', () => {
|
||||
// Container Type Tests
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
Deno.test('Container types: ollama configuration', () => {
|
||||
const ollamaConfig = {
|
||||
type: 'ollama' as const,
|
||||
image: 'ollama/ollama:latest',
|
||||
defaultPort: 11434,
|
||||
apiPath: '/api',
|
||||
Deno.test('Container types: vllm base configuration', () => {
|
||||
const vllmConfig = {
|
||||
type: 'vllm' as const,
|
||||
image: 'vllm/vllm-openai:latest',
|
||||
defaultPort: 8000,
|
||||
apiPath: '/v1',
|
||||
};
|
||||
|
||||
assertEquals(ollamaConfig.type, 'ollama');
|
||||
assertEquals(ollamaConfig.defaultPort, 11434);
|
||||
assertEquals(vllmConfig.type, 'vllm');
|
||||
assertEquals(vllmConfig.defaultPort, 8000);
|
||||
});
|
||||
|
||||
Deno.test('Container types: vllm configuration', () => {
|
||||
@@ -321,3 +334,367 @@ Deno.test('VRAM calculation: multiple models VRAM sum', () => {
|
||||
const totalVram = models.reduce((sum, m) => sum + m.vram, 0);
|
||||
assertEquals(totalVram, 16);
|
||||
});
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Cluster Scheduling Tests
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
Deno.test('Cluster manager resolves local model first', () => {
  // The same healthy deployment is registered on the local node ('control')
  // and on a remote node ('worker-a'); the assertion below expects the local
  // node to be returned.
  const modelId = 'meta-llama/Llama-3.1-8B-Instruct';
  const controlUrl = 'http://control:8080';
  const workerUrl = 'http://worker-a:8080';

  const manager = new ClusterManager();
  manager.configure({
    enabled: true,
    nodeName: 'control',
    role: 'control-plane',
    bindHost: '0.0.0.0',
    gossipPort: 7946,
    advertiseUrl: controlUrl,
    heartbeatIntervalMs: 5000,
    seedNodes: [],
  });

  // Local node: 2 GPUs in a single topology group, one deployment of the model.
  manager.updateLocalNode({
    nodeName: 'control',
    role: 'control-plane',
    endpoint: controlUrl,
    healthy: true,
    resources: {
      gpuCount: 2,
      totalVramGb: 48,
      availableVramGb: 48,
      maxSingleGpuVramGb: 24,
      largestGpuGroupCount: 2,
      largestGpuGroupVramGb: 48,
      deploymentCount: 1,
      topologyGroups: [
        {
          id: 'nvidia-1',
          vendor: 'nvidia',
          gpuIds: ['gpu-0', 'gpu-1'],
          gpuCount: 2,
          totalVramGb: 48,
          maxSingleGpuVramGb: 24,
          busNumbers: [1, 2],
        },
      ],
    },
    deployments: [
      { modelId, engine: 'vllm', endpoint: controlUrl, healthy: true, containerId: 'vllm-llama' },
    ],
    lastSeenAt: Date.now(),
  });

  // Remote worker: 4 GPUs in a single topology group, also serving the model.
  manager.upsertNode({
    nodeName: 'worker-a',
    role: 'worker',
    endpoint: workerUrl,
    healthy: true,
    resources: {
      gpuCount: 4,
      totalVramGb: 96,
      availableVramGb: 72,
      maxSingleGpuVramGb: 24,
      largestGpuGroupCount: 4,
      largestGpuGroupVramGb: 96,
      deploymentCount: 2,
      topologyGroups: [
        {
          id: 'nvidia-1',
          vendor: 'nvidia',
          gpuIds: ['gpu-0', 'gpu-1', 'gpu-2', 'gpu-3'],
          gpuCount: 4,
          totalVramGb: 96,
          maxSingleGpuVramGb: 24,
          busNumbers: [1, 2, 3, 4],
        },
      ],
    },
    deployments: [
      {
        modelId,
        engine: 'vllm',
        endpoint: workerUrl,
        healthy: true,
        containerId: 'vllm-llama-worker',
      },
    ],
    lastSeenAt: Date.now(),
  });

  const route = manager.resolveModel(modelId);
  assertExists(route);
  assertEquals(route.nodeName, 'control');
});
|
||||
|
||||
Deno.test('Cluster manager stores desired deployments', () => {
  // Registers a desired replica count on a fresh manager, then checks both the
  // returned record and the list reported by getDesiredDeployments().
  const modelId = 'meta-llama/Llama-3.1-8B-Instruct';
  const manager = new ClusterManager();

  const record = manager.upsertDesiredDeployment(modelId, 3);

  assertEquals(record.modelId, 'meta-llama/Llama-3.1-8B-Instruct');
  assertEquals(record.desiredReplicas, 3);
  assertEquals(manager.getDesiredDeployments().length, 1);
});
|
||||
|
||||
Deno.test('Cluster manager picks the node with enough VRAM', () => {
  // The model asks for 18 GB. The local node reports 8 GB available and the
  // worker reports 32 GB, so the assertion expects the worker to be chosen.
  const modelId = 'meta-llama/Llama-3.1-8B-Instruct';
  const controlUrl = 'http://control:8080';

  const manager = new ClusterManager();
  manager.configure({
    enabled: true,
    nodeName: 'control',
    role: 'control-plane',
    bindHost: '0.0.0.0',
    gossipPort: 7946,
    advertiseUrl: controlUrl,
    heartbeatIntervalMs: 5000,
    seedNodes: [],
  });

  // NOTE(review): totalVramGb (16) is larger than the 8 GB reported by the
  // single-GPU topology group below — confirm whether this fixture is intentional.
  manager.updateLocalNode({
    nodeName: 'control',
    role: 'control-plane',
    endpoint: controlUrl,
    healthy: true,
    resources: {
      gpuCount: 1,
      totalVramGb: 16,
      availableVramGb: 8,
      maxSingleGpuVramGb: 8,
      largestGpuGroupCount: 1,
      largestGpuGroupVramGb: 8,
      deploymentCount: 1,
      topologyGroups: [
        {
          id: 'nvidia-1',
          vendor: 'nvidia',
          gpuIds: ['gpu-0'],
          gpuCount: 1,
          totalVramGb: 8,
          maxSingleGpuVramGb: 8,
          busNumbers: [1],
        },
      ],
    },
    deployments: [],
    lastSeenAt: Date.now(),
  });

  // NOTE(review): totalVramGb (48) likewise differs from the 32 GB topology
  // group total — confirm against the resource-reporting code.
  manager.upsertNode({
    nodeName: 'worker-a',
    role: 'worker',
    endpoint: 'http://worker-a:8080',
    healthy: true,
    resources: {
      gpuCount: 2,
      totalVramGb: 48,
      availableVramGb: 32,
      maxSingleGpuVramGb: 16,
      largestGpuGroupCount: 2,
      largestGpuGroupVramGb: 32,
      deploymentCount: 0,
      topologyGroups: [
        {
          id: 'nvidia-1',
          vendor: 'nvidia',
          gpuIds: ['gpu-0', 'gpu-1'],
          gpuCount: 2,
          totalVramGb: 32,
          maxSingleGpuVramGb: 16,
          busNumbers: [1, 2],
        },
      ],
    },
    deployments: [],
    lastSeenAt: Date.now(),
  });

  const chosen = manager.pickNodeForModel({
    id: modelId,
    engine: 'vllm',
    source: { repo: modelId },
    capabilities: { chat: true },
    requirements: { minVramGb: 18, minGpuCount: 1 },
  });

  assertExists(chosen);
  assertEquals(chosen.nodeName, 'worker-a');
});
|
||||
|
||||
Deno.test('Cluster manager excludes cordoned nodes from placement', () => {
  // Both nodes report enough VRAM for the 18 GB model and the worker reports
  // more free VRAM (48 vs 32 GB), but the worker is cordoned before placement,
  // so the assertion expects the local node.
  const modelId = 'meta-llama/Llama-3.1-8B-Instruct';
  const controlUrl = 'http://control:8080';

  const manager = new ClusterManager();
  manager.configure({
    enabled: true,
    nodeName: 'control',
    role: 'control-plane',
    bindHost: '0.0.0.0',
    gossipPort: 7946,
    advertiseUrl: controlUrl,
    heartbeatIntervalMs: 5000,
    seedNodes: [],
  });

  manager.updateLocalNode({
    nodeName: 'control',
    role: 'control-plane',
    endpoint: controlUrl,
    healthy: true,
    resources: {
      gpuCount: 2,
      totalVramGb: 48,
      availableVramGb: 32,
      maxSingleGpuVramGb: 24,
      largestGpuGroupCount: 2,
      largestGpuGroupVramGb: 48,
      deploymentCount: 0,
      topologyGroups: [
        {
          id: 'nvidia-1',
          vendor: 'nvidia',
          gpuIds: ['gpu-0', 'gpu-1'],
          gpuCount: 2,
          totalVramGb: 48,
          maxSingleGpuVramGb: 24,
          busNumbers: [1, 2],
        },
      ],
    },
    deployments: [],
    lastSeenAt: Date.now(),
  });

  manager.upsertNode({
    nodeName: 'worker-a',
    role: 'worker',
    endpoint: 'http://worker-a:8080',
    healthy: true,
    resources: {
      gpuCount: 2,
      totalVramGb: 48,
      availableVramGb: 48,
      maxSingleGpuVramGb: 24,
      largestGpuGroupCount: 2,
      largestGpuGroupVramGb: 48,
      deploymentCount: 0,
      topologyGroups: [
        {
          id: 'nvidia-1',
          vendor: 'nvidia',
          gpuIds: ['gpu-0', 'gpu-1'],
          gpuCount: 2,
          totalVramGb: 48,
          maxSingleGpuVramGb: 24,
          busNumbers: [1, 2],
        },
      ],
    },
    deployments: [],
    lastSeenAt: Date.now(),
  });

  // Take the worker out of scheduling before asking for a placement.
  manager.setNodeSchedulerState('worker-a', 'cordoned');

  const chosen = manager.pickNodeForModel({
    id: modelId,
    engine: 'vllm',
    source: { repo: modelId },
    capabilities: { chat: true },
    requirements: { minVramGb: 18, minGpuCount: 1 },
  });

  assertExists(chosen);
  assertEquals(chosen.nodeName, 'control');
});
|
||||
|
||||
Deno.test('Topology grouping keeps distant PCI buses separate', () => {
  // Four identical GPUs: two on PCI buses 01/02 and two on buses 41/42.
  // The assertions expect two groups of two GPUs each.
  const topology = buildGpuTopologyGroups([
    { id: 'gpu-0', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:01:00.0', index: 0 },
    { id: 'gpu-1', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:02:00.0', index: 1 },
    { id: 'gpu-2', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:41:00.0', index: 2 },
    { id: 'gpu-3', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:42:00.0', index: 3 },
  ]);

  assertEquals(topology.length, 2);
  // With the length pinned to 2 above, this covers exactly groups 0 and 1.
  for (const group of topology) {
    assertEquals(group.gpuCount, 2);
  }
});
|
||||
|
||||
Deno.test('Cluster manager rejects node without suitable topology group', () => {
  // The only node has 4 GPUs and 96 GB in total, but they are split into two
  // topology groups of 2 GPUs / 48 GB each. The model requires 4 GPUs and
  // 72 GB, and the assertion expects no placement (null).
  const modelId = 'meta-llama/Llama-3.1-70B-Instruct';
  const controlUrl = 'http://control:8080';

  const manager = new ClusterManager();
  manager.configure({
    enabled: true,
    nodeName: 'control',
    role: 'control-plane',
    bindHost: '0.0.0.0',
    gossipPort: 7946,
    advertiseUrl: controlUrl,
    heartbeatIntervalMs: 5000,
    seedNodes: [],
  });

  manager.updateLocalNode({
    nodeName: 'control',
    role: 'control-plane',
    endpoint: controlUrl,
    healthy: true,
    resources: {
      gpuCount: 4,
      totalVramGb: 96,
      availableVramGb: 96,
      maxSingleGpuVramGb: 24,
      largestGpuGroupCount: 2,
      largestGpuGroupVramGb: 48,
      deploymentCount: 0,
      topologyGroups: [
        {
          id: 'nvidia-1',
          vendor: 'nvidia',
          gpuIds: ['gpu-0', 'gpu-1'],
          gpuCount: 2,
          totalVramGb: 48,
          maxSingleGpuVramGb: 24,
          busNumbers: [1, 2],
        },
        {
          id: 'nvidia-2',
          vendor: 'nvidia',
          gpuIds: ['gpu-2', 'gpu-3'],
          gpuCount: 2,
          totalVramGb: 48,
          maxSingleGpuVramGb: 24,
          busNumbers: [65, 66],
        },
      ],
    },
    deployments: [],
    lastSeenAt: Date.now(),
  });

  const chosen = manager.pickNodeForModel({
    id: modelId,
    engine: 'vllm',
    source: { repo: modelId },
    capabilities: { chat: true },
    requirements: { minVramGb: 72, minGpuCount: 4 },
    launchDefaults: { tensorParallelSize: 4 },
  });

  assertEquals(chosen, null);
});
|
||||
|
||||
Reference in New Issue
Block a user