feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+88
-13
@@ -5,6 +5,8 @@
|
||||
* This makes configuration easier and code more self-documenting.
|
||||
*/
|
||||
|
||||
/**
 * Version string exported by this constants module.
 * NOTE(review): presumably the application/package version — confirm it is
 * kept in sync with the package manifest on release.
 */
export const VERSION = '1.0.1';
|
||||
|
||||
/**
|
||||
* Default timing values in milliseconds
|
||||
*/
|
||||
@@ -106,9 +108,6 @@ export const CONTAINER_PORTS = {
|
||||
* Container image defaults
|
||||
*/
|
||||
export const CONTAINER_IMAGES = {
|
||||
/** Ollama official image */
|
||||
OLLAMA: 'ollama/ollama:latest',
|
||||
|
||||
/** vLLM official image */
|
||||
VLLM: 'vllm/vllm-openai:latest',
|
||||
|
||||
@@ -120,20 +119,96 @@ export const CONTAINER_IMAGES = {
|
||||
* Model registry constants
|
||||
*/
|
||||
export const MODEL_REGISTRY = {
|
||||
/** Default greenlit models URL */
|
||||
DEFAULT_GREENLIST_URL:
|
||||
'https://code.foss.global/modelgrid.com/model_lists/raw/branch/main/greenlit.json',
|
||||
/** Default public catalog URL */
|
||||
DEFAULT_CATALOG_URL: 'https://list.modelgrid.com/catalog/models.json',
|
||||
|
||||
/** Fallback greenlist if remote fetch fails */
|
||||
FALLBACK_GREENLIST: [
|
||||
{ name: 'llama3.2:1b', container: 'ollama', minVram: 4 },
|
||||
{ name: 'llama3.2:3b', container: 'ollama', minVram: 6 },
|
||||
{ name: 'llama3:8b', container: 'ollama', minVram: 8 },
|
||||
{ name: 'mistral:7b', container: 'ollama', minVram: 8 },
|
||||
{ name: 'codellama:7b', container: 'ollama', minVram: 8 },
|
||||
/** Fallback catalog if remote fetch fails */
|
||||
FALLBACK_CATALOG: [
|
||||
{
|
||||
id: 'Qwen/Qwen2.5-7B-Instruct',
|
||||
aliases: ['qwen2.5-7b-instruct'],
|
||||
engine: 'vllm',
|
||||
source: {
|
||||
repo: 'Qwen/Qwen2.5-7B-Instruct',
|
||||
license: 'apache-2.0',
|
||||
},
|
||||
capabilities: {
|
||||
chat: true,
|
||||
completions: true,
|
||||
tools: true,
|
||||
},
|
||||
requirements: {
|
||||
minVramGb: 16,
|
||||
recommendedVramGb: 24,
|
||||
minGpuCount: 1,
|
||||
},
|
||||
metadata: {
|
||||
family: 'Qwen2.5',
|
||||
parameterCount: '7B',
|
||||
contextWindow: 131072,
|
||||
summary: 'General purpose instruct model for chat and tool use.',
|
||||
tags: ['chat', 'tool-use', 'instruct'],
|
||||
},
|
||||
},
|
||||
{
|
||||
id: 'meta-llama/Llama-3.1-8B-Instruct',
|
||||
aliases: ['llama-3.1-8b-instruct'],
|
||||
engine: 'vllm',
|
||||
source: {
|
||||
repo: 'meta-llama/Llama-3.1-8B-Instruct',
|
||||
license: 'llama3.1',
|
||||
},
|
||||
capabilities: {
|
||||
chat: true,
|
||||
completions: true,
|
||||
tools: true,
|
||||
},
|
||||
requirements: {
|
||||
minVramGb: 18,
|
||||
recommendedVramGb: 24,
|
||||
minGpuCount: 1,
|
||||
},
|
||||
metadata: {
|
||||
family: 'Llama 3.1',
|
||||
parameterCount: '8B',
|
||||
contextWindow: 131072,
|
||||
summary: 'High quality instruct model with good ecosystem support.',
|
||||
tags: ['chat', 'tool-use', 'instruct'],
|
||||
},
|
||||
},
|
||||
{
|
||||
id: 'BAAI/bge-m3',
|
||||
aliases: ['bge-m3'],
|
||||
engine: 'vllm',
|
||||
source: {
|
||||
repo: 'BAAI/bge-m3',
|
||||
license: 'mit',
|
||||
},
|
||||
capabilities: {
|
||||
embeddings: true,
|
||||
},
|
||||
requirements: {
|
||||
minVramGb: 8,
|
||||
recommendedVramGb: 12,
|
||||
minGpuCount: 1,
|
||||
},
|
||||
metadata: {
|
||||
family: 'BGE',
|
||||
summary: 'Multilingual embedding model for retrieval workloads.',
|
||||
tags: ['embeddings', 'retrieval', 'multilingual'],
|
||||
},
|
||||
},
|
||||
],
|
||||
} as const;
|
||||
|
||||
/**
 * Cluster constants: default bind host, gossip port, heartbeat and staleness
 * timings, and the name of the cluster auth header.
 * Declared `as const`, so every value keeps its literal, readonly type.
 */
export const CLUSTER = {
|
||||
/**
 * Default bind host. '0.0.0.0' is the IPv4 wildcard address.
 * NOTE(review): presumably makes the cluster listener reachable on all
 * interfaces — confirm this exposure is intended as a default.
 */
DEFAULT_BIND_HOST: '0.0.0.0',
|
||||
/** Default gossip port number. */
DEFAULT_GOSSIP_PORT: 7946,
|
||||
/** Default heartbeat interval, in milliseconds (5 seconds). */
DEFAULT_HEARTBEAT_INTERVAL_MS: 5000,
|
||||
/**
 * Staleness threshold, in milliseconds (20 seconds) — four times the default
 * heartbeat interval.
 * NOTE(review): presumably a node not heard from for this long is treated as
 * stale — confirm against the code that consumes this value.
 */
NODE_STALE_AFTER_MS: 20000,
|
||||
/**
 * Name of the header that carries the cluster secret.
 * NOTE(review): presumably an HTTP header checked on inter-node requests —
 * confirm where it is set and validated.
 */
AUTH_HEADER_NAME: 'x-modelgrid-cluster-secret',
|
||||
} as const;
|
||||
|
||||
/**
|
||||
* Configuration paths
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user