feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
@@ -6,14 +6,13 @@
|
||||
|
||||
import type {
|
||||
IContainerConfig,
|
||||
IContainerStatus,
|
||||
IContainerEndpoint,
|
||||
IContainerStatus,
|
||||
TContainerType,
|
||||
} from '../interfaces/container.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { DockerManager } from '../docker/docker-manager.ts';
|
||||
import { BaseContainer } from './base-container.ts';
|
||||
import { OllamaContainer } from './ollama.ts';
|
||||
import { VllmContainer } from './vllm.ts';
|
||||
import { TgiContainer } from './tgi.ts';
|
||||
|
||||
@@ -47,8 +46,6 @@ export class ContainerManager {
|
||||
*/
|
||||
private createContainerInstance(config: IContainerConfig): BaseContainer {
|
||||
switch (config.type) {
|
||||
case 'ollama':
|
||||
return new OllamaContainer(config);
|
||||
case 'vllm':
|
||||
return new VllmContainer(config);
|
||||
case 'tgi':
|
||||
@@ -108,7 +105,11 @@ export class ContainerManager {
|
||||
try {
|
||||
this.addContainer(config);
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to load container ${config.id}: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.warn(
|
||||
`Failed to load container ${config.id}: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -128,7 +129,11 @@ export class ContainerManager {
|
||||
const success = await container.start();
|
||||
results.set(id, success);
|
||||
} catch (error) {
|
||||
logger.error(`Failed to start container ${id}: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to start container ${id}: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
results.set(id, false);
|
||||
}
|
||||
}
|
||||
@@ -147,7 +152,11 @@ export class ContainerManager {
|
||||
const success = await container.stop();
|
||||
results.set(id, success);
|
||||
} catch (error) {
|
||||
logger.error(`Failed to stop container ${id}: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to stop container ${id}: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
results.set(id, false);
|
||||
}
|
||||
}
|
||||
@@ -166,7 +175,11 @@ export class ContainerManager {
|
||||
const status = await container.getStatus();
|
||||
statuses.set(id, status);
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to get status for container ${id}: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.warn(
|
||||
`Failed to get status for container ${id}: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -266,7 +279,7 @@ export class ContainerManager {
|
||||
*/
|
||||
public async pullModel(
|
||||
modelName: string,
|
||||
containerType: TContainerType = 'ollama',
|
||||
containerType: TContainerType = 'vllm',
|
||||
containerId?: string,
|
||||
): Promise<boolean> {
|
||||
// Find or create appropriate container
|
||||
@@ -313,6 +326,16 @@ export class ContainerManager {
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
 * Aggregate the per-entry results of `healthCheck()` into a single flag
 *
 * @returns `false` as soon as any entry is falsy; `true` otherwise,
 *          including when `healthCheck()` yields no entries at all
 */
public async checkAllHealth(): Promise<boolean> {
  const healthFlags = await this.healthCheck();

  // An empty result set never enters the loop and therefore counts as healthy.
  for (const isHealthy of healthFlags.values()) {
    if (!isHealthy) {
      return false;
    }
  }

  return true;
}
|
||||
|
||||
/**
|
||||
* Print container status summary
|
||||
*/
|
||||
@@ -329,9 +352,7 @@ export class ContainerManager {
|
||||
for (const [id, status] of statuses) {
|
||||
const runningStr = status.running ? 'Running' : 'Stopped';
|
||||
const healthStr = status.health;
|
||||
const modelsStr = status.loadedModels.length > 0
|
||||
? status.loadedModels.join(', ')
|
||||
: 'None';
|
||||
const modelsStr = status.loadedModels.length > 0 ? status.loadedModels.join(', ') : 'None';
|
||||
|
||||
logger.logBoxLine(`${status.name} (${id})`);
|
||||
logger.logBoxLine(` Type: ${status.type} | Status: ${runningStr} | Health: ${healthStr}`);
|
||||
@@ -339,7 +360,9 @@ export class ContainerManager {
|
||||
logger.logBoxLine(` Endpoint: ${status.endpoint}`);
|
||||
|
||||
if (status.gpuUtilization !== undefined) {
|
||||
logger.logBoxLine(` GPU: ${status.gpuUtilization}% | Memory: ${status.memoryUsage || 0}MB`);
|
||||
logger.logBoxLine(
|
||||
` GPU: ${status.gpuUtilization}% | Memory: ${status.memoryUsage || 0}MB`,
|
||||
);
|
||||
}
|
||||
logger.logBoxLine('');
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
*/
|
||||
|
||||
export { BaseContainer } from './base-container.ts';
|
||||
export { OllamaContainer } from './ollama.ts';
|
||||
export { VllmContainer } from './vllm.ts';
|
||||
export { TgiContainer } from './tgi.ts';
|
||||
export { ContainerManager } from './container-manager.ts';
|
||||
|
||||
@@ -1,387 +0,0 @@
|
||||
/**
|
||||
* Ollama Container
|
||||
*
|
||||
* Manages Ollama containers for running local LLMs.
|
||||
*/
|
||||
|
||||
import type {
|
||||
IContainerConfig,
|
||||
ILoadedModel,
|
||||
TContainerType,
|
||||
} from '../interfaces/container.ts';
|
||||
import type {
|
||||
IChatCompletionRequest,
|
||||
IChatCompletionResponse,
|
||||
IChatCompletionChoice,
|
||||
IChatMessage,
|
||||
} from '../interfaces/api.ts';
|
||||
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { BaseContainer, type TModelPullProgress } from './base-container.ts';
|
||||
|
||||
/**
|
||||
* Ollama API response types
|
||||
*/
|
||||
/**
 * Body returned by the Ollama `/api/tags` endpoint
 */
interface IOllamaTagsResponse {
  /** Models reported by the Ollama instance */
  models: {
    /** Model identifier, used as the model name throughout this file */
    name: string;
    /** Reported model size */
    size: number;
    /** Content digest of the model */
    digest: string;
    /** Last-modified marker as sent by Ollama */
    modified_at: string;
  }[];
}
|
||||
|
||||
/**
 * Body sent to the Ollama `/api/chat` endpoint
 */
interface IOllamaChatRequest {
  /** Name of the model that should answer */
  model: string;
  /** Conversation so far, in order */
  messages: {
    role: string;
    content: string;
  }[];
  /** When true the response is consumed as a line-delimited JSON stream */
  stream?: boolean;
  /** Sampling parameters, mapped from the OpenAI-style request fields */
  options?: {
    temperature?: number;
    top_p?: number;
    /** Filled from the OpenAI-style `max_tokens` field */
    num_predict?: number;
    stop?: string[];
  };
}
|
||||
|
||||
/**
 * Object returned by the Ollama `/api/chat` endpoint
 *
 * The same shape is used for the single non-streaming reply and for every
 * line of a streaming reply.
 */
interface IOllamaChatResponse {
  model: string;
  created_at: string;
  /** Assistant message (or message fragment when streaming) */
  message: {
    role: string;
    content: string;
  };
  /** Set once the generation is complete */
  done: boolean;
  total_duration?: number;
  load_duration?: number;
  /** Reported as `prompt_tokens` in the OpenAI-style usage block */
  prompt_eval_count?: number;
  /** Reported as `completion_tokens` in the OpenAI-style usage block */
  eval_count?: number;
}
|
||||
|
||||
/**
 * One line of the streamed reply from the Ollama `/api/pull` endpoint
 */
interface IOllamaPullResponse {
  /** Human-readable description of the current pull step */
  status: string;
  digest?: string;
  /** Together with `completed`, used to derive a progress percentage */
  total?: number;
  completed?: number;
}
|
||||
|
||||
/**
|
||||
* Ollama container implementation
|
||||
*/
|
||||
/**
 * Ollama container implementation
 *
 * Talks to the Ollama HTTP API (`/api/tags`, `/api/pull`, `/api/delete`,
 * `/api/chat`) through the `fetch` / `fetchJson` helpers inherited from
 * `BaseContainer` and converts chat traffic to and from the OpenAI-style
 * request/response types.
 */
export class OllamaContainer extends BaseContainer {
  public readonly type: TContainerType = 'ollama';
  public readonly displayName = 'Ollama';
  public readonly defaultImage = CONTAINER_IMAGES.OLLAMA;
  public readonly defaultPort = CONTAINER_PORTS.OLLAMA;

  /**
   * @param config Container configuration. Missing `image`, `port` and
   *               `volumes` are filled in on this object with Ollama defaults.
   */
  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model storage
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-ollama-${config.id}:/root/.ollama`];
    }
  }

  /**
   * Create Ollama container configuration
   *
   * @param id      Container id; also used to name the default model volume
   * @param name    Display name of the container
   * @param gpuIds  GPUs assigned to the container
   * @param options Optional overrides; unset fields fall back to Ollama defaults
   * @returns A complete configuration of type `'ollama'`
   */
  public static createConfig(
    id: string,
    name: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    return {
      id,
      name,
      type: 'ollama',
      image: options.image || CONTAINER_IMAGES.OLLAMA,
      gpuIds,
      port: options.port || CONTAINER_PORTS.OLLAMA,
      externalPort: options.externalPort,
      models: options.models || [],
      env: options.env,
      volumes: options.volumes || [`modelgrid-ollama-${id}:/root/.ollama`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command: options.command,
    };
  }

  /**
   * Check if Ollama is healthy
   *
   * @returns `true` when `/api/tags` answers with an OK status within 5 seconds
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/api/tags', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   *
   * @returns Model names from `/api/tags`; an empty list (plus a warning log) on failure
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => m.name);
    } catch (error) {
      logger.warn(`Failed to list Ollama models: ${error instanceof Error ? error.message : String(error)}`);
      return [];
    }
  }

  /**
   * Get loaded models with details
   *
   * @returns One entry per model from `/api/tags`; an empty list on failure
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => ({
        name: m.name,
        size: m.size,
        // The first 12 characters of the digest are reported in the format field
        format: m.digest.substring(0, 12),
        loaded: true, // Ollama doesn't distinguish loaded vs available
        requestCount: 0,
      }));
    } catch {
      return [];
    }
  }

  /**
   * Pull a model
   *
   * Progress is reported whenever the status text changes: to `onProgress`
   * when given, otherwise to the logger.
   *
   * @param modelName  Name of the model to pull
   * @param onProgress Optional progress callback. An exception thrown by it
   *                   aborts the pull and makes this method return `false`.
   * @returns `true` when the pull stream finished without an error, else `false`
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    try {
      logger.info(`Pulling model: ${modelName}`);

      const response = await this.fetch('/api/pull', {
        method: 'POST',
        body: { name: modelName },
        timeout: 3600000, // 1 hour for large models
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }

      const reader = response.body?.getReader();
      if (!reader) {
        throw new Error('No response body');
      }

      let lastStatus = '';

      await OllamaContainer.consumeNdjson<IOllamaPullResponse & { error?: string }>(
        reader,
        (data) => {
          // Ollama reports a failed pull as an `error` object inside the stream;
          // without this check the failure would be logged as success.
          if (data.error) {
            throw new Error(data.error);
          }

          const status = data.status;
          if (status === lastStatus) {
            return;
          }
          lastStatus = status;

          let percent: number | undefined;
          if (data.total && data.completed) {
            percent = Math.round((data.completed / data.total) * 100);
          }

          if (onProgress) {
            onProgress({ model: modelName, status, percent });
          } else {
            const progressStr = percent !== undefined ? ` (${percent}%)` : '';
            logger.dim(`  ${status}${progressStr}`);
          }
        },
      );

      logger.success(`Model ${modelName} pulled successfully`);
      return true;
    } catch (error) {
      logger.error(`Failed to pull model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Remove a model
   *
   * @param modelName Name of the model to delete
   * @returns `true` when `/api/delete` answered with an OK status, else `false`
   */
  public async removeModel(modelName: string): Promise<boolean> {
    try {
      const response = await this.fetch('/api/delete', {
        method: 'DELETE',
        body: { name: modelName },
      });

      if (response.ok) {
        logger.success(`Model ${modelName} removed`);
        return true;
      }

      throw new Error(`HTTP ${response.status}`);
    } catch (error) {
      logger.error(`Failed to remove model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Send a chat completion request
   *
   * @param request OpenAI-style chat completion request
   * @returns The Ollama reply converted to an OpenAI-style completion
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    const response = await this.fetchJson<IOllamaChatResponse>('/api/chat', {
      method: 'POST',
      body: OllamaContainer.toOllamaChatRequest(request, false),
      timeout: 300000, // 5 minutes
    });

    // Convert to OpenAI format
    const created = Math.floor(Date.now() / 1000);

    const choice: IChatCompletionChoice = {
      index: 0,
      message: {
        role: 'assistant',
        content: response.message.content,
      },
      finish_reason: response.done ? 'stop' : null,
    };

    const promptTokens = response.prompt_eval_count || 0;
    const completionTokens = response.eval_count || 0;

    return {
      id: this.generateRequestId(),
      object: 'chat.completion',
      created,
      model: request.model,
      choices: [choice],
      usage: {
        prompt_tokens: promptTokens,
        completion_tokens: completionTokens,
        total_tokens: promptTokens + completionTokens,
      },
    };
  }

  /**
   * Stream a chat completion request
   *
   * Every Ollama stream object is re-emitted through `onChunk` as an
   * OpenAI-style `data: …` server-sent-event line, followed by
   * `data: [DONE]` once Ollama marks the generation as done.
   *
   * @param request OpenAI-style chat completion request
   * @param onChunk Receives each formatted event string. An exception thrown
   *                by it stops the stream and is rethrown to the caller.
   * @throws Error on a non-OK HTTP status, a missing response body, or an
   *         `error` object inside the stream
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const response = await this.fetch('/api/chat', {
      method: 'POST',
      body: OllamaContainer.toOllamaChatRequest(request, true),
      timeout: 300000,
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const requestId = this.generateRequestId();
    const created = Math.floor(Date.now() / 1000);

    await OllamaContainer.consumeNdjson<Partial<IOllamaChatResponse> & { error?: string }>(
      reader,
      (data) => {
        if (data.error) {
          throw new Error(data.error);
        }
        // Objects without a message carry nothing to forward
        if (!data.message) {
          return;
        }

        // Convert to OpenAI streaming format
        const chunk = {
          id: requestId,
          object: 'chat.completion.chunk',
          created,
          model: request.model,
          choices: [
            {
              index: 0,
              delta: {
                content: data.message.content,
              } as Partial<IChatMessage>,
              finish_reason: data.done ? 'stop' : null,
            },
          ],
        };

        onChunk(`data: ${JSON.stringify(chunk)}\n\n`);

        if (data.done) {
          onChunk('data: [DONE]\n\n');
        }
      },
    );
  }

  /**
   * Map an OpenAI-style chat request onto the Ollama `/api/chat` body
   *
   * @param request Incoming OpenAI-style request
   * @param stream  Value for the Ollama `stream` flag
   */
  private static toOllamaChatRequest(
    request: IChatCompletionRequest,
    stream: boolean,
  ): IOllamaChatRequest {
    // `stop` may be a single string or a list; Ollama expects a list
    const stop = Array.isArray(request.stop)
      ? request.stop
      : request.stop
      ? [request.stop]
      : undefined;

    return {
      model: request.model,
      messages: request.messages.map((m) => ({
        role: m.role,
        content: m.content,
      })),
      stream,
      options: {
        temperature: request.temperature,
        top_p: request.top_p,
        num_predict: request.max_tokens,
        stop,
      },
    };
  }

  /**
   * Read a newline-delimited JSON stream and hand each parsed object to `onLine`
   *
   * A JSON line may be split across two reads, so text is buffered until its
   * terminating newline arrives, and bytes are decoded in streaming mode so a
   * multi-byte character split across reads is not corrupted. Blank lines and
   * lines that are not valid JSON are skipped. Exceptions thrown by `onLine`
   * cancel the stream and propagate to the caller.
   *
   * @param reader Reader obtained from the response body
   * @param onLine Invoked once per successfully parsed line, in order
   */
  private static async consumeNdjson<T>(
    reader: ReadableStreamDefaultReader<Uint8Array>,
    onLine: (data: T) => void,
  ): Promise<void> {
    const decoder = new TextDecoder();
    let pending = '';

    const dispatch = (line: string): void => {
      if (!line.trim()) {
        return;
      }
      let parsed: T;
      try {
        parsed = JSON.parse(line) as T;
      } catch {
        // Invalid JSON line, skip
        return;
      }
      onLine(parsed);
    };

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        pending += decoder.decode(value, { stream: true });
        const lines = pending.split('\n');
        // The last element is an unterminated line (or ''): keep it for the next read
        pending = lines.pop() ?? '';

        for (const line of lines) {
          dispatch(line);
        }
      }

      // Flush the decoder and process a final line that had no trailing newline
      pending += decoder.decode();
      dispatch(pending);
    } catch (error) {
      // Stop the transfer; a failure to cancel must not mask the original error
      await reader.cancel().catch(() => undefined);
      throw error;
    } finally {
      reader.releaseLock();
    }
  }
}
|
||||
+15
-9
@@ -4,15 +4,11 @@
|
||||
* Manages HuggingFace Text Generation Inference containers.
|
||||
*/
|
||||
|
||||
import type { IContainerConfig, ILoadedModel, TContainerType } from '../interfaces/container.ts';
|
||||
import type {
|
||||
IContainerConfig,
|
||||
ILoadedModel,
|
||||
TContainerType,
|
||||
} from '../interfaces/container.ts';
|
||||
import type {
|
||||
IChatCompletionChoice,
|
||||
IChatCompletionRequest,
|
||||
IChatCompletionResponse,
|
||||
IChatCompletionChoice,
|
||||
IChatMessage,
|
||||
} from '../interfaces/api.ts';
|
||||
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
|
||||
@@ -161,7 +157,9 @@ export class TgiContainer extends BaseContainer {
|
||||
const info = await this.fetchJson<ITgiInfoResponse>('/info');
|
||||
return [info.model_id];
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to get TGI info: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.warn(
|
||||
`Failed to get TGI info: ${error instanceof Error ? error.message : String(error)}`,
|
||||
);
|
||||
return this.config.models || [];
|
||||
}
|
||||
}
|
||||
@@ -232,7 +230,11 @@ export class TgiContainer extends BaseContainer {
|
||||
temperature: request.temperature,
|
||||
top_p: request.top_p,
|
||||
max_new_tokens: request.max_tokens || 1024,
|
||||
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
|
||||
stop: Array.isArray(request.stop)
|
||||
? request.stop
|
||||
: request.stop
|
||||
? [request.stop]
|
||||
: undefined,
|
||||
do_sample: (request.temperature || 0) > 0,
|
||||
return_full_text: false,
|
||||
},
|
||||
@@ -288,7 +290,11 @@ export class TgiContainer extends BaseContainer {
|
||||
temperature: request.temperature,
|
||||
top_p: request.top_p,
|
||||
max_new_tokens: request.max_tokens || 1024,
|
||||
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
|
||||
stop: Array.isArray(request.stop)
|
||||
? request.stop
|
||||
: request.stop
|
||||
? [request.stop]
|
||||
: undefined,
|
||||
do_sample: (request.temperature || 0) > 0,
|
||||
},
|
||||
},
|
||||
|
||||
+30
-13
@@ -4,11 +4,7 @@
|
||||
* Manages vLLM containers for high-performance LLM inference.
|
||||
*/
|
||||
|
||||
import type {
|
||||
IContainerConfig,
|
||||
ILoadedModel,
|
||||
TContainerType,
|
||||
} from '../interfaces/container.ts';
|
||||
import type { IContainerConfig, ILoadedModel, TContainerType } from '../interfaces/container.ts';
|
||||
import type {
|
||||
IChatCompletionRequest,
|
||||
IChatCompletionResponse,
|
||||
@@ -72,20 +68,26 @@ export class VllmContainer extends BaseContainer {
|
||||
gpuIds: string[],
|
||||
options: Partial<IContainerConfig> = {},
|
||||
): IContainerConfig {
|
||||
// vLLM requires model to be specified at startup
|
||||
const command = [
|
||||
'--model', modelName,
|
||||
'--host', '0.0.0.0',
|
||||
'--port', String(options.port || CONTAINER_PORTS.VLLM),
|
||||
const command = options.command ? [...options.command] : [
|
||||
'--model',
|
||||
modelName,
|
||||
];
|
||||
|
||||
if (!command.includes('--host')) {
|
||||
command.push('--host', '0.0.0.0');
|
||||
}
|
||||
|
||||
if (!command.includes('--port')) {
|
||||
command.push('--port', String(options.port || CONTAINER_PORTS.VLLM));
|
||||
}
|
||||
|
||||
// Add tensor parallelism if multiple GPUs
|
||||
if (gpuIds.length > 1) {
|
||||
if (gpuIds.length > 1 && !command.includes('--tensor-parallel-size')) {
|
||||
command.push('--tensor-parallel-size', String(gpuIds.length));
|
||||
}
|
||||
|
||||
// Add additional options
|
||||
if (options.env?.VLLM_MAX_MODEL_LEN) {
|
||||
if (options.env?.VLLM_MAX_MODEL_LEN && !command.includes('--max-model-len')) {
|
||||
command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
|
||||
}
|
||||
|
||||
@@ -128,11 +130,17 @@ export class VllmContainer extends BaseContainer {
|
||||
* vLLM serves a single model per instance
|
||||
*/
|
||||
public async listModels(): Promise<string[]> {
  // Models named in the container configuration are returned without
  // querying the server. The catch branch already treats `models` as
  // possibly missing, so guard the length check the same way.
  const configuredModels = this.config.models ?? [];
  if (configuredModels.length > 0) {
    return configuredModels;
  }

  try {
    const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
    return (data.data || []).map((m) => m.id);
  } catch (error) {
    // Log the failure once, then fall back to the (empty) configured list
    logger.warn(
      `Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
    );
    return configuredModels;
  }
}
|
||||
@@ -141,6 +149,15 @@ export class VllmContainer extends BaseContainer {
|
||||
* Get loaded models with details
|
||||
*/
|
||||
public async getLoadedModels(): Promise<ILoadedModel[]> {
|
||||
if (this.config.models.length > 0) {
|
||||
return this.config.models.map((name) => ({
|
||||
name,
|
||||
size: 0,
|
||||
loaded: true,
|
||||
requestCount: 0,
|
||||
}));
|
||||
}
|
||||
|
||||
try {
|
||||
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
|
||||
return (data.data || []).map((m) => ({
|
||||
|
||||
Reference in New Issue
Block a user