/**
 * Ollama Container
 *
 * Manages Ollama containers for running local LLMs.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatCompletionChoice,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * Ollama API response types
 */
interface IOllamaTagsResponse {
  models: Array<{
    name: string;
    size: number;
    digest: string;
    modified_at: string;
  }>;
}

interface IOllamaChatRequest {
  model: string;
  messages: Array<{
    role: string;
    content: string;
  }>;
  stream?: boolean;
  options?: {
    temperature?: number;
    top_p?: number;
    num_predict?: number;
    stop?: string[];
  };
}

interface IOllamaChatResponse {
  model: string;
  created_at: string;
  message: {
    role: string;
    content: string;
  };
  done: boolean;
  total_duration?: number;
  load_duration?: number;
  prompt_eval_count?: number;
  eval_count?: number;
}

interface IOllamaPullResponse {
  status: string;
  digest?: string;
  total?: number;
  completed?: number;
}

/**
 * Ollama container implementation
 */
export class OllamaContainer extends BaseContainer {
  public readonly type: TContainerType = 'ollama';
  public readonly displayName = 'Ollama';
  public readonly defaultImage = CONTAINER_IMAGES.OLLAMA;
  public readonly defaultPort = CONTAINER_PORTS.OLLAMA;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model storage
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-ollama-${config.id}:/root/.ollama`];
    }
  }

  /**
   * Create Ollama container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    return {
      id,
      name,
      type: 'ollama',
      image: options.image || CONTAINER_IMAGES.OLLAMA,
      gpuIds,
      port: options.port || CONTAINER_PORTS.OLLAMA,
      externalPort: options.externalPort,
      models: options.models || [],
      env: options.env,
      volumes: options.volumes || [`modelgrid-ollama-${id}:/root/.ollama`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command: options.command,
    };
  }
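
  // Usage sketch: build a config with the static helper, then hand it to the
  // constructor. The id, name, GPU ids, and external port below are placeholders.
  //
  //   const config = OllamaContainer.createConfig('ollama-0', 'local-ollama', ['0'], {
  //     externalPort: 11434,
  //   });
  //   const ollama = new OllamaContainer(config);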

  /**
   * Check if Ollama is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/api/tags', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }
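
  // Usage sketch: poll until the Ollama API responds, e.g. right after starting
  // the container. Assumes an OllamaContainer instance `ollama`; the retry count
  // and delay are arbitrary.
  //
  //   for (let i = 0; i < 30 && !(await ollama.isHealthy()); i++) {
  //     await new Promise((resolve) => setTimeout(resolve, 1000));
  //   }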

  /**
   * List available models
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => m.name);
    } catch (error) {
      logger.warn(`Failed to list Ollama models: ${error instanceof Error ? error.message : String(error)}`);
      return [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => ({
        name: m.name,
        size: m.size,
        format: m.digest.substring(0, 12),
        loaded: true, // Ollama doesn't distinguish loaded vs available
        requestCount: 0,
      }));
    } catch {
      return [];
    }
  }

  /**
   * Pull a model
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    try {
      logger.info(`Pulling model: ${modelName}`);

      const response = await this.fetch('/api/pull', {
        method: 'POST',
        body: { name: modelName },
        timeout: 3600000, // 1 hour for large models
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }

      // Read streaming response
      const reader = response.body?.getReader();
      if (!reader) {
        throw new Error('No response body');
      }

      const decoder = new TextDecoder();
      let lastStatus = '';

      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        const text = decoder.decode(value);
        const lines = text.split('\n').filter((l) => l.trim());

        for (const line of lines) {
          try {
            const data = JSON.parse(line) as IOllamaPullResponse;
            const status = data.status;

            if (status !== lastStatus) {
              lastStatus = status;
              let percent: number | undefined;

              if (data.total && data.completed) {
                percent = Math.round((data.completed / data.total) * 100);
              }

              if (onProgress) {
                onProgress({ model: modelName, status, percent });
              } else {
                const progressStr = percent !== undefined ? ` (${percent}%)` : '';
                logger.dim(` ${status}${progressStr}`);
              }
            }
          } catch {
            // Invalid JSON line, skip
          }
        }
      }

      logger.success(`Model ${modelName} pulled successfully`);
      return true;
    } catch (error) {
      logger.error(`Failed to pull model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }
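
  // Usage sketch: pull a model and report progress through the optional callback.
  // Assumes an OllamaContainer instance `ollama`; the model tag is just an example.
  //
  //   const ok = await ollama.pullModel('llama3', ({ status, percent }) => {
  //     logger.info(percent !== undefined ? `${status} (${percent}%)` : status);
  //   });
  //   if (!ok) logger.error('Model pull failed');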

  /**
   * Remove a model
   */
  public async removeModel(modelName: string): Promise<boolean> {
    try {
      const response = await this.fetch('/api/delete', {
        method: 'DELETE',
        body: { name: modelName },
      });

      if (response.ok) {
        logger.success(`Model ${modelName} removed`);
        return true;
      }

      throw new Error(`HTTP ${response.status}`);
    } catch (error) {
      logger.error(`Failed to remove model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Send a chat completion request
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    const ollamaRequest: IOllamaChatRequest = {
      model: request.model,
      messages: request.messages.map((m) => ({
        role: m.role,
        content: m.content,
      })),
      stream: false,
      options: {
        temperature: request.temperature,
        top_p: request.top_p,
        num_predict: request.max_tokens,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
      },
    };

    const response = await this.fetchJson<IOllamaChatResponse>('/api/chat', {
      method: 'POST',
      body: ollamaRequest,
      timeout: 300000, // 5 minutes
    });

    // Convert to OpenAI format
    const created = Math.floor(Date.now() / 1000);

    const choice: IChatCompletionChoice = {
      index: 0,
      message: {
        role: 'assistant',
        content: response.message.content,
      },
      finish_reason: response.done ? 'stop' : null,
    };

    return {
      id: this.generateRequestId(),
      object: 'chat.completion',
      created,
      model: request.model,
      choices: [choice],
      usage: {
        prompt_tokens: response.prompt_eval_count || 0,
        completion_tokens: response.eval_count || 0,
        total_tokens: (response.prompt_eval_count || 0) + (response.eval_count || 0),
      },
    };
  }
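
  // Usage sketch: the request mirrors the OpenAI chat-completions shape; only the
  // fields this adapter maps (model, messages, temperature, top_p, max_tokens,
  // stop) affect the Ollama call. Assumes an OllamaContainer instance `ollama`.
  //
  //   const reply = await ollama.chatCompletion({
  //     model: 'llama3',
  //     messages: [{ role: 'user', content: 'Hello!' }],
  //     temperature: 0.7,
  //     max_tokens: 256,
  //   });
  //   logger.info(reply.choices[0].message.content);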

  /**
   * Stream a chat completion request
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const ollamaRequest: IOllamaChatRequest = {
      model: request.model,
      messages: request.messages.map((m) => ({
        role: m.role,
        content: m.content,
      })),
      stream: true,
      options: {
        temperature: request.temperature,
        top_p: request.top_p,
        num_predict: request.max_tokens,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
      },
    };

    const response = await this.fetch('/api/chat', {
      method: 'POST',
      body: ollamaRequest,
      timeout: 300000,
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    const requestId = this.generateRequestId();
    const created = Math.floor(Date.now() / 1000);

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const text = decoder.decode(value);
      const lines = text.split('\n').filter((l) => l.trim());

      for (const line of lines) {
        try {
          const data = JSON.parse(line) as IOllamaChatResponse;

          // Convert to OpenAI streaming format
          const chunk = {
            id: requestId,
            object: 'chat.completion.chunk',
            created,
            model: request.model,
            choices: [
              {
                index: 0,
                delta: {
                  content: data.message.content,
                } as Partial<IChatMessage>,
                finish_reason: data.done ? 'stop' : null,
              },
            ],
          };

          onChunk(`data: ${JSON.stringify(chunk)}\n\n`);

          if (data.done) {
            onChunk('data: [DONE]\n\n');
          }
        } catch {
          // Invalid JSON, skip
        }
      }
    }
  }
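
  // Usage sketch: chunks arrive already formatted as OpenAI-style SSE lines, so a
  // proxy can forward them to a client verbatim. Assumes an OllamaContainer
  // instance `ollama` and a writable HTTP response `res`.
  //
  //   await ollama.chatCompletionStream(
  //     { model: 'llama3', messages: [{ role: 'user', content: 'Hi' }] },
  //     (chunk) => res.write(chunk),
  //   );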
}