feat(providers): Add vision and document processing capabilities to providers

This commit is contained in:
2025-02-03 15:26:00 +01:00
parent e82c510094
commit eda8ce36df
9 changed files with 212 additions and 6 deletions

View File

@ -3,6 +3,6 @@
*/
// Package metadata (name, version, description) exported for consumers of this module.
export const commitinfo = {
  name: '@push.rocks/smartai',
  // Exactly one `version` key: an object literal must not repeat a property name,
  // so the superseded '0.0.19' entry is dropped in favor of the current release.
  version: '0.1.0',
  description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
}

View File

@ -62,4 +62,25 @@ export abstract class MultiModalModel {
* @throws Error if the provider doesn't support audio generation
*/
public abstract audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream>;
/**
* Vision-language processing: analyzes a single image guided by a text prompt.
* @param optionsArg Options containing the image and prompt for analysis
* @param optionsArg.image Raw image bytes (this signature does not fix an image format)
* @param optionsArg.prompt Text instruction or question about the image
* @returns Promise resolving to the model's description or analysis of the image
* @throws Error if the provider doesn't support vision tasks
*/
public abstract vision(optionsArg: { image: Buffer; prompt: string }): Promise<string>;
/**
* Document analysis and processing
* @param optionsArg Options containing system message, user message, PDF documents, and message history
* @param optionsArg.systemMessage System-level instruction for the model
* @param optionsArg.userMessage The user's request about the documents
* @param optionsArg.pdfDocuments PDF files as raw bytes, one entry per document
* @param optionsArg.messageHistory Earlier conversation messages supplied alongside the request
* @returns Promise resolving to the model's analysis of the documents, wrapped as `{ message }`;
*          `message` is typed `any`, so its exact shape is decided by each provider
* @throws Error if the provider doesn't support document processing
*/
public abstract document(optionsArg: {
systemMessage: string;
userMessage: string;
pdfDocuments: Uint8Array[];
messageHistory: ChatMessage[];
}): Promise<{ message: any }>;
}

View File

@ -130,4 +130,17 @@ export class AnthropicProvider extends MultiModalModel {
// Anthropic does not provide an audio API, so this method is not implemented.
throw new Error('Audio generation is not yet supported by Anthropic.');
}
/**
 * Vision stub for the Anthropic provider.
 * Always rejects, because this provider has no vision implementation yet.
 * @param optionsArg Image bytes and prompt (ignored)
 * @throws Error on every call
 */
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
  const unsupportedError = new Error('Vision tasks are not yet supported by Anthropic.');
  throw unsupportedError;
}
/**
 * Document-processing stub for the Anthropic provider.
 * Always rejects, because this provider has no document implementation yet.
 * @param optionsArg System/user messages, PDF bytes and history (ignored)
 * @throws Error on every call
 */
public async document(optionsArg: {
  systemMessage: string;
  userMessage: string;
  pdfDocuments: Uint8Array[];
  messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
  const unsupportedError = new Error('Document processing is not yet supported by Anthropic.');
  throw unsupportedError;
}
}

View File

@ -176,4 +176,17 @@ export class GroqProvider extends MultiModalModel {
// Groq does not provide an audio API, so this method is not implemented.
throw new Error('Audio generation is not yet supported by Groq.');
}
/**
 * Vision stub for the Groq provider.
 * Always rejects, because this provider has no vision implementation yet.
 * @param optionsArg Image bytes and prompt (ignored)
 * @throws Error on every call
 */
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
  const unsupportedError = new Error('Vision tasks are not yet supported by Groq.');
  throw unsupportedError;
}
/**
 * Document-processing stub for the Groq provider.
 * Always rejects, because this provider has no document implementation yet.
 * @param optionsArg System/user messages, PDF bytes and history (ignored)
 * @throws Error on every call
 */
public async document(optionsArg: {
  systemMessage: string;
  userMessage: string;
  pdfDocuments: Uint8Array[];
  messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
  const unsupportedError = new Error('Document processing is not yet supported by Groq.');
  throw unsupportedError;
}
}

View File

@ -6,18 +6,21 @@ import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.
/**
* Configuration accepted by the OllamaProvider constructor.
* Every field is optional; the constructor substitutes a default for any missing (or falsy) value.
*/
export interface IOllamaProviderOptions {
// Base URL of the Ollama HTTP API; the provider falls back to 'http://localhost:11434'.
baseUrl?: string;
// Model name; the provider falls back to 'llama2'.
model?: string;
// The provider falls back to 'llava' when this is not set.
visionModel?: string; // Model to use for vision tasks (e.g. 'llava')
}
export class OllamaProvider extends MultiModalModel {
// Options object exactly as it was passed to the constructor.
private options: IOllamaProviderOptions;
// Base URL that the `/api/chat` requests are sent to.
private baseUrl: string;
// Model name resolved from the options (fallback 'llama2').
private model: string;
// Model name sent with vision() and document() requests (fallback 'llava').
private visionModel: string;
/**
 * Creates the provider and resolves its effective settings.
 * Missing or falsy option values are replaced by the built-in defaults.
 * @param optionsArg Optional overrides for base URL, model and vision model
 */
constructor(optionsArg: IOllamaProviderOptions = {}) {
  super();
  const { baseUrl, model, visionModel } = optionsArg;
  this.options = optionsArg;
  this.baseUrl = baseUrl || 'http://localhost:11434';
  this.model = model || 'llama2';
  this.visionModel = visionModel || 'llava';
}
async start() {
@ -167,4 +170,83 @@ export class OllamaProvider extends MultiModalModel {
/**
 * Audio stub for the Ollama provider.
 * Always rejects, because this provider does not implement audio generation.
 * @param optionsArg Message to speak (ignored)
 * @throws Error on every call
 */
public async audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream> {
  const unsupportedError = new Error('Audio generation is not supported by Ollama.');
  throw unsupportedError;
}
/**
 * Sends one image plus a text prompt to the configured vision model through
 * the `/api/chat` endpoint (non-streaming) and returns the reply text.
 * @param optionsArg.image Raw image bytes; they are base64-encoded before sending
 * @param optionsArg.prompt Text instruction or question about the image
 * @returns The `message.content` string of the chat response
 * @throws Error if the HTTP response is not OK, or if the response body does
 *         not contain a `message.content` string
 */
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
  const base64Image = optionsArg.image.toString('base64');

  const response = await fetch(`${this.baseUrl}/api/chat`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: this.visionModel,
      messages: [{
        role: 'user',
        content: optionsArg.prompt,
        images: [base64Image]
      }],
      stream: false
    }),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.statusText}`);
  }

  const result = await response.json();

  // Validate the payload shape instead of dereferencing blindly, so that a
  // malformed 2xx body yields a descriptive error rather than an opaque TypeError.
  const content = result?.message?.content;
  if (typeof content !== 'string') {
    throw new Error('Ollama API error: response did not contain a message with text content.');
  }
  return content;
}
/**
 * Answers a question about PDF documents by rendering them to PNG images and
 * sending those images, together with the conversation, to the configured
 * vision model through the `/api/chat` endpoint (non-streaming).
 * @param optionsArg.systemMessage Sent as the leading `system` message
 * @param optionsArg.userMessage Sent as the final `user` message, with the images attached
 * @param optionsArg.pdfDocuments PDF files as raw bytes; converted one after another
 * @param optionsArg.messageHistory Earlier messages, inserted between system and user message
 * @returns `{ message: { role: 'assistant', content } }` built from the response text
 * @throws Error if the HTTP response is not OK, or if the response body does
 *         not contain a `message.content` string
 */
public async document(optionsArg: {
  systemMessage: string;
  userMessage: string;
  pdfDocuments: Uint8Array[];
  messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
  // Convert PDF documents to images using SmartPDF
  // NOTE(review): the instance is neither started nor stopped here; confirm
  // whether SmartPdf needs an explicit lifecycle around convertPDFToPngBytes.
  const smartpdfInstance = new plugins.smartpdf.SmartPdf();
  let documentImageBytesArray: Uint8Array[] = [];

  for (const pdfDocument of optionsArg.pdfDocuments) {
    const documentImageArray = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
    documentImageBytesArray = documentImageBytesArray.concat(documentImageArray);
  }

  // Convert images to base64
  const base64Images = documentImageBytesArray.map(bytes => Buffer.from(bytes).toString('base64'));

  // Send request to Ollama with images
  const response = await fetch(`${this.baseUrl}/api/chat`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: this.visionModel,
      messages: [
        { role: 'system', content: optionsArg.systemMessage },
        ...optionsArg.messageHistory,
        {
          role: 'user',
          content: optionsArg.userMessage,
          images: base64Images
        }
      ],
      stream: false
    }),
  });

  if (!response.ok) {
    throw new Error(`Ollama API error: ${response.statusText}`);
  }

  const result = await response.json();

  // Validate the payload shape instead of dereferencing blindly, so that a
  // malformed 2xx body yields a descriptive error rather than an opaque TypeError.
  const content = result?.message?.content;
  if (typeof content !== 'string') {
    throw new Error('Ollama API error: response did not contain a message with text content.');
  }

  return {
    message: {
      role: 'assistant',
      content
    }
  };
}
}

View File

@ -192,4 +192,27 @@ export class OpenAiProvider extends MultiModalModel {
message: result.choices[0].message,
};
}
/**
 * Sends one image plus a text prompt to an OpenAI vision-capable chat model
 * and returns the reply text.
 * @param optionsArg.image Raw image bytes; embedded in the request as a base64 data URL
 * @param optionsArg.prompt Text instruction or question about the image
 * @returns The content of the first choice, or an empty string when there is none
 */
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
  const image = optionsArg.image;

  // Pick the data-URL MIME type from the file signature rather than always
  // claiming JPEG. Unrecognized data keeps the previous 'image/jpeg' label.
  const hasSignature = (signature: number[], offset = 0): boolean =>
    signature.every((byte, index) => image[offset + index] === byte);
  let mimeType = 'image/jpeg';
  if (hasSignature([0x89, 0x50, 0x4e, 0x47])) {
    mimeType = 'image/png';
  } else if (hasSignature([0x47, 0x49, 0x46, 0x38])) {
    mimeType = 'image/gif';
  } else if (hasSignature([0x52, 0x49, 0x46, 0x46]) && hasSignature([0x57, 0x45, 0x42, 0x50], 8)) {
    // RIFF container whose form type at byte 8 is 'WEBP'
    mimeType = 'image/webp';
  }

  const result = await this.openAiApiClient.chat.completions.create({
    // 'gpt-4-vision-preview' has been retired by OpenAI; 'gpt-4o' is the
    // documented replacement for image inputs.
    model: 'gpt-4o',
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: optionsArg.prompt },
          {
            type: 'image_url',
            image_url: {
              url: `data:${mimeType};base64,${image.toString('base64')}`
            }
          }
        ]
      }
    ],
    max_tokens: 300
  });

  // Guard against an empty `choices` array instead of dereferencing index 0 blindly.
  return result.choices[0]?.message?.content ?? '';
}
}

View File

@ -155,4 +155,17 @@ export class PerplexityProvider extends MultiModalModel {
/**
 * Audio stub for the Perplexity provider.
 * Always rejects, because this provider does not implement audio generation.
 * @param optionsArg Message to speak (ignored)
 * @throws Error on every call
 */
public async audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream> {
  const unsupportedError = new Error('Audio generation is not supported by Perplexity.');
  throw unsupportedError;
}
/**
 * Vision stub for the Perplexity provider.
 * Always rejects, because this provider does not implement vision tasks.
 * @param optionsArg Image bytes and prompt (ignored)
 * @throws Error on every call
 */
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
  const unsupportedError = new Error('Vision tasks are not supported by Perplexity.');
  throw unsupportedError;
}
/**
 * Document-processing stub for the Perplexity provider.
 * Always rejects, because this provider does not implement document processing.
 * @param optionsArg System/user messages, PDF bytes and history (ignored)
 * @throws Error on every call
 */
public async document(optionsArg: {
  systemMessage: string;
  userMessage: string;
  pdfDocuments: Uint8Array[];
  messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
  const unsupportedError = new Error('Document processing is not supported by Perplexity.');
  throw unsupportedError;
}
}