diff --git a/changelog.md b/changelog.md
index 42c6673..313b534 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## 2025-02-03 - 0.1.0 - feat(providers)
+Add vision and document processing capabilities to providers
+
+- OpenAI and Ollama providers now support vision tasks using GPT-4 Vision and Llava models respectively.
+- Document processing has been implemented for OpenAI and Ollama providers, converting PDFs to images for analysis.
+- Introduced abstract methods for vision and document processing in the MultiModalModel class.
+- Updated the readme file with examples for vision and document processing.
+
 ## 2025-02-03 - 0.0.19 - fix(core)
 Enhanced chat streaming and error handling across providers
 
diff --git a/readme.md b/readme.md
index b8c2b35..472e030 100644
--- a/readme.md
+++ b/readme.md
@@ -17,8 +17,8 @@ This command installs the package and adds it to your project's dependencies.
 @push.rocks/smartai supports multiple AI providers, each with its own unique capabilities:
 
 ### OpenAI
-- Models: GPT-4, GPT-3.5-turbo
-- Features: Chat, Streaming, Audio Generation
+- Models: GPT-4, GPT-3.5-turbo, GPT-4-vision-preview
+- Features: Chat, Streaming, Audio Generation, Vision, Document Processing
 - Configuration:
   ```typescript
   openaiToken: 'your-openai-token'
@@ -49,12 +49,13 @@ This command installs the package and adds it to your project's dependencies.
   ```
 
 ### Ollama
-- Models: Configurable (default: llama2)
-- Features: Chat, Streaming
+- Models: Configurable (default: llama2, llava for vision/documents)
+- Features: Chat, Streaming, Vision, Document Processing
 - Configuration:
   ```typescript
   baseUrl: 'http://localhost:11434' // Optional
   model: 'llama2' // Optional
+  visionModel: 'llava' // Optional, for vision and document tasks
   ```
 
 ## Usage
@@ -147,15 +148,47 @@ const audioStream = await smartAi.openaiProvider.audio({
 
 ### Document Processing
 
-For providers that support document processing (currently OpenAI):
+For providers that support document processing (OpenAI and Ollama):
 
 ```typescript
+// Using OpenAI
 const result = await smartAi.openaiProvider.document({
   systemMessage: 'Classify the document type',
   userMessage: 'What type of document is this?',
   messageHistory: [],
   pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
 });
+
+// Using Ollama with llava
+const analysis = await smartAi.ollamaProvider.document({
+  systemMessage: 'You are a document analysis assistant',
+  userMessage: 'Extract the key information from this document',
+  messageHistory: [],
+  pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
+});
+```
+
+Both providers will:
+1. Convert PDF documents to images
+2. Process each page using their vision models
+3. Return a comprehensive analysis based on the system message and user query
+
+### Vision Processing
+
+For providers that support vision tasks (OpenAI and Ollama):
+
+```typescript
+// Using OpenAI's GPT-4 Vision
+const description = await smartAi.openaiProvider.vision({
+  image: imageBuffer, // Buffer containing the image data
+  prompt: 'What do you see in this image?'
+});
+
+// Using Ollama's Llava model
+const analysis = await smartAi.ollamaProvider.vision({
+  image: imageBuffer,
+  prompt: 'Analyze this image in detail'
+});
 ```
 
 ## Error Handling
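The readme examples above reference `pdfBuffer` and `imageBuffer` without showing where they come from. A minimal sketch of how they could be produced, assuming Node's `fs/promises` in an ESM context; the file paths and variable names are illustrative and not part of the patch:

```typescript
import { promises as fs } from 'fs';

// fs.readFile resolves to a Buffer, which satisfies both the Uint8Array
// expected by `pdfDocuments` and the Buffer expected by `vision({ image })`.
const pdfBuffer: Uint8Array = await fs.readFile('./contract.pdf'); // hypothetical path
const imageBuffer: Buffer = await fs.readFile('./photo.jpg');      // hypothetical path
```

Any source that yields a Buffer or Uint8Array (an upload, an HTTP download, a generated file) plugs into the same options.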
diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts
index b8f9017..6db5558 100644
--- a/ts/00_commitinfo_data.ts
+++ b/ts/00_commitinfo_data.ts
@@ -3,6 +3,6 @@
  */
 export const commitinfo = {
   name: '@push.rocks/smartai',
-  version: '0.0.19',
+  version: '0.1.0',
   description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
 }
diff --git a/ts/abstract.classes.multimodal.ts b/ts/abstract.classes.multimodal.ts
index a827ba7..de8f099 100644
--- a/ts/abstract.classes.multimodal.ts
+++ b/ts/abstract.classes.multimodal.ts
@@ -62,4 +62,25 @@ export abstract class MultiModalModel {
    * @throws Error if the provider doesn't support audio generation
    */
   public abstract audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream>;
+
+  /**
+   * Vision-language processing
+   * @param optionsArg Options containing the image and prompt for analysis
+   * @returns Promise resolving to the model's description or analysis of the image
+   * @throws Error if the provider doesn't support vision tasks
+   */
+  public abstract vision(optionsArg: { image: Buffer; prompt: string }): Promise<string>;
+
+  /**
+   * Document analysis and processing
+   * @param optionsArg Options containing system message, user message, PDF documents, and message history
+   * @returns Promise resolving to the model's analysis of the documents
+   * @throws Error if the provider doesn't support document processing
+   */
+  public abstract document(optionsArg: {
+    systemMessage: string;
+    userMessage: string;
+    pdfDocuments: Uint8Array[];
+    messageHistory: ChatMessage[];
+  }): Promise<{ message: any }>;
 }
diff --git a/ts/provider.anthropic.ts b/ts/provider.anthropic.ts
index 204fae1..6c5ccf4 100644
--- a/ts/provider.anthropic.ts
+++ b/ts/provider.anthropic.ts
@@ -130,4 +130,17 @@ export class AnthropicProvider extends MultiModalModel {
     // Anthropic does not provide an audio API, so this method is not implemented.
     throw new Error('Audio generation is not yet supported by Anthropic.');
   }
+
+  public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
+    throw new Error('Vision tasks are not yet supported by Anthropic.');
+  }
+
+  public async document(optionsArg: {
+    systemMessage: string;
+    userMessage: string;
+    pdfDocuments: Uint8Array[];
+    messageHistory: ChatMessage[];
+  }): Promise<{ message: any }> {
+    throw new Error('Document processing is not yet supported by Anthropic.');
+  }
 }
\ No newline at end of file
diff --git a/ts/provider.groq.ts b/ts/provider.groq.ts
index a5b310e..6cc6029 100644
--- a/ts/provider.groq.ts
+++ b/ts/provider.groq.ts
@@ -176,4 +176,17 @@ export class GroqProvider extends MultiModalModel {
     // Groq does not provide an audio API, so this method is not implemented.
     throw new Error('Audio generation is not yet supported by Groq.');
   }
+
+  public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
+    throw new Error('Vision tasks are not yet supported by Groq.');
+  }
+
+  public async document(optionsArg: {
+    systemMessage: string;
+    userMessage: string;
+    pdfDocuments: Uint8Array[];
+    messageHistory: ChatMessage[];
+  }): Promise<{ message: any }> {
+    throw new Error('Document processing is not yet supported by Groq.');
+  }
 }
\ No newline at end of file
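Because Anthropic and Groq (and Perplexity further below) only gain throwing stubs in this change, code that works against the shared `MultiModalModel` type has to be prepared for the "not supported" errors shown above. A minimal sketch of such a guard, assuming `MultiModalModel` is exported from the package; the helper name is hypothetical:

```typescript
import type { MultiModalModel } from '@push.rocks/smartai'; // assumed export

// Hypothetical helper: returns the model's description, or null when the
// provider's vision() stub throws its "not supported" error.
async function describeImageIfSupported(
  provider: MultiModalModel,
  image: Buffer,
  prompt: string
): Promise<string | null> {
  try {
    return await provider.vision({ image, prompt });
  } catch (error) {
    console.warn((error as Error).message); // e.g. "Vision tasks are not yet supported by Anthropic."
    return null;
  }
}
```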
diff --git a/ts/provider.ollama.ts b/ts/provider.ollama.ts
index 4ace9de..c34d1cb 100644
--- a/ts/provider.ollama.ts
+++ b/ts/provider.ollama.ts
@@ -6,18 +6,21 @@ import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.
 export interface IOllamaProviderOptions {
   baseUrl?: string;
   model?: string;
+  visionModel?: string; // Model to use for vision tasks (e.g. 'llava')
 }
 
 export class OllamaProvider extends MultiModalModel {
   private options: IOllamaProviderOptions;
   private baseUrl: string;
   private model: string;
+  private visionModel: string;
 
   constructor(optionsArg: IOllamaProviderOptions = {}) {
     super();
     this.options = optionsArg;
     this.baseUrl = optionsArg.baseUrl || 'http://localhost:11434';
     this.model = optionsArg.model || 'llama2';
+    this.visionModel = optionsArg.visionModel || 'llava';
   }
 
   async start() {
@@ -167,4 +170,83 @@ export class OllamaProvider extends MultiModalModel {
   public async audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream> {
     throw new Error('Audio generation is not supported by Ollama.');
   }
+
+  public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
+    const base64Image = optionsArg.image.toString('base64');
+
+    const response = await fetch(`${this.baseUrl}/api/chat`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify({
+        model: this.visionModel,
+        messages: [{
+          role: 'user',
+          content: optionsArg.prompt,
+          images: [base64Image]
+        }],
+        stream: false
+      }),
+    });
+
+    if (!response.ok) {
+      throw new Error(`Ollama API error: ${response.statusText}`);
+    }
+
+    const result = await response.json();
+    return result.message.content;
+  }
+
+  public async document(optionsArg: {
+    systemMessage: string;
+    userMessage: string;
+    pdfDocuments: Uint8Array[];
+    messageHistory: ChatMessage[];
+  }): Promise<{ message: any }> {
+    // Convert PDF documents to images using SmartPDF
+    const smartpdfInstance = new plugins.smartpdf.SmartPdf();
+    let documentImageBytesArray: Uint8Array[] = [];
+
+    for (const pdfDocument of optionsArg.pdfDocuments) {
+      const documentImageArray = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
+      documentImageBytesArray = documentImageBytesArray.concat(documentImageArray);
+    }
+
+    // Convert images to base64
+    const base64Images = documentImageBytesArray.map(bytes => Buffer.from(bytes).toString('base64'));
+
+    // Send request to Ollama with images
+    const response = await fetch(`${this.baseUrl}/api/chat`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify({
+        model: this.visionModel,
+        messages: [
+          { role: 'system', content: optionsArg.systemMessage },
+          ...optionsArg.messageHistory,
+          {
+            role: 'user',
+            content: optionsArg.userMessage,
+            images: base64Images
+          }
+        ],
+        stream: false
+      }),
+    });
+
+    if (!response.ok) {
+      throw new Error(`Ollama API error: ${response.statusText}`);
+    }
+
+    const result = await response.json();
+    return {
+      message: {
+        role: 'assistant',
+        content: result.message.content
+      }
+    };
+  }
 }
\ No newline at end of file
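To exercise the new Ollama code path directly, the provider can be constructed with an explicit `visionModel`; when options are omitted, the constructor above falls back to `http://localhost:11434`, `llama2`, and `llava`. A sketch under the assumptions that `OllamaProvider` is exported from the package root, that a local Ollama server has the llava model pulled, and that the file path is hypothetical:

```typescript
import { promises as fs } from 'fs';
import { OllamaProvider } from '@push.rocks/smartai'; // assumed export

const ollama = new OllamaProvider({
  baseUrl: 'http://localhost:11434', // default shown in the constructor above
  model: 'llama2',                   // used for plain chat
  visionModel: 'llava',              // used by vision() and document()
});
await ollama.start(); // the class defines start(); awaiting it before use is an assumption about the lifecycle

// vision() base64-encodes the Buffer and POSTs it to /api/chat with the
// configured vision model, returning the model's text answer.
const answer = await ollama.vision({
  image: await fs.readFile('./diagram.png'),
  prompt: 'Describe this diagram',
});
console.log(answer);
```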
diff --git a/ts/provider.openai.ts b/ts/provider.openai.ts
index 31f8215..d266987 100644
--- a/ts/provider.openai.ts
+++ b/ts/provider.openai.ts
@@ -192,4 +192,27 @@ export class OpenAiProvider extends MultiModalModel {
       message: result.choices[0].message,
     };
   }
+
+  public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
+    const result = await this.openAiApiClient.chat.completions.create({
+      model: 'gpt-4-vision-preview',
+      messages: [
+        {
+          role: 'user',
+          content: [
+            { type: 'text', text: optionsArg.prompt },
+            {
+              type: 'image_url',
+              image_url: {
+                url: `data:image/jpeg;base64,${optionsArg.image.toString('base64')}`
+              }
+            }
+          ]
+        }
+      ],
+      max_tokens: 300
+    });
+
+    return result.choices[0].message.content || '';
+  }
 }
diff --git a/ts/provider.perplexity.ts b/ts/provider.perplexity.ts
index b86c443..ee19647 100644
--- a/ts/provider.perplexity.ts
+++ b/ts/provider.perplexity.ts
@@ -155,4 +155,17 @@ export class PerplexityProvider extends MultiModalModel {
   public async audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream> {
     throw new Error('Audio generation is not supported by Perplexity.');
   }
+
+  public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
+    throw new Error('Vision tasks are not supported by Perplexity.');
+  }
+
+  public async document(optionsArg: {
+    systemMessage: string;
+    userMessage: string;
+    pdfDocuments: Uint8Array[];
+    messageHistory: ChatMessage[];
+  }): Promise<{ message: any }> {
+    throw new Error('Document processing is not supported by Perplexity.');
+  }
 }
\ No newline at end of file
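The OpenAI implementation above always embeds the image as a `data:image/jpeg;base64,...` URL and caps the reply at `max_tokens: 300`, so JPEG input is the most natural fit. A usage sketch; the constructor option follows the readme's `openaiToken` configuration block and, like the `start()` call, is an assumption rather than something this patch defines:

```typescript
import { promises as fs } from 'fs';
import { OpenAiProvider } from '@push.rocks/smartai'; // assumed export

const openai = new OpenAiProvider({ openaiToken: 'your-openai-token' }); // option name taken from the readme
await openai.start(); // assumption: the provider is started before use, as with OllamaProvider above

// vision() sends the prompt plus the base64 JPEG data URL to
// gpt-4-vision-preview and returns the first choice's text (or '').
const description = await openai.vision({
  image: await fs.readFile('./receipt.jpg'), // hypothetical path
  prompt: 'What is the total amount on this receipt?',
});
console.log(description);
```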