feat(providers): Add vision and document processing capabilities to providers

This commit is contained in:
Philipp Kunz 2025-02-03 15:26:00 +01:00
parent e82c510094
commit eda8ce36df
9 changed files with 212 additions and 6 deletions

View File

@@ -1,5 +1,13 @@
# Changelog
## 2025-02-03 - 0.1.0 - feat(providers)
Add vision and document processing capabilities to providers
- OpenAI and Ollama providers now support vision tasks using GPT-4 Vision and Llava models, respectively.
- Document processing has been implemented for OpenAI and Ollama providers, converting PDFs to images for analysis.
- Introduced abstract methods for vision and document processing in the MultiModalModel class.
- Updated the readme file with examples for vision and document processing.
## 2025-02-03 - 0.0.19 - fix(core)
Enhanced chat streaming and error handling across providers

View File

@@ -17,8 +17,8 @@ This command installs the package and adds it to your project's dependencies.
@push.rocks/smartai supports multiple AI providers, each with its own unique capabilities:
### OpenAI
- Models: GPT-4, GPT-3.5-turbo
- Features: Chat, Streaming, Audio Generation
- Models: GPT-4, GPT-3.5-turbo, GPT-4-vision-preview
- Features: Chat, Streaming, Audio Generation, Vision, Document Processing
- Configuration:
```typescript
openaiToken: 'your-openai-token'
@@ -49,12 +49,13 @@ This command installs the package and adds it to your project's dependencies.
```
### Ollama
- Models: Configurable (default: llama2)
- Features: Chat, Streaming
- Models: Configurable (default: llama2, llava for vision/documents)
- Features: Chat, Streaming, Vision, Document Processing
- Configuration:
```typescript
baseUrl: 'http://localhost:11434' // Optional
model: 'llama2' // Optional
visionModel: 'llava' // Optional, for vision and document tasks
```
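These configuration keys correspond to the `OllamaProvider` constructor options, so the provider can also be constructed directly. A minimal sketch, assuming `OllamaProvider` is exported from the package entry point:
```typescript
import { OllamaProvider } from '@push.rocks/smartai';

// Direct construction, mirroring the configuration keys above
const ollama = new OllamaProvider({
  baseUrl: 'http://localhost:11434', // optional, this is the default
  model: 'llama2',                   // optional, default chat model
  visionModel: 'llava',              // optional, used for vision and document tasks
});
await ollama.start();
```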
## Usage
@@ -147,15 +148,47 @@ const audioStream = await smartAi.openaiProvider.audio({
### Document Processing
For providers that support document processing (currently OpenAI):
For providers that support document processing (OpenAI and Ollama):
```typescript
// Using OpenAI
const result = await smartAi.openaiProvider.document({
systemMessage: 'Classify the document type',
userMessage: 'What type of document is this?',
messageHistory: [],
pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
});
// Using Ollama with llava
const analysis = await smartAi.ollamaProvider.document({
systemMessage: 'You are a document analysis assistant',
userMessage: 'Extract the key information from this document',
messageHistory: [],
pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
});
```
Both providers will:
1. Convert PDF documents to images
2. Process each page using their vision models
3. Return a comprehensive analysis based on the system message and user query
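A rough, hand-rolled sketch of that per-page flow using the public `vision()` API; the `pdfPageToPngBuffers` helper below is hypothetical, standing in for any PDF-to-PNG rasterizer:
```typescript
// Hypothetical helper, standing in for any PDF-to-PNG rasterizer
declare function pdfPageToPngBuffers(pdf: Uint8Array): Promise<Buffer[]>;

const pageImages = await pdfPageToPngBuffers(pdfBuffer);

const pageNotes: string[] = [];
for (const pageImage of pageImages) {
  // Each rendered page is analyzed with the provider's vision model
  pageNotes.push(
    await smartAi.ollamaProvider.vision({
      image: pageImage,
      prompt: 'Extract the key information from this page'
    })
  );
}
// pageNotes now holds one analysis per page; document() performs a similar
// aggregation internally and answers against the full set of pages
```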
### Vision Processing
For providers that support vision tasks (OpenAI and Ollama):
```typescript
// Using OpenAI's GPT-4 Vision
const description = await smartAi.openaiProvider.vision({
image: imageBuffer, // Buffer containing the image data
prompt: 'What do you see in this image?'
});
// Using Ollama's Llava model
const analysis = await smartAi.ollamaProvider.vision({
image: imageBuffer,
prompt: 'Analyze this image in detail'
});
```
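The `image` option is a plain Node.js `Buffer`, so any source of raw image bytes works. For example, reading a local file (the file name is just a placeholder):
```typescript
import { promises as fs } from 'fs';

// Load a local image into a Buffer before handing it to vision()
const imageBuffer = await fs.readFile('./photo.jpg');
const description = await smartAi.openaiProvider.vision({
  image: imageBuffer,
  prompt: 'What do you see in this image?'
});
```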
## Error Handling

View File

@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: '@push.rocks/smartai',
version: '0.0.19',
version: '0.1.0',
description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
}

View File

@@ -62,4 +62,25 @@ export abstract class MultiModalModel {
* @throws Error if the provider doesn't support audio generation
*/
public abstract audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream>;
/**
* Vision-language processing
* @param optionsArg Options containing the image and prompt for analysis
* @returns Promise resolving to the model's description or analysis of the image
* @throws Error if the provider doesn't support vision tasks
*/
public abstract vision(optionsArg: { image: Buffer; prompt: string }): Promise<string>;
/**
* Document analysis and processing
* @param optionsArg Options containing system message, user message, PDF documents, and message history
* @returns Promise resolving to the model's analysis of the documents
* @throws Error if the provider doesn't support document processing
*/
public abstract document(optionsArg: {
systemMessage: string;
userMessage: string;
pdfDocuments: Uint8Array[];
messageHistory: ChatMessage[];
}): Promise<{ message: any }>;
}
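Since both methods are declared on the abstract `MultiModalModel`, calling code can stay provider-agnostic. A minimal sketch (the `describeImage` helper is illustrative, not part of the library):
```typescript
// Accepts any provider implementing the abstract vision() contract.
// Providers without vision support throw, so callers may want to catch.
async function describeImage(provider: MultiModalModel, image: Buffer): Promise<string> {
  return provider.vision({ image, prompt: 'Describe this image' });
}
```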

View File

@@ -130,4 +130,17 @@ export class AnthropicProvider extends MultiModalModel {
// Anthropic does not provide an audio API, so this method is not implemented.
throw new Error('Audio generation is not yet supported by Anthropic.');
}
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
throw new Error('Vision tasks are not yet supported by Anthropic.');
}
public async document(optionsArg: {
systemMessage: string;
userMessage: string;
pdfDocuments: Uint8Array[];
messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
throw new Error('Document processing is not yet supported by Anthropic.');
}
}

View File

@@ -176,4 +176,17 @@ export class GroqProvider extends MultiModalModel {
// Groq does not provide an audio API, so this method is not implemented.
throw new Error('Audio generation is not yet supported by Groq.');
}
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
throw new Error('Vision tasks are not yet supported by Groq.');
}
public async document(optionsArg: {
systemMessage: string;
userMessage: string;
pdfDocuments: Uint8Array[];
messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
throw new Error('Document processing is not yet supported by Groq.');
}
}

View File

@@ -6,18 +6,21 @@ import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.
export interface IOllamaProviderOptions {
baseUrl?: string;
model?: string;
visionModel?: string; // Model to use for vision tasks (e.g. 'llava')
}
export class OllamaProvider extends MultiModalModel {
private options: IOllamaProviderOptions;
private baseUrl: string;
private model: string;
private visionModel: string;
constructor(optionsArg: IOllamaProviderOptions = {}) {
super();
this.options = optionsArg;
this.baseUrl = optionsArg.baseUrl || 'http://localhost:11434';
this.model = optionsArg.model || 'llama2';
this.visionModel = optionsArg.visionModel || 'llava';
}
async start() {
@@ -167,4 +170,83 @@ export class OllamaProvider extends MultiModalModel {
public async audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream> {
throw new Error('Audio generation is not supported by Ollama.');
}
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
const base64Image = optionsArg.image.toString('base64');
const response = await fetch(`${this.baseUrl}/api/chat`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: this.visionModel,
messages: [{
role: 'user',
content: optionsArg.prompt,
images: [base64Image]
}],
stream: false
}),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.statusText}`);
}
const result = await response.json();
return result.message.content;
}
public async document(optionsArg: {
systemMessage: string;
userMessage: string;
pdfDocuments: Uint8Array[];
messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
// Convert PDF documents to images using SmartPDF
const smartpdfInstance = new plugins.smartpdf.SmartPdf();
let documentImageBytesArray: Uint8Array[] = [];
for (const pdfDocument of optionsArg.pdfDocuments) {
const documentImageArray = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
documentImageBytesArray = documentImageBytesArray.concat(documentImageArray);
}
// Convert images to base64
const base64Images = documentImageBytesArray.map(bytes => Buffer.from(bytes).toString('base64'));
// Send request to Ollama with images
const response = await fetch(`${this.baseUrl}/api/chat`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: this.visionModel,
messages: [
{ role: 'system', content: optionsArg.systemMessage },
...optionsArg.messageHistory,
{
role: 'user',
content: optionsArg.userMessage,
images: base64Images
}
],
stream: false
}),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.statusText}`);
}
const result = await response.json();
return {
message: {
role: 'assistant',
content: result.message.content
}
};
}
}

View File

@@ -192,4 +192,27 @@ export class OpenAiProvider extends MultiModalModel {
message: result.choices[0].message,
};
}
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
const result = await this.openAiApiClient.chat.completions.create({
model: 'gpt-4-vision-preview',
messages: [
{
role: 'user',
content: [
{ type: 'text', text: optionsArg.prompt },
{
type: 'image_url',
image_url: {
url: `data:image/jpeg;base64,${optionsArg.image.toString('base64')}`
}
}
]
}
],
max_tokens: 300
});
return result.choices[0].message.content || '';
}
}

View File

@@ -155,4 +155,17 @@ export class PerplexityProvider extends MultiModalModel {
public async audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream> {
throw new Error('Audio generation is not supported by Perplexity.');
}
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
throw new Error('Vision tasks are not supported by Perplexity.');
}
public async document(optionsArg: {
systemMessage: string;
userMessage: string;
pdfDocuments: Uint8Array[];
messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
throw new Error('Document processing is not supported by Perplexity.');
}
}