From ad5dd4799b6752f14d5bc938d54f0cbb604cd651 Mon Sep 17 00:00:00 2001
From: Philipp Kunz
Date: Mon, 3 Feb 2025 17:48:36 +0100
Subject: [PATCH] feat(provider.anthropic): Add support for vision and
 document processing in Anthropic provider

---
 changelog.md             |  7 +++
 readme.md                | 22 ++++++--
 ts/00_commitinfo_data.ts |  2 +-
 ts/provider.anthropic.ts | 98 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/changelog.md b/changelog.md
index 313b534..2298235 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## 2025-02-03 - 0.2.0 - feat(provider.anthropic)
+Add support for vision and document processing in Anthropic provider
+
+- Implemented vision tasks for the Anthropic provider using the Claude-3-opus-20240229 model.
+- Implemented document processing for the Anthropic provider, converting PDF documents to images and analyzing them with the Claude-3-opus-20240229 model.
+- Updated documentation to reflect the new capabilities of the Anthropic provider.
+
 ## 2025-02-03 - 0.1.0 - feat(providers)
 Add vision and document processing capabilities to providers
 
diff --git a/readme.md b/readme.md
index 472e030..6e29a67 100644
--- a/readme.md
+++ b/readme.md
@@ -26,7 +26,7 @@ This command installs the package and adds it to your project's dependencies.
 ### Anthropic
 - Models: Claude-3-opus-20240229
-- Features: Chat, Streaming
+- Features: Chat, Streaming, Vision, Document Processing
 - Configuration:
 ```typescript
 anthropicToken: 'your-anthropic-token'
 ```
@@ -148,7 +148,7 @@ const audioStream = await smartAi.openaiProvider.audio({
 
 ### Document Processing
 
-For providers that support document processing (OpenAI and Ollama):
+For providers that support document processing (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI
@@ -166,6 +166,14 @@ const analysis = await smartAi.ollamaProvider.document({
   messageHistory: [],
   pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
 });
+
+// Using Anthropic with Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.document({
+  systemMessage: 'You are a document analysis assistant',
+  userMessage: 'Please analyze this document and extract key information',
+  messageHistory: [],
+  pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
+});
 ```
 
-Both providers will:
+All three providers will:
@@ -175,7 +183,7 @@
 
 ### Vision Processing
 
-For providers that support vision tasks (OpenAI and Ollama):
+For providers that support vision tasks (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI's GPT-4 Vision
@@ -189,6 +197,12 @@ const analysis = await smartAi.ollamaProvider.vision({
   image: imageBuffer,
   prompt: 'Analyze this image in detail'
 });
+
+// Using Anthropic's Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.vision({
+  image: imageBuffer,
+  prompt: 'Please analyze this image and describe what you see'
+});
 ```
 
 ## Error Handling
diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts
index 6db5558..8650af2 100644
--- a/ts/00_commitinfo_data.ts
+++ b/ts/00_commitinfo_data.ts
@@ -3,6 +3,6 @@
  */
 export const commitinfo = {
   name: '@push.rocks/smartai',
-  version: '0.1.0',
+  version: '0.2.0',
   description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
 }
diff --git a/ts/provider.anthropic.ts b/ts/provider.anthropic.ts
index 6c5ccf4..5a364f3 100644
--- a/ts/provider.anthropic.ts
+++ b/ts/provider.anthropic.ts
@@ -2,6 +2,9 @@ import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
 import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.multimodal.js';
+import type { ImageBlockParam, TextBlockParam } from '@anthropic-ai/sdk/resources/messages';
+
+type ContentBlock = ImageBlockParam | TextBlockParam;
 
 export interface IAnthropicProviderOptions {
   anthropicToken: string;
@@ -132,7 +135,40 @@ export class AnthropicProvider extends MultiModalModel {
   }
 
   public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
-    throw new Error('Vision tasks are not yet supported by Anthropic.');
+    const base64Image = optionsArg.image.toString('base64');
+
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.prompt
+      },
+      {
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/jpeg', // assumes a JPEG buffer; see the note after the diff
+          data: base64Image
+        }
+      }
+    ];
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      messages: [{
+        role: 'user',
+        content
+      }],
+      max_tokens: 1024
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+    return message;
   }
 
   public async document(optionsArg: {
@@ -141,6 +177,64 @@
     systemMessage: string;
     userMessage: string;
     pdfDocuments: Uint8Array[];
     messageHistory: ChatMessage[];
   }): Promise<{ message: any }> {
-    throw new Error('Document processing is not yet supported by Anthropic.');
+    // Convert PDF documents to images using SmartPDF
+    const smartpdfInstance = new plugins.smartpdf.SmartPdf();
+    let documentImageBytesArray: Uint8Array[] = [];
+
+    for (const pdfDocument of optionsArg.pdfDocuments) {
+      const documentImageArray = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
+      documentImageBytesArray = documentImageBytesArray.concat(documentImageArray);
+    }
+
+    // Convert message history to Anthropic format
+    const messages = optionsArg.messageHistory.map(msg => ({
+      role: msg.role === 'assistant' ? 'assistant' as const : 'user' as const,
+      content: msg.content
+    }));
+
+    // Create content array with text and images
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.userMessage
+      }
+    ];
+
+    // Add each document page as an image
+    for (const imageBytes of documentImageBytesArray) {
+      content.push({
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/png', // pages come from convertPDFToPngBytes, so they are PNGs
+          data: Buffer.from(imageBytes).toString('base64')
+        }
+      });
+    }
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      system: optionsArg.systemMessage,
+      messages: [
+        ...messages,
+        { role: 'user', content }
+      ],
+      max_tokens: 4096
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+
+    return {
+      message: {
+        role: 'assistant',
+        content: message
+      }
+    };
   }
 }
\ No newline at end of file
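
Note on the hardcoded `media_type` in `vision()`: the patch labels every incoming buffer as `image/jpeg`, so PNG, GIF, or WebP input would be sent with a mismatched content type. A follow-up could sniff the type from the buffer's magic bytes before building the image block. The sketch below is illustrative only and not part of this diff; the helper name `detectImageMediaType` is hypothetical:

```typescript
// Hypothetical helper (not part of this patch): choose one of the four image
// media types Anthropic's Messages API accepts (JPEG, PNG, GIF, WebP) by
// inspecting the buffer's magic bytes, falling back to the JPEG assumption.
function detectImageMediaType(
  image: Buffer
): 'image/jpeg' | 'image/png' | 'image/gif' | 'image/webp' {
  // PNG signature: 0x89 'P' 'N' 'G'
  if (image.length >= 4 && image[0] === 0x89 && image[1] === 0x50 && image[2] === 0x4e && image[3] === 0x47) {
    return 'image/png';
  }
  // JPEG start-of-image marker: FF D8 FF
  if (image.length >= 3 && image[0] === 0xff && image[1] === 0xd8 && image[2] === 0xff) {
    return 'image/jpeg';
  }
  // GIF87a and GIF89a both begin with the ASCII bytes "GIF8"
  if (image.length >= 4 && image.toString('ascii', 0, 4) === 'GIF8') {
    return 'image/gif';
  }
  // WebP: RIFF container whose form type at offset 8 is "WEBP"
  if (image.length >= 12 && image.toString('ascii', 0, 4) === 'RIFF' && image.toString('ascii', 8, 12) === 'WEBP') {
    return 'image/webp';
  }
  return 'image/jpeg';
}
```

With such a helper, `media_type: detectImageMediaType(optionsArg.image)` would replace the hardcoded `'image/jpeg'` in `vision()`; `document()` can keep a fixed `'image/png'` because its pages always come from `convertPDFToPngBytes`.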