From ad5dd4799b6752f14d5bc938d54f0cbb604cd651 Mon Sep 17 00:00:00 2001
From: Philipp Kunz
Date: Mon, 3 Feb 2025 17:48:36 +0100
Subject: [PATCH] feat(provider.anthropic): Add support for vision and
 document processing in Anthropic provider

---
 changelog.md             |  7 +++
 readme.md                | 22 ++++++--
 ts/00_commitinfo_data.ts |  2 +-
 ts/provider.anthropic.ts | 98 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/changelog.md b/changelog.md
index 313b534..2298235 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## 2025-02-03 - 0.2.0 - feat(provider.anthropic)
+Add support for vision and document processing in Anthropic provider
+
+- Implemented vision tasks for the Anthropic provider using the Claude-3-opus-20240229 model.
+- Implemented document processing for the Anthropic provider, converting PDF documents to images and analyzing them with the Claude-3-opus-20240229 model.
+- Updated documentation to reflect the new capabilities of the Anthropic provider.
+
 ## 2025-02-03 - 0.1.0 - feat(providers)
 Add vision and document processing capabilities to providers
 
diff --git a/readme.md b/readme.md
index 472e030..6e29a67 100644
--- a/readme.md
+++ b/readme.md
@@ -26,7 +26,7 @@ This command installs the package and adds it to your project's dependencies.
 ### Anthropic
 - Models: Claude-3-opus-20240229
-- Features: Chat, Streaming
+- Features: Chat, Streaming, Vision, Document Processing
 - Configuration:
 ```typescript
 anthropicToken: 'your-anthropic-token'
 ```
@@ -148,7 +148,7 @@ const audioStream = await smartAi.openaiProvider.audio({
 
 ### Document Processing
 
-For providers that support document processing (OpenAI and Ollama):
+For providers that support document processing (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI
@@ -166,6 +166,14 @@ const analysis = await smartAi.ollamaProvider.document({
   messageHistory: [],
   pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
 });
+
+// Using Anthropic with Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.document({
+  systemMessage: 'You are a document analysis assistant',
+  userMessage: 'Please analyze this document and extract key information',
+  messageHistory: [],
+  pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
+});
 ```
 
-Both providers will:
+All three providers will:
@@ -175,7 +183,7 @@
 
 ### Vision Processing
 
-For providers that support vision tasks (OpenAI and Ollama):
+For providers that support vision tasks (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI's GPT-4 Vision
@@ -189,6 +197,12 @@ const analysis = await smartAi.ollamaProvider.vision({
   image: imageBuffer,
   prompt: 'Analyze this image in detail'
 });
+
+// Using Anthropic's Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.vision({
+  image: imageBuffer,
+  prompt: 'Please analyze this image and describe what you see'
+});
 ```
 
 ## Error Handling
diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts
index 6db5558..8650af2 100644
--- a/ts/00_commitinfo_data.ts
+++ b/ts/00_commitinfo_data.ts
@@ -3,6 +3,6 @@
  */
 export const commitinfo = {
   name: '@push.rocks/smartai',
-  version: '0.1.0',
+  version: '0.2.0',
   description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
 }
diff --git a/ts/provider.anthropic.ts b/ts/provider.anthropic.ts
index 6c5ccf4..5a364f3 100644
--- a/ts/provider.anthropic.ts
+++ b/ts/provider.anthropic.ts
@@ -2,6 +2,9 @@ import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
 import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.multimodal.js';
+import type { ImageBlockParam, TextBlockParam } from '@anthropic-ai/sdk/resources/messages';
+
+type ContentBlock = ImageBlockParam | TextBlockParam;
 
 export interface IAnthropicProviderOptions {
   anthropicToken: string;
@@ -132,7 +135,40 @@ export class AnthropicProvider extends MultiModalModel {
   }
 
   public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
-    throw new Error('Vision tasks are not yet supported by Anthropic.');
+    const base64Image = optionsArg.image.toString('base64');
+
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.prompt
+      },
+      {
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/jpeg', // assumes a JPEG buffer; see the note after the diff
+          data: base64Image
+        }
+      }
+    ];
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      messages: [{
+        role: 'user',
+        content
+      }],
+      max_tokens: 1024
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+    return message;
   }
 
   public async document(optionsArg: {
@@ -141,6 +177,64 @@
     systemMessage: string;
     userMessage: string;
     pdfDocuments: Uint8Array[];
     messageHistory: ChatMessage[];
   }): Promise<{ message: any }> {
-    throw new Error('Document processing is not yet supported by Anthropic.');
+    // Convert PDF documents to images using SmartPDF
+    const smartpdfInstance = new plugins.smartpdf.SmartPdf();
+    let documentImageBytesArray: Uint8Array[] = [];
+
+    for (const pdfDocument of optionsArg.pdfDocuments) {
+      const documentImageArray = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
+      documentImageBytesArray = documentImageBytesArray.concat(documentImageArray);
+    }
+
+    // Convert message history to Anthropic format
+    const messages = optionsArg.messageHistory.map(msg => ({
+      role: msg.role === 'assistant' ? 'assistant' as const : 'user' as const,
+      content: msg.content
+    }));
+
+    // Create content array with text and images
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.userMessage
+      }
+    ];
+
+    // Add each document page as an image
+    for (const imageBytes of documentImageBytesArray) {
+      content.push({
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/png', // pages come from convertPDFToPngBytes, so they are PNGs
+          data: Buffer.from(imageBytes).toString('base64')
+        }
+      });
+    }
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      system: optionsArg.systemMessage,
+      messages: [
+        ...messages,
+        { role: 'user', content }
+      ],
+      max_tokens: 4096
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+
+    return {
+      message: {
+        role: 'assistant',
+        content: message
+      }
+    };
   }
 }
\ No newline at end of file
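
Note on the hardcoded `media_type` in `vision()`: the patch labels every incoming buffer as `image/jpeg`, so PNG, GIF, or WebP input would be sent with a mismatched content type. A follow-up could sniff the type from the buffer's magic bytes before building the image block. The sketch below is illustrative only and not part of this diff; the helper name `detectImageMediaType` is hypothetical:

```typescript
// Hypothetical helper (not part of this patch): choose one of the four image
// media types Anthropic's Messages API accepts (JPEG, PNG, GIF, WebP) by
// inspecting the buffer's magic bytes, falling back to the JPEG assumption.
function detectImageMediaType(
  image: Buffer
): 'image/jpeg' | 'image/png' | 'image/gif' | 'image/webp' {
  // PNG signature: 0x89 'P' 'N' 'G'
  if (image.length >= 4 && image[0] === 0x89 && image[1] === 0x50 && image[2] === 0x4e && image[3] === 0x47) {
    return 'image/png';
  }
  // JPEG start-of-image marker: FF D8 FF
  if (image.length >= 3 && image[0] === 0xff && image[1] === 0xd8 && image[2] === 0xff) {
    return 'image/jpeg';
  }
  // GIF87a and GIF89a both begin with the ASCII bytes "GIF8"
  if (image.length >= 4 && image.toString('ascii', 0, 4) === 'GIF8') {
    return 'image/gif';
  }
  // WebP: RIFF container whose form type at offset 8 is "WEBP"
  if (image.length >= 12 && image.toString('ascii', 0, 4) === 'RIFF' && image.toString('ascii', 8, 12) === 'WEBP') {
    return 'image/webp';
  }
  return 'image/jpeg';
}
```

With such a helper, `media_type: detectImageMediaType(optionsArg.image)` would replace the hardcoded `'image/jpeg'` in `vision()`; `document()` can keep a fixed `'image/png'` because its pages always come from `convertPDFToPngBytes`.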