feat(provider.anthropic): Add support for vision and document processing in Anthropic provider

Changelog:

@@ -1,5 +1,12 @@
 # Changelog
 
+## 2025-02-03 - 0.2.0 - feat(provider.anthropic)
+Add support for vision and document processing in Anthropic provider
+
+- Implemented vision tasks for Anthropic provider using Claude-3-opus-20240229 model.
+- Implemented document processing for Anthropic provider, supporting conversion of PDF documents to images and analysis with Claude-3-opus-20240229 model.
+- Updated documentation to reflect the new capabilities of the Anthropic provider.
+
 ## 2025-02-03 - 0.1.0 - feat(providers)
 Add vision and document processing capabilities to providers
 
readme.md:

@@ -26,7 +26,7 @@ This command installs the package and adds it to your project's dependencies.
 
 ### Anthropic
 - Models: Claude-3-opus-20240229
-- Features: Chat, Streaming
+- Features: Chat, Streaming, Vision, Document Processing
 - Configuration:
   ```typescript
   anthropicToken: 'your-anthropic-token'
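For orientation, the configuration snippet above only shows the provider-specific key. A minimal sketch of wiring it up, assuming the package exports a `SmartAi` entry class whose constructor accepts `anthropicToken` as the readme's other examples imply:

```typescript
// Illustrative only: the SmartAi class name and constructor shape are assumed,
// only the anthropicToken key comes from the configuration snippet above.
import { SmartAi } from '@push.rocks/smartai';

const smartAi = new SmartAi({
  anthropicToken: 'your-anthropic-token',
});

// After setup, the Anthropic provider is reachable as smartAi.anthropicProvider,
// as used in the document and vision examples further down.
```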
@@ -148,7 +148,7 @@ const audioStream = await smartAi.openaiProvider.audio({
 
 ### Document Processing
 
-For providers that support document processing (OpenAI and Ollama):
+For providers that support document processing (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI

@@ -166,6 +166,14 @@ const analysis = await smartAi.ollamaProvider.document({
   messageHistory: [],
   pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
 });
+
+// Using Anthropic with Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.document({
+  systemMessage: 'You are a document analysis assistant',
+  userMessage: 'Please analyze this document and extract key information',
+  messageHistory: [],
+  pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
+});
 ```
 
 Both providers will:
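In the snippets above, `pdfBuffer` stands for the raw bytes of a PDF as a `Uint8Array`. A minimal way to obtain it in Node.js, assuming the document sits on disk (the file name is illustrative):

```typescript
import { promises as fs } from 'fs';

// fs.readFile returns a Buffer, which is a Uint8Array subclass,
// so it can be passed directly as an entry of pdfDocuments.
const pdfBuffer: Uint8Array = await fs.readFile('./document.pdf');
```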
@@ -175,7 +183,7 @@ Both providers will:
 
 ### Vision Processing
 
-For providers that support vision tasks (OpenAI and Ollama):
+For providers that support vision tasks (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI's GPT-4 Vision

@@ -189,6 +197,12 @@ const analysis = await smartAi.ollamaProvider.vision({
   image: imageBuffer,
   prompt: 'Analyze this image in detail'
 });
+
+// Using Anthropic's Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.vision({
+  image: imageBuffer,
+  prompt: 'Please analyze this image and describe what you see'
+});
 ```
 
 ## Error Handling
Commit info (version bump):

@@ -3,6 +3,6 @@
  */
 export const commitinfo = {
   name: '@push.rocks/smartai',
-  version: '0.1.0',
+  version: '0.2.0',
   description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
 }
Anthropic provider implementation:

@@ -2,6 +2,9 @@ import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
 import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.multimodal.js';
+import type { ImageBlockParam, TextBlockParam } from '@anthropic-ai/sdk/resources/messages';
+
+type ContentBlock = ImageBlockParam | TextBlockParam;
 
 export interface IAnthropicProviderOptions {
   anthropicToken: string;
@@ -132,7 +135,40 @@ export class AnthropicProvider extends MultiModalModel {
   }
 
   public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
-    throw new Error('Vision tasks are not yet supported by Anthropic.');
+    const base64Image = optionsArg.image.toString('base64');
+
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.prompt
+      },
+      {
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/jpeg',
+          data: base64Image
+        }
+      }
+    ];
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      messages: [{
+        role: 'user',
+        content
+      }],
+      max_tokens: 1024
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+    return message;
   }
 
   public async document(optionsArg: {
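One detail worth noting in the hunk above: the image block is always sent with `media_type: 'image/jpeg'`, so callers are expected to pass JPEG data. A hypothetical helper, not part of this commit, that picks the media type from the buffer's magic bytes could look like this:

```typescript
// Illustrative sketch only: detect a media type from common magic bytes.
// PNG starts with 0x89 'P' 'N' 'G'; anything else falls back to JPEG,
// matching the assumption hard-coded in the vision() implementation above.
function detectImageMediaType(image: Buffer): 'image/png' | 'image/jpeg' {
  if (
    image.length >= 4 &&
    image[0] === 0x89 &&
    image[1] === 0x50 &&
    image[2] === 0x4e &&
    image[3] === 0x47
  ) {
    return 'image/png';
  }
  return 'image/jpeg';
}
```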
@@ -141,6 +177,64 @@ export class AnthropicProvider extends MultiModalModel {
     pdfDocuments: Uint8Array[];
     messageHistory: ChatMessage[];
   }): Promise<{ message: any }> {
-    throw new Error('Document processing is not yet supported by Anthropic.');
+    // Convert PDF documents to images using SmartPDF
+    const smartpdfInstance = new plugins.smartpdf.SmartPdf();
+    let documentImageBytesArray: Uint8Array[] = [];
+
+    for (const pdfDocument of optionsArg.pdfDocuments) {
+      const documentImageArray = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
+      documentImageBytesArray = documentImageBytesArray.concat(documentImageArray);
+    }
+
+    // Convert message history to Anthropic format
+    const messages = optionsArg.messageHistory.map(msg => ({
+      role: msg.role === 'assistant' ? 'assistant' as const : 'user' as const,
+      content: msg.content
+    }));
+
+    // Create content array with text and images
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.userMessage
+      }
+    ];
+
+    // Add each document page as an image
+    for (const imageBytes of documentImageBytesArray) {
+      content.push({
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/jpeg',
+          data: Buffer.from(imageBytes).toString('base64')
+        }
+      });
+    }
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      system: optionsArg.systemMessage,
+      messages: [
+        ...messages,
+        { role: 'user', content }
+      ],
+      max_tokens: 4096
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+
+    return {
+      message: {
+        role: 'assistant',
+        content: message
+      }
+    };
   }
 }
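As the new `document()` implementation shows, the provider resolves to `{ message: { role: 'assistant', content: string } }`, so callers read the generated text from `message.content`. A small consumption sketch, assuming a configured `smartAi` instance as in the readme examples (file path and prompts are placeholders):

```typescript
import { promises as fs } from 'fs';

const { message } = await smartAi.anthropicProvider.document({
  systemMessage: 'You are a document analysis assistant',
  userMessage: 'Extract the key information from this document',
  messageHistory: [],
  pdfDocuments: [await fs.readFile('./contract.pdf')],
});

// The assembled assistant text lives in message.content.
console.log(message.content);
```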