feat(provider.anthropic): Add support for vision and document processing in Anthropic provider
This commit is contained in:
parent 1c49af74ac
commit ad5dd4799b
@@ -1,5 +1,12 @@
 # Changelog
 
+## 2025-02-03 - 0.2.0 - feat(provider.anthropic)
+Add support for vision and document processing in Anthropic provider
+
+- Implemented vision tasks for Anthropic provider using Claude-3-opus-20240229 model.
+- Implemented document processing for Anthropic provider, supporting conversion of PDF documents to images and analysis with Claude-3-opus-20240229 model.
+- Updated documentation to reflect the new capabilities of the Anthropic provider.
+
 ## 2025-02-03 - 0.1.0 - feat(providers)
 Add vision and document processing capabilities to providers
 
readme.md
@@ -26,7 +26,7 @@ This command installs the package and adds it to your project's dependencies.
 
 ### Anthropic
 - Models: Claude-3-opus-20240229
-- Features: Chat, Streaming
+- Features: Chat, Streaming, Vision, Document Processing
 - Configuration:
 ```typescript
 anthropicToken: 'your-anthropic-token'
@@ -148,7 +148,7 @@ const audioStream = await smartAi.openaiProvider.audio({
 
 ### Document Processing
 
-For providers that support document processing (OpenAI and Ollama):
+For providers that support document processing (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI
@@ -166,6 +166,14 @@ const analysis = await smartAi.ollamaProvider.document({
   messageHistory: [],
   pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
 });
+
+// Using Anthropic with Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.document({
+  systemMessage: 'You are a document analysis assistant',
+  userMessage: 'Please analyze this document and extract key information',
+  messageHistory: [],
+  pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
+});
 ```
 
 Both providers will:
@@ -175,7 +183,7 @@ Both providers will:
 
 ### Vision Processing
 
-For providers that support vision tasks (OpenAI and Ollama):
+For providers that support vision tasks (OpenAI, Ollama, and Anthropic):
 
 ```typescript
 // Using OpenAI's GPT-4 Vision
@@ -189,6 +197,12 @@ const analysis = await smartAi.ollamaProvider.vision({
   image: imageBuffer,
   prompt: 'Analyze this image in detail'
 });
+
+// Using Anthropic's Claude 3
+const anthropicAnalysis = await smartAi.anthropicProvider.vision({
+  image: imageBuffer,
+  prompt: 'Please analyze this image and describe what you see'
+});
 ```
 
 ## Error Handling
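The readme hunks above introduce the Anthropic configuration and the new vision call in separate fragments. A minimal sketch of how they might fit together end to end; the `SmartAi` import, constructor options shape, and `start()` call are assumptions not shown in this diff, while `anthropicToken` and `anthropicProvider.vision()` come from the changes themselves:

```typescript
import { promises as fs } from 'fs';
// Package name taken from the commitinfo below; the exported class name is assumed.
import { SmartAi } from '@push.rocks/smartai';

// Assumed setup: only the anthropicToken option appears in this diff.
const smartAi = new SmartAi({
  anthropicToken: 'your-anthropic-token',
});
await smartAi.start(); // assumed lifecycle call

// vision() takes an image Buffer and a prompt and resolves to the model's text answer.
const imageBuffer = await fs.readFile('./photo.jpg');
const description = await smartAi.anthropicProvider.vision({
  image: imageBuffer,
  prompt: 'Please analyze this image and describe what you see',
});
console.log(description);
```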
@@ -3,6 +3,6 @@
  */
 export const commitinfo = {
   name: '@push.rocks/smartai',
-  version: '0.1.0',
+  version: '0.2.0',
   description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
 }
@@ -2,6 +2,9 @@ import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
 import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.multimodal.js';
+import type { ImageBlockParam, TextBlockParam } from '@anthropic-ai/sdk/resources/messages';
+
+type ContentBlock = ImageBlockParam | TextBlockParam;
 
 export interface IAnthropicProviderOptions {
   anthropicToken: string;
@@ -132,7 +135,40 @@ export class AnthropicProvider extends MultiModalModel {
   }
 
   public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
-    throw new Error('Vision tasks are not yet supported by Anthropic.');
+    const base64Image = optionsArg.image.toString('base64');
+
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.prompt
+      },
+      {
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/jpeg',
+          data: base64Image
+        }
+      }
+    ];
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      messages: [{
+        role: 'user',
+        content
+      }],
+      max_tokens: 1024
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+    return message;
   }
 
   public async document(optionsArg: {
@@ -141,6 +177,64 @@ export class AnthropicProvider extends MultiModalModel {
     pdfDocuments: Uint8Array[];
     messageHistory: ChatMessage[];
   }): Promise<{ message: any }> {
-    throw new Error('Document processing is not yet supported by Anthropic.');
+    // Convert PDF documents to images using SmartPDF
+    const smartpdfInstance = new plugins.smartpdf.SmartPdf();
+    let documentImageBytesArray: Uint8Array[] = [];
+
+    for (const pdfDocument of optionsArg.pdfDocuments) {
+      const documentImageArray = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
+      documentImageBytesArray = documentImageBytesArray.concat(documentImageArray);
+    }
+
+    // Convert message history to Anthropic format
+    const messages = optionsArg.messageHistory.map(msg => ({
+      role: msg.role === 'assistant' ? 'assistant' as const : 'user' as const,
+      content: msg.content
+    }));
+
+    // Create content array with text and images
+    const content: ContentBlock[] = [
+      {
+        type: 'text',
+        text: optionsArg.userMessage
+      }
+    ];
+
+    // Add each document page as an image
+    for (const imageBytes of documentImageBytesArray) {
+      content.push({
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/jpeg',
+          data: Buffer.from(imageBytes).toString('base64')
+        }
+      });
+    }
+
+    const result = await this.anthropicApiClient.messages.create({
+      model: 'claude-3-opus-20240229',
+      system: optionsArg.systemMessage,
+      messages: [
+        ...messages,
+        { role: 'user', content }
+      ],
+      max_tokens: 4096
+    });
+
+    // Extract text content from the response
+    let message = '';
+    for (const block of result.content) {
+      if ('text' in block) {
+        message += block.text;
+      }
+    }
+
+    return {
+      message: {
+        role: 'assistant',
+        content: message
+      }
+    };
   }
 }
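A companion sketch for the new document() implementation above: PDFs go in as Uint8Array buffers (each page is rendered to an image via SmartPdf and sent to Claude), and the call resolves to `{ message: { role, content } }`. The `SmartAi` setup is the same assumed setup as in the earlier sketch; only the document() call shape comes from this diff:

```typescript
import { promises as fs } from 'fs';
import { SmartAi } from '@push.rocks/smartai'; // class name assumed, package from commitinfo

// Assumed setup, as in the earlier sketch.
const smartAi = new SmartAi({ anthropicToken: 'your-anthropic-token' });
await smartAi.start();

// The readme examples pass PDFs as Uint8Array buffers.
const pdfBuffer = new Uint8Array(await fs.readFile('./report.pdf'));

const analysis = await smartAi.anthropicProvider.document({
  systemMessage: 'You are a document analysis assistant',
  userMessage: 'Please analyze this document and extract key information',
  messageHistory: [],
  pdfDocuments: [pdfBuffer],
});

// document() resolves to { message: { role: 'assistant', content: string } }.
console.log(analysis.message.content);
```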