feat(providers): Add vision and document processing capabilities to providers
This commit is contained in:
parent
e82c510094
commit
eda8ce36df
@ -1,5 +1,13 @@
|
||||
# Changelog
|
||||
|
||||
## 2025-02-03 - 0.1.0 - feat(providers)
|
||||
Add vision and document processing capabilities to providers
|
||||
|
||||
- OpenAI and Ollama providers now support vision tasks, using the `gpt-4-vision-preview` and `llava` (default, configurable) models respectively.
|
||||
- Document processing has been implemented for OpenAI and Ollama providers, converting PDFs to images for analysis.
|
||||
- Introduced abstract methods for vision and document processing in the MultiModalModel class.
|
||||
- Updated the readme file with examples for vision and document processing.
|
||||
|
||||
## 2025-02-03 - 0.0.19 - fix(core)
|
||||
Enhanced chat streaming and error handling across providers
|
||||
|
||||
|
43
readme.md
43
readme.md
@ -17,8 +17,8 @@ This command installs the package and adds it to your project's dependencies.
|
||||
@push.rocks/smartai supports multiple AI providers, each with its own unique capabilities:
|
||||
|
||||
### OpenAI
|
||||
- Models: GPT-4, GPT-3.5-turbo
|
||||
- Features: Chat, Streaming, Audio Generation
|
||||
- Models: GPT-4, GPT-3.5-turbo, GPT-4-vision-preview
|
||||
- Features: Chat, Streaming, Audio Generation, Vision, Document Processing
|
||||
- Configuration:
|
||||
```typescript
|
||||
openaiToken: 'your-openai-token'
|
||||
@ -49,12 +49,13 @@ This command installs the package and adds it to your project's dependencies.
|
||||
```
|
||||
|
||||
### Ollama
|
||||
- Models: Configurable (default: llama2)
|
||||
- Features: Chat, Streaming
|
||||
- Models: Configurable (default: llama2, llava for vision/documents)
|
||||
- Features: Chat, Streaming, Vision, Document Processing
|
||||
- Configuration:
|
||||
```typescript
|
||||
baseUrl: 'http://localhost:11434' // Optional
|
||||
model: 'llama2' // Optional
|
||||
visionModel: 'llava' // Optional, for vision and document tasks
|
||||
```
|
||||
|
||||
## Usage
|
||||
@ -147,15 +148,47 @@ const audioStream = await smartAi.openaiProvider.audio({
|
||||
|
||||
### Document Processing
|
||||
|
||||
For providers that support document processing (currently OpenAI):
|
||||
For providers that support document processing (OpenAI and Ollama):
|
||||
|
||||
```typescript
|
||||
// Using OpenAI
|
||||
const result = await smartAi.openaiProvider.document({
|
||||
systemMessage: 'Classify the document type',
|
||||
userMessage: 'What type of document is this?',
|
||||
messageHistory: [],
|
||||
pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
|
||||
});
|
||||
|
||||
// Using Ollama with llava
|
||||
const analysis = await smartAi.ollamaProvider.document({
|
||||
systemMessage: 'You are a document analysis assistant',
|
||||
userMessage: 'Extract the key information from this document',
|
||||
messageHistory: [],
|
||||
pdfDocuments: [pdfBuffer] // Uint8Array of PDF content
|
||||
});
|
||||
```
|
||||
|
||||
Both providers will:
|
||||
1. Convert PDF documents to images
|
||||
2. Process each page using their vision models
|
||||
3. Return a comprehensive analysis based on the system message and user query
|
||||
|
||||
### Vision Processing
|
||||
|
||||
For providers that support vision tasks (OpenAI and Ollama):
|
||||
|
||||
```typescript
|
||||
// Using OpenAI's GPT-4 Vision
|
||||
const description = await smartAi.openaiProvider.vision({
|
||||
image: imageBuffer, // Buffer containing the image data
|
||||
prompt: 'What do you see in this image?'
|
||||
});
|
||||
|
||||
// Using Ollama's Llava model
|
||||
const analysis = await smartAi.ollamaProvider.vision({
|
||||
image: imageBuffer,
|
||||
prompt: 'Analyze this image in detail'
|
||||
});
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
@ -3,6 +3,6 @@
|
||||
*/
|
||||
// Package identity and version metadata exported for consumers of this module.
// The object previously listed `version` twice; only the last entry ('0.1.0')
// ever took effect, so the shadowed '0.0.19' entry has been dropped.
export const commitinfo = {
  name: '@push.rocks/smartai',
  version: '0.1.0',
  description: 'A TypeScript library for integrating and interacting with multiple AI models, offering capabilities for chat and potentially audio responses.'
}
|
||||
|
@ -62,4 +62,25 @@ export abstract class MultiModalModel {
|
||||
* @throws Error if the provider doesn't support audio generation
|
||||
*/
|
||||
public abstract audio(optionsArg: { message: string }): Promise<NodeJS.ReadableStream>;
|
||||
|
||||
/**
 * Vision-language processing: analyze one image according to a text prompt.
 *
 * @param optionsArg Carries the `image` bytes to inspect and the `prompt`
 *   that tells the model what to do with them
 * @returns Promise resolving to the model's textual description or analysis
 * @throws Error if the provider doesn't support vision tasks
 */
public abstract vision(optionsArg: {
  image: Buffer;
  prompt: string;
}): Promise<string>;
|
||||
|
||||
/**
 * Document analysis and processing.
 *
 * @param optionsArg Bundle of inputs for the request:
 *   - `systemMessage`: system-level instruction for the model
 *   - `userMessage`: the user's question about the documents
 *   - `pdfDocuments`: one `Uint8Array` of PDF content per document
 *   - `messageHistory`: earlier chat messages to include
 * @returns Promise resolving to an object whose `message` holds the model's
 *   analysis of the documents
 * @throws Error if the provider doesn't support document processing
 */
public abstract document(
  optionsArg: {
    systemMessage: string;
    userMessage: string;
    pdfDocuments: Uint8Array[];
    messageHistory: ChatMessage[];
  },
): Promise<{ message: any }>;
|
||||
}
|
||||
|
@ -130,4 +130,17 @@ export class AnthropicProvider extends MultiModalModel {
|
||||
// Anthropic does not provide an audio API, so this method is not implemented.
|
||||
throw new Error('Audio generation is not yet supported by Anthropic.');
|
||||
}
|
||||
|
||||
/**
 * Vision stub: this provider has no vision implementation, so the returned
 * promise always rejects.
 *
 * @param optionsArg Image and prompt; not used
 * @throws Error always
 */
public async vision(optionsArg: {
  image: Buffer;
  prompt: string;
}): Promise<string> {
  const unsupported = new Error('Vision tasks are not yet supported by Anthropic.');
  throw unsupported;
}
|
||||
|
||||
/**
 * Document-processing stub: this provider has no document implementation,
 * so the returned promise always rejects.
 *
 * @param optionsArg Messages, PDF documents and history; not used
 * @throws Error always
 */
public async document(
  optionsArg: {
    systemMessage: string;
    userMessage: string;
    pdfDocuments: Uint8Array[];
    messageHistory: ChatMessage[];
  },
): Promise<{ message: any }> {
  const unsupported = new Error('Document processing is not yet supported by Anthropic.');
  throw unsupported;
}
|
||||
}
|
@ -176,4 +176,17 @@ export class GroqProvider extends MultiModalModel {
|
||||
// Groq does not provide an audio API, so this method is not implemented.
|
||||
throw new Error('Audio generation is not yet supported by Groq.');
|
||||
}
|
||||
|
||||
/**
 * Vision stub: this provider has no vision implementation, so the returned
 * promise always rejects.
 *
 * @param optionsArg Image and prompt; not used
 * @throws Error always
 */
public async vision(optionsArg: {
  image: Buffer;
  prompt: string;
}): Promise<string> {
  const unsupported = new Error('Vision tasks are not yet supported by Groq.');
  throw unsupported;
}
|
||||
|
||||
/**
 * Document-processing stub: this provider has no document implementation,
 * so the returned promise always rejects.
 *
 * @param optionsArg Messages, PDF documents and history; not used
 * @throws Error always
 */
public async document(
  optionsArg: {
    systemMessage: string;
    userMessage: string;
    pdfDocuments: Uint8Array[];
    messageHistory: ChatMessage[];
  },
): Promise<{ message: any }> {
  const unsupported = new Error('Document processing is not yet supported by Groq.');
  throw unsupported;
}
|
||||
}
|
@ -6,18 +6,21 @@ import type { ChatOptions, ChatResponse, ChatMessage } from './abstract.classes.
|
||||
/**
 * Settings accepted when constructing the Ollama provider.
 * Every field is optional.
 */
export interface IOllamaProviderOptions {
  /** Base URL of the Ollama server that requests are sent to. */
  baseUrl?: string;

  /** Name of the model to use. */
  model?: string;

  /** Model to use for vision tasks (e.g. 'llava'). */
  visionModel?: string;
}
|
||||
|
||||
export class OllamaProvider extends MultiModalModel {
|
||||
private options: IOllamaProviderOptions;
|
||||
private baseUrl: string;
|
||||
private model: string;
|
||||
private visionModel: string;
|
||||
|
||||
/**
 * Stores the supplied options and resolves the effective settings.
 *
 * Any option that is missing or otherwise falsy (including an empty string)
 * falls back to its default: `http://localhost:11434` for the base URL,
 * `llama2` for the model and `llava` for the vision model.
 *
 * @param optionsArg Provider options; defaults to an empty object
 */
constructor(optionsArg: IOllamaProviderOptions = {}) {
  super();
  const { baseUrl, model, visionModel } = optionsArg;

  this.options = optionsArg;
  this.baseUrl = baseUrl || 'http://localhost:11434';
  this.model = model || 'llama2';
  this.visionModel = visionModel || 'llava';
}
|
||||
|
||||
async start() {
|
||||
@ -167,4 +170,83 @@ export class OllamaProvider extends MultiModalModel {
|
||||
/**
 * Audio stub: this provider has no audio implementation, so the returned
 * promise always rejects.
 *
 * @param optionsArg Message to synthesize; not used
 * @throws Error always
 */
public async audio(optionsArg: {
  message: string;
}): Promise<NodeJS.ReadableStream> {
  const unsupported = new Error('Audio generation is not supported by Ollama.');
  throw unsupported;
}
|
||||
|
||||
/**
 * Analyze one image with the configured vision model.
 *
 * Posts a single non-streaming request to `${baseUrl}/api/chat` containing
 * one user message with the prompt and the base64-encoded image.
 *
 * @param optionsArg `image` holds the raw image bytes, `prompt` the
 *   instruction sent alongside it
 * @returns The text content of the reply message
 * @throws Error if the HTTP response is not OK, or if the reply carries no
 *   string message content
 */
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
  const requestBody = JSON.stringify({
    model: this.visionModel,
    messages: [
      {
        role: 'user',
        content: optionsArg.prompt,
        images: [optionsArg.image.toString('base64')],
      },
    ],
    stream: false,
  });

  const response = await fetch(`${this.baseUrl}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: requestBody,
  });

  if (!response.ok) {
    // statusText alone can be empty or generic, so also surface the status
    // code and whatever body text the server sent. Reading the body is
    // best-effort and must never mask the original failure.
    const detail = await response.text().catch(() => '');
    throw new Error(
      `Ollama API error: ${response.statusText} (HTTP ${response.status})` +
        (detail ? `: ${detail}` : ''),
    );
  }

  // The reply is external JSON: narrow it before trusting its shape.
  const result = (await response.json()) as { message?: { content?: unknown } } | null;
  const content = result?.message?.content;
  if (typeof content !== 'string') {
    throw new Error('Ollama API error: response did not contain message content.');
  }
  return content;
}
|
||||
|
||||
/**
 * Analyze PDF documents with the configured vision model.
 *
 * Each PDF is converted to PNG page images, the images are base64-encoded,
 * and everything is sent in one non-streaming request to
 * `${baseUrl}/api/chat`: the system message first, then the message history,
 * then a user message carrying the question and all page images.
 *
 * @param optionsArg `systemMessage` and `userMessage` are the texts to send,
 *   `pdfDocuments` the PDF contents, `messageHistory` earlier chat messages
 * @returns An object whose `message` is `{ role: 'assistant', content }`
 * @throws Error if the HTTP response is not OK, or if the reply carries no
 *   string message content
 */
public async document(optionsArg: {
  systemMessage: string;
  userMessage: string;
  pdfDocuments: Uint8Array[];
  messageHistory: ChatMessage[];
}): Promise<{ message: any }> {
  // NOTE(review): the SmartPdf instance is used without any explicit
  // start/stop call here — confirm that matches the library's lifecycle.
  const smartpdfInstance = new plugins.smartpdf.SmartPdf();

  // PDFs are converted one after another so page order follows input order.
  let pageImages: Uint8Array[] = [];
  for (const pdfDocument of optionsArg.pdfDocuments) {
    const convertedPages = await smartpdfInstance.convertPDFToPngBytes(pdfDocument);
    pageImages = pageImages.concat(convertedPages);
  }

  const base64Images = pageImages.map((pageBytes) =>
    Buffer.from(pageBytes).toString('base64'),
  );

  const response = await fetch(`${this.baseUrl}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: this.visionModel,
      messages: [
        { role: 'system', content: optionsArg.systemMessage },
        ...optionsArg.messageHistory,
        {
          role: 'user',
          content: optionsArg.userMessage,
          images: base64Images,
        },
      ],
      stream: false,
    }),
  });

  if (!response.ok) {
    // statusText alone can be empty or generic, so also surface the status
    // code and whatever body text the server sent. Reading the body is
    // best-effort and must never mask the original failure.
    const detail = await response.text().catch(() => '');
    throw new Error(
      `Ollama API error: ${response.statusText} (HTTP ${response.status})` +
        (detail ? `: ${detail}` : ''),
    );
  }

  // The reply is external JSON: narrow it before trusting its shape.
  const result = (await response.json()) as { message?: { content?: unknown } } | null;
  const content = result?.message?.content;
  if (typeof content !== 'string') {
    throw new Error('Ollama API error: response did not contain message content.');
  }

  return {
    message: {
      role: 'assistant',
      content,
    },
  };
}
|
||||
}
|
@ -192,4 +192,27 @@ export class OpenAiProvider extends MultiModalModel {
|
||||
message: result.choices[0].message,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Analyze one image with the `gpt-4-vision-preview` model.
 *
 * The image is embedded as a base64 data URL. Its media type is detected
 * from the leading bytes (PNG, GIF, WebP) and falls back to `image/jpeg`
 * for anything unrecognized, so non-JPEG input is no longer mislabeled.
 *
 * @param optionsArg `image` holds the raw image bytes, `prompt` the
 *   instruction sent alongside it
 * @returns The reply text, or an empty string when the reply has no content
 */
public async vision(optionsArg: { image: Buffer; prompt: string }): Promise<string> {
  const { image, prompt } = optionsArg;

  // True when the bytes at `offset` equal `signature`; reads past the end of
  // the buffer yield undefined and therefore never match.
  const hasSignature = (signature: readonly number[], offset = 0): boolean =>
    signature.every((byte, index) => image[offset + index] === byte);

  let mimeType = 'image/jpeg';
  if (hasSignature([0x89, 0x50, 0x4e, 0x47])) {
    mimeType = 'image/png'; // "\x89PNG"
  } else if (hasSignature([0x47, 0x49, 0x46, 0x38])) {
    mimeType = 'image/gif'; // "GIF8"
  } else if (hasSignature([0x52, 0x49, 0x46, 0x46]) && hasSignature([0x57, 0x45, 0x42, 0x50], 8)) {
    mimeType = 'image/webp'; // "RIFF" .... "WEBP"
  }

  const result = await this.openAiApiClient.chat.completions.create({
    model: 'gpt-4-vision-preview',
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          {
            type: 'image_url',
            image_url: {
              url: `data:${mimeType};base64,${image.toString('base64')}`,
            },
          },
        ],
      },
    ],
    max_tokens: 300,
  });

  // Guard against an empty `choices` array instead of throwing a TypeError.
  return result.choices[0]?.message?.content || '';
}
|
||||
}
|
||||
|
@ -155,4 +155,17 @@ export class PerplexityProvider extends MultiModalModel {
|
||||
/**
 * Audio stub: this provider has no audio implementation, so the returned
 * promise always rejects.
 *
 * @param optionsArg Message to synthesize; not used
 * @throws Error always
 */
public async audio(optionsArg: {
  message: string;
}): Promise<NodeJS.ReadableStream> {
  const unsupported = new Error('Audio generation is not supported by Perplexity.');
  throw unsupported;
}
|
||||
|
||||
/**
 * Vision stub: this provider has no vision implementation, so the returned
 * promise always rejects.
 *
 * @param optionsArg Image and prompt; not used
 * @throws Error always
 */
public async vision(optionsArg: {
  image: Buffer;
  prompt: string;
}): Promise<string> {
  const unsupported = new Error('Vision tasks are not supported by Perplexity.');
  throw unsupported;
}
|
||||
|
||||
/**
 * Document-processing stub: this provider has no document implementation,
 * so the returned promise always rejects.
 *
 * @param optionsArg Messages, PDF documents and history; not used
 * @throws Error always
 */
public async document(
  optionsArg: {
    systemMessage: string;
    userMessage: string;
    pdfDocuments: Uint8Array[];
    messageHistory: ChatMessage[];
  },
): Promise<{ message: any }> {
  const unsupported = new Error('Document processing is not supported by Perplexity.');
  throw unsupported;
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user