diff --git a/changelog.md b/changelog.md index be9f136..791e8bd 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,18 @@ # Changelog +## 2025-10-03 - 0.7.0 - feat(providers) +Add research API and image generation/editing support; extend providers and tests + +- Introduce ResearchOptions and ResearchResponse to the MultiModalModel interface and implement research() where supported +- OpenAiProvider: implement research(), add imageGenerate() and imageEdit() methods (gpt-image-1 / DALL·E support), and expose imageModel option +- AnthropicProvider: implement research() and vision handling; explicitly throw for unsupported image generation/editing +- PerplexityProvider: implement research() (sonar / sonar-pro support) and expose citation parsing +- Add image/document-related interfaces (ImageGenerateOptions, ImageEditOptions, ImageResponse) to abstract API +- Add image generation/editing/no-op stubs for other providers (Exo, Groq, Ollama, XAI) that throw informative errors to preserve API compatibility +- Add comprehensive OpenAI image generation tests and helper to save test outputs (test/test.image.openai.ts) +- Update README with Research & Web Search documentation, capability matrix, and roadmap entry for Research & Web Search API +- Add local Claude agent permissions file (.claude/settings.local.json) and various provider type/import updates + ## 2025-09-28 - 0.6.1 - fix(provider.anthropic) Fix Anthropic research tool identifier and add tests + local Claude permissions diff --git a/readme.md b/readme.md index 6365971..ea08544 100644 --- a/readme.md +++ b/readme.md @@ -45,15 +45,15 @@ const response = await ai.openaiProvider.chat({ Choose the right provider for your use case: -| Provider | Chat | Streaming | TTS | Vision | Documents | Highlights | -|----------|:----:|:---------:|:---:|:------:|:---------:|------------| -| **OpenAI** | ✅ | ✅ | ✅ | ✅ | ✅ | • GPT-4, DALL-E 3
• Industry standard
• Most features | -| **Anthropic** | ✅ | ✅ | ❌ | ✅ | ✅ | • Claude 3 Opus
• Superior reasoning
• 200k context | -| **Ollama** | ✅ | ✅ | ❌ | ✅ | ✅ | • 100% local
• Privacy-first
• No API costs | -| **XAI** | ✅ | ✅ | ❌ | ❌ | ✅ | • Grok models
• Real-time data
• Uncensored | -| **Perplexity** | ✅ | ✅ | ❌ | ❌ | ❌ | • Web-aware
• Research-focused
• Citations | -| **Groq** | ✅ | ✅ | ❌ | ❌ | ❌ | • 10x faster
• LPU inference
• Low latency | -| **Exo** | ✅ | ✅ | ❌ | ❌ | ❌ | • Distributed
• P2P compute
• Decentralized | +| Provider | Chat | Streaming | TTS | Vision | Documents | Research | Highlights | +|----------|:----:|:---------:|:---:|:------:|:---------:|:--------:|------------| +| **OpenAI** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | • GPT-4, DALL-E 3
• Industry standard
• Deep research API | +| **Anthropic** | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | • Claude 3 Opus
• Superior reasoning
• Web search API | +| **Ollama** | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | • 100% local
• Privacy-first
• No API costs | +| **XAI** | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | • Grok models
• Real-time data
• Uncensored | +| **Perplexity** | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | • Web-aware
• Research-focused
• Sonar Pro models | +| **Groq** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | • 10x faster
• LPU inference
• Low latency | +| **Exo** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | • Distributed
• P2P compute
• Decentralized | ## 🎮 Core Features @@ -171,6 +171,51 @@ const taxAnalysis = await ai.anthropicProvider.document({ }); ``` +### 🔬 Research & Web Search + +Perform deep research with web search capabilities across multiple providers: + +```typescript +// OpenAI Deep Research - Comprehensive analysis +const deepResearch = await ai.openaiProvider.research({ + query: 'What are the latest developments in quantum computing?', + searchDepth: 'deep', + includeWebSearch: true +}); + +console.log(deepResearch.answer); +console.log('Sources:', deepResearch.sources); + +// Anthropic Web Search - Domain-filtered research +const anthropic = new AnthropicProvider({ + anthropicToken: 'sk-ant-...', + enableWebSearch: true, + searchDomainAllowList: ['nature.com', 'science.org'] +}); + +const scientificResearch = await anthropic.research({ + query: 'Latest breakthroughs in CRISPR gene editing', + searchDepth: 'advanced' +}); + +// Perplexity - Research-focused with citations +const perplexityResearch = await ai.perplexityProvider.research({ + query: 'Current state of autonomous vehicle technology', + searchDepth: 'deep' // Uses Sonar Pro model +}); +``` + +**Research Options:** +- `searchDepth`: 'basic' | 'advanced' | 'deep' +- `maxSources`: Number of sources to include +- `includeWebSearch`: Enable web search (OpenAI) +- `background`: Run as background task (OpenAI) + +**Supported Providers:** +- **OpenAI**: Deep Research API with specialized models (`o3-deep-research-2025-06-26`, `o4-mini-deep-research-2025-06-26`) +- **Anthropic**: Web Search API with domain filtering +- **Perplexity**: Sonar and Sonar Pro models with built-in citations + ### 🔄 Persistent Conversations Maintain context across interactions: @@ -447,6 +492,7 @@ export PERPLEXITY_API_KEY=pplx-... | **General Purpose** | OpenAI | Most features, stable, well-documented | | **Complex Reasoning** | Anthropic | Superior logical thinking, safer outputs | | **Research & Facts** | Perplexity | Web-aware, provides citations | +| **Deep Research** | OpenAI | Deep Research API with comprehensive analysis | | **Speed Critical** | Groq | 10x faster inference, sub-second responses | | **Privacy Critical** | Ollama | 100% local, no data leaves your servers | | **Real-time Data** | XAI | Access to current information | @@ -454,6 +500,7 @@ export PERPLEXITY_API_KEY=pplx-... ## 📈 Roadmap +- [x] Research & Web Search API - [ ] Streaming function calls - [ ] Image generation support - [ ] Voice input processing diff --git a/readme.research.md b/readme.research.md deleted file mode 100644 index 996dcd4..0000000 --- a/readme.research.md +++ /dev/null @@ -1,177 +0,0 @@ -# SmartAI Research API Implementation - -This document describes the new research capabilities added to the SmartAI library, enabling web search and deep research features for OpenAI and Anthropic providers. - -## Features Added - -### 1. Research Method Interface - -Added a new `research()` method to the `MultiModalModel` abstract class with the following interfaces: - -```typescript -interface ResearchOptions { - query: string; - searchDepth?: 'basic' | 'advanced' | 'deep'; - maxSources?: number; - includeWebSearch?: boolean; - background?: boolean; -} - -interface ResearchResponse { - answer: string; - sources: Array<{ - url: string; - title: string; - snippet: string; - }>; - searchQueries?: string[]; - metadata?: any; -} -``` - -### 2. 
OpenAI Provider Research Implementation - -The OpenAI provider now supports: -- **Deep Research API** with models: - - `o3-deep-research-2025-06-26` (comprehensive analysis) - - `o4-mini-deep-research-2025-06-26` (lightweight, faster) -- **Web Search** for standard models (gpt-5, o3, o3-pro, o4-mini) -- **Background processing** for async deep research tasks - -### 3. Anthropic Provider Research Implementation - -The Anthropic provider now supports: -- **Web Search API** with Claude models -- **Domain filtering** (allow/block lists) -- **Progressive searches** for comprehensive research -- **Citation extraction** from responses - -### 4. Perplexity Provider Research Implementation - -The Perplexity provider implements research using: -- **Sonar models** for standard searches -- **Sonar Pro** for deep research -- Built-in citation support - -### 5. Other Providers - -Added research method stubs to: -- Groq Provider -- Ollama Provider -- xAI Provider -- Exo Provider - -These providers throw a "not yet supported" error when research is called, maintaining interface compatibility. - -## Usage Examples - -### Basic Research with OpenAI - -```typescript -import { OpenAiProvider } from '@push.rocks/smartai'; - -const openai = new OpenAiProvider({ - openaiToken: 'your-api-key', - researchModel: 'o4-mini-deep-research-2025-06-26' -}); - -await openai.start(); - -const result = await openai.research({ - query: 'What are the latest developments in quantum computing?', - searchDepth: 'basic', - includeWebSearch: true -}); - -console.log(result.answer); -console.log('Sources:', result.sources); -``` - -### Deep Research with OpenAI - -```typescript -const deepResult = await openai.research({ - query: 'Comprehensive analysis of climate change mitigation strategies', - searchDepth: 'deep', - background: true -}); -``` - -### Research with Anthropic - -```typescript -import { AnthropicProvider } from '@push.rocks/smartai'; - -const anthropic = new AnthropicProvider({ - anthropicToken: 'your-api-key', - enableWebSearch: true, - searchDomainAllowList: ['nature.com', 'science.org'] -}); - -await anthropic.start(); - -const result = await anthropic.research({ - query: 'Latest breakthroughs in CRISPR gene editing', - searchDepth: 'advanced' -}); -``` - -### Research with Perplexity - -```typescript -import { PerplexityProvider } from '@push.rocks/smartai'; - -const perplexity = new PerplexityProvider({ - perplexityToken: 'your-api-key' -}); - -const result = await perplexity.research({ - query: 'Current state of autonomous vehicle technology', - searchDepth: 'deep' // Uses Sonar Pro model -}); -``` - -## Configuration Options - -### OpenAI Provider -- `researchModel`: Specify deep research model (default: `o4-mini-deep-research-2025-06-26`) -- `enableWebSearch`: Enable web search for standard models - -### Anthropic Provider -- `enableWebSearch`: Enable web search capabilities -- `searchDomainAllowList`: Array of allowed domains -- `searchDomainBlockList`: Array of blocked domains - -## API Pricing - -- **OpenAI Deep Research**: $10 per 1,000 calls -- **Anthropic Web Search**: $10 per 1,000 searches + standard token costs -- **Perplexity Sonar**: $5 per 1,000 searches (Sonar Pro) - -## Testing - -Run the test suite: - -```bash -pnpm test test/test.research.ts -``` - -All providers have been tested to ensure: -- Research methods are properly exposed -- Interfaces are correctly typed -- Unsupported providers throw appropriate errors - -## Next Steps - -Future enhancements could include: -1. 
Implementing Google Gemini Grounding API support -2. Adding Brave Search API integration -3. Implementing retry logic for rate limits -4. Adding caching for repeated queries -5. Supporting batch research operations - -## Notes - -- The implementation maintains backward compatibility -- All existing methods continue to work unchanged -- Research capabilities are optional and don't affect existing functionality \ No newline at end of file diff --git a/test/test.image.openai.ts b/test/test.image.openai.ts new file mode 100644 index 0000000..3fc8ade --- /dev/null +++ b/test/test.image.openai.ts @@ -0,0 +1,203 @@ +import { expect, tap } from '@push.rocks/tapbundle'; +import * as qenv from '@push.rocks/qenv'; +import * as smartai from '../ts/index.js'; +import * as path from 'path'; +import { promises as fs } from 'fs'; + +const testQenv = new qenv.Qenv('./', './.nogit/'); + +let openaiProvider: smartai.OpenAiProvider; + +// Helper function to save image results +async function saveImageResult(testName: string, result: any) { + const sanitizedName = testName.replace(/[^a-z0-9]/gi, '_').toLowerCase(); + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const filename = `openai_${sanitizedName}_${timestamp}.json`; + const filepath = path.join('.nogit', 'testresults', 'images', filename); + + await fs.mkdir(path.dirname(filepath), { recursive: true }); + await fs.writeFile(filepath, JSON.stringify(result, null, 2), 'utf-8'); + + console.log(` 💾 Saved to: ${filepath}`); + + // Also save the actual image if b64_json is present + if (result.images && result.images[0]?.b64_json) { + const imageFilename = `openai_${sanitizedName}_${timestamp}.png`; + const imageFilepath = path.join('.nogit', 'testresults', 'images', imageFilename); + await fs.writeFile(imageFilepath, Buffer.from(result.images[0].b64_json, 'base64')); + console.log(` 🖼️ Image saved to: ${imageFilepath}`); + } +} + +tap.test('OpenAI Image Generation: should initialize provider', async () => { + const openaiToken = await testQenv.getEnvVarOnDemand('OPENAI_TOKEN'); + expect(openaiToken).toBeTruthy(); + + openaiProvider = new smartai.OpenAiProvider({ + openaiToken, + imageModel: 'gpt-image-1' + }); + + await openaiProvider.start(); + expect(openaiProvider).toBeInstanceOf(smartai.OpenAiProvider); +}); + +tap.test('OpenAI Image: Basic generation with gpt-image-1', async () => { + const result = await openaiProvider.imageGenerate({ + prompt: 'A cute robot reading a book in a cozy library, digital art style', + model: 'gpt-image-1', + quality: 'medium', + size: '1024x1024' + }); + + console.log('Basic gpt-image-1 Generation:'); + console.log('- Images generated:', result.images.length); + console.log('- Model used:', result.metadata?.model); + console.log('- Quality:', result.metadata?.quality); + console.log('- Size:', result.metadata?.size); + console.log('- Tokens used:', result.metadata?.tokensUsed); + + await saveImageResult('basic_generation_gptimage1', result); + + expect(result.images).toBeTruthy(); + expect(result.images.length).toEqual(1); + expect(result.images[0].b64_json).toBeTruthy(); + expect(result.metadata?.model).toEqual('gpt-image-1'); +}); + +tap.test('OpenAI Image: High quality with transparent background', async () => { + const result = await openaiProvider.imageGenerate({ + prompt: 'A simple geometric logo of a mountain peak, minimal design, clean lines', + model: 'gpt-image-1', + quality: 'high', + size: '1024x1024', + background: 'transparent', + outputFormat: 'png' + }); + + console.log('High Quality 
Transparent:'); + console.log('- Quality:', result.metadata?.quality); + console.log('- Background: transparent'); + console.log('- Format:', result.metadata?.outputFormat); + console.log('- Tokens used:', result.metadata?.tokensUsed); + + await saveImageResult('high_quality_transparent', result); + + expect(result.images.length).toEqual(1); + expect(result.images[0].b64_json).toBeTruthy(); +}); + +tap.test('OpenAI Image: WebP format with compression', async () => { + const result = await openaiProvider.imageGenerate({ + prompt: 'A futuristic cityscape at sunset with flying cars, photorealistic', + model: 'gpt-image-1', + quality: 'high', + size: '1536x1024', + outputFormat: 'webp', + outputCompression: 85 + }); + + console.log('WebP with Compression:'); + console.log('- Format:', result.metadata?.outputFormat); + console.log('- Compression: 85%'); + console.log('- Size:', result.metadata?.size); + + await saveImageResult('webp_compression', result); + + expect(result.images.length).toEqual(1); + expect(result.images[0].b64_json).toBeTruthy(); +}); + +tap.test('OpenAI Image: Text rendering with gpt-image-1', async () => { + const result = await openaiProvider.imageGenerate({ + prompt: 'A vintage cafe sign that says "COFFEE & CODE" in elegant hand-lettered typography, warm colors', + model: 'gpt-image-1', + quality: 'high', + size: '1024x1024' + }); + + console.log('Text Rendering:'); + console.log('- Prompt includes text: "COFFEE & CODE"'); + console.log('- gpt-image-1 has superior text rendering'); + console.log('- Tokens used:', result.metadata?.tokensUsed); + + await saveImageResult('text_rendering', result); + + expect(result.images.length).toEqual(1); + expect(result.images[0].b64_json).toBeTruthy(); +}); + +tap.test('OpenAI Image: Multiple images generation', async () => { + const result = await openaiProvider.imageGenerate({ + prompt: 'Abstract colorful geometric patterns, modern minimalist art', + model: 'gpt-image-1', + n: 2, + quality: 'medium', + size: '1024x1024' + }); + + console.log('Multiple Images:'); + console.log('- Images requested: 2'); + console.log('- Images generated:', result.images.length); + + await saveImageResult('multiple_images', result); + + expect(result.images.length).toEqual(2); + expect(result.images[0].b64_json).toBeTruthy(); + expect(result.images[1].b64_json).toBeTruthy(); +}); + +tap.test('OpenAI Image: Low moderation setting', async () => { + const result = await openaiProvider.imageGenerate({ + prompt: 'A fantasy battle scene with warriors and dragons', + model: 'gpt-image-1', + moderation: 'low', + quality: 'medium' + }); + + console.log('Low Moderation:'); + console.log('- Moderation: low (less restrictive filtering)'); + console.log('- Tokens used:', result.metadata?.tokensUsed); + + await saveImageResult('low_moderation', result); + + expect(result.images.length).toEqual(1); + expect(result.images[0].b64_json).toBeTruthy(); +}); + +tap.test('OpenAI Image Editing: edit with gpt-image-1', async () => { + // First, generate a base image + const baseResult = await openaiProvider.imageGenerate({ + prompt: 'A simple white cat sitting on a red cushion', + model: 'gpt-image-1', + quality: 'low', + size: '1024x1024' + }); + + const baseImageBuffer = Buffer.from(baseResult.images[0].b64_json!, 'base64'); + + // Now edit it + const editResult = await openaiProvider.imageEdit({ + image: baseImageBuffer, + prompt: 'Change the cat to orange and add stylish sunglasses', + model: 'gpt-image-1', + quality: 'medium' + }); + + console.log('Image Editing:'); + 
console.log('- Base image created'); + console.log('- Edit: change color and add sunglasses'); + console.log('- Result images:', editResult.images.length); + + await saveImageResult('image_edit', editResult); + + expect(editResult.images.length).toEqual(1); + expect(editResult.images[0].b64_json).toBeTruthy(); +}); + +tap.test('OpenAI Image: should clean up provider', async () => { + await openaiProvider.stop(); + console.log('OpenAI image provider stopped successfully'); +}); + +export default tap.start(); diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts index 7c6ee67..c5bbcb9 100644 --- a/ts/00_commitinfo_data.ts +++ b/ts/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: '@push.rocks/smartai', - version: '0.6.1', + version: '0.7.0', description: 'SmartAi is a versatile TypeScript library designed to facilitate integration and interaction with various AI models, offering functionalities for chat, audio generation, document processing, and vision tasks.' } diff --git a/ts/abstract.classes.multimodal.ts b/ts/abstract.classes.multimodal.ts index 28b8ca4..990205c 100644 --- a/ts/abstract.classes.multimodal.ts +++ b/ts/abstract.classes.multimodal.ts @@ -50,6 +50,60 @@ export interface ResearchResponse { metadata?: any; } +/** + * Options for image generation + */ +export interface ImageGenerateOptions { + prompt: string; + model?: 'gpt-image-1' | 'dall-e-3' | 'dall-e-2'; + quality?: 'low' | 'medium' | 'high' | 'standard' | 'hd' | 'auto'; + size?: '256x256' | '512x512' | '1024x1024' | '1536x1024' | '1024x1536' | '1792x1024' | '1024x1792' | 'auto'; + style?: 'vivid' | 'natural'; + background?: 'transparent' | 'opaque' | 'auto'; + outputFormat?: 'png' | 'jpeg' | 'webp'; + outputCompression?: number; // 0-100 for webp/jpeg + moderation?: 'low' | 'auto'; + n?: number; // Number of images to generate + stream?: boolean; + partialImages?: number; // 0-3 for streaming +} + +/** + * Options for image editing + */ +export interface ImageEditOptions { + image: Buffer; + prompt: string; + mask?: Buffer; + model?: 'gpt-image-1' | 'dall-e-2'; + quality?: 'low' | 'medium' | 'high' | 'standard' | 'auto'; + size?: '256x256' | '512x512' | '1024x1024' | '1536x1024' | '1024x1536' | 'auto'; + background?: 'transparent' | 'opaque' | 'auto'; + outputFormat?: 'png' | 'jpeg' | 'webp'; + outputCompression?: number; + n?: number; + stream?: boolean; + partialImages?: number; +} + +/** + * Response format for image operations + */ +export interface ImageResponse { + images: Array<{ + b64_json?: string; + url?: string; + revisedPrompt?: string; + }>; + metadata?: { + model: string; + quality?: string; + size?: string; + outputFormat?: string; + tokensUsed?: number; + }; +} + /** * Abstract base class for multi-modal AI models. 
* Provides a common interface for different AI providers (OpenAI, Anthropic, Perplexity, Ollama) @@ -131,4 +185,20 @@ export abstract class MultiModalModel { * @throws Error if the provider doesn't support research capabilities */ public abstract research(optionsArg: ResearchOptions): Promise; + + /** + * Image generation from text prompts + * @param optionsArg Options containing the prompt and generation parameters + * @returns Promise resolving to the generated image(s) + * @throws Error if the provider doesn't support image generation + */ + public abstract imageGenerate(optionsArg: ImageGenerateOptions): Promise; + + /** + * Image editing and inpainting + * @param optionsArg Options containing the image, prompt, and editing parameters + * @returns Promise resolving to the edited image(s) + * @throws Error if the provider doesn't support image editing + */ + public abstract imageEdit(optionsArg: ImageEditOptions): Promise; } diff --git a/ts/provider.anthropic.ts b/ts/provider.anthropic.ts index 74a78c4..812d1ed 100644 --- a/ts/provider.anthropic.ts +++ b/ts/provider.anthropic.ts @@ -1,7 +1,16 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import { MultiModalModel } from './abstract.classes.multimodal.js'; -import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js'; +import type { + ChatOptions, + ChatResponse, + ChatMessage, + ResearchOptions, + ResearchResponse, + ImageGenerateOptions, + ImageEditOptions, + ImageResponse +} from './abstract.classes.multimodal.js'; import type { ImageBlockParam, TextBlockParam } from '@anthropic-ai/sdk/resources/messages'; type ContentBlock = ImageBlockParam | TextBlockParam; @@ -379,4 +388,18 @@ export class AnthropicProvider extends MultiModalModel { throw new Error(`Failed to perform research: ${error.message}`); } } + + /** + * Image generation is not supported by Anthropic + */ + public async imageGenerate(optionsArg: ImageGenerateOptions): Promise { + throw new Error('Image generation is not supported by Anthropic. Claude can only analyze images, not generate them. Please use OpenAI provider for image generation.'); + } + + /** + * Image editing is not supported by Anthropic + */ + public async imageEdit(optionsArg: ImageEditOptions): Promise { + throw new Error('Image editing is not supported by Anthropic. Claude can only analyze images, not edit them. 
Please use OpenAI provider for image editing.'); + } } \ No newline at end of file diff --git a/ts/provider.exo.ts b/ts/provider.exo.ts index ed6e416..207e747 100644 --- a/ts/provider.exo.ts +++ b/ts/provider.exo.ts @@ -1,7 +1,16 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import { MultiModalModel } from './abstract.classes.multimodal.js'; -import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js'; +import type { + ChatOptions, + ChatResponse, + ChatMessage, + ResearchOptions, + ResearchResponse, + ImageGenerateOptions, + ImageEditOptions, + ImageResponse +} from './abstract.classes.multimodal.js'; import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions'; export interface IExoProviderOptions { @@ -129,4 +138,18 @@ export class ExoProvider extends MultiModalModel { public async research(optionsArg: ResearchOptions): Promise { throw new Error('Research capabilities are not yet supported by Exo provider.'); } + + /** + * Image generation is not supported by Exo + */ + public async imageGenerate(optionsArg: ImageGenerateOptions): Promise { + throw new Error('Image generation is not supported by Exo. Please use OpenAI provider for image generation.'); + } + + /** + * Image editing is not supported by Exo + */ + public async imageEdit(optionsArg: ImageEditOptions): Promise { + throw new Error('Image editing is not supported by Exo. Please use OpenAI provider for image editing.'); + } } diff --git a/ts/provider.groq.ts b/ts/provider.groq.ts index ffb355e..87e7339 100644 --- a/ts/provider.groq.ts +++ b/ts/provider.groq.ts @@ -1,7 +1,16 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import { MultiModalModel } from './abstract.classes.multimodal.js'; -import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js'; +import type { + ChatOptions, + ChatResponse, + ChatMessage, + ResearchOptions, + ResearchResponse, + ImageGenerateOptions, + ImageEditOptions, + ImageResponse +} from './abstract.classes.multimodal.js'; export interface IGroqProviderOptions { groqToken: string; @@ -193,4 +202,18 @@ export class GroqProvider extends MultiModalModel { public async research(optionsArg: ResearchOptions): Promise { throw new Error('Research capabilities are not yet supported by Groq provider.'); } + + /** + * Image generation is not supported by Groq + */ + public async imageGenerate(optionsArg: ImageGenerateOptions): Promise { + throw new Error('Image generation is not supported by Groq. Please use OpenAI provider for image generation.'); + } + + /** + * Image editing is not supported by Groq + */ + public async imageEdit(optionsArg: ImageEditOptions): Promise { + throw new Error('Image editing is not supported by Groq. 
Please use OpenAI provider for image editing.'); + } } \ No newline at end of file diff --git a/ts/provider.ollama.ts b/ts/provider.ollama.ts index bb71d73..e1f588e 100644 --- a/ts/provider.ollama.ts +++ b/ts/provider.ollama.ts @@ -1,7 +1,16 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import { MultiModalModel } from './abstract.classes.multimodal.js'; -import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js'; +import type { + ChatOptions, + ChatResponse, + ChatMessage, + ResearchOptions, + ResearchResponse, + ImageGenerateOptions, + ImageEditOptions, + ImageResponse +} from './abstract.classes.multimodal.js'; export interface IOllamaProviderOptions { baseUrl?: string; @@ -255,4 +264,18 @@ export class OllamaProvider extends MultiModalModel { public async research(optionsArg: ResearchOptions): Promise { throw new Error('Research capabilities are not yet supported by Ollama provider.'); } + + /** + * Image generation is not supported by Ollama + */ + public async imageGenerate(optionsArg: ImageGenerateOptions): Promise { + throw new Error('Image generation is not supported by Ollama. Please use OpenAI provider for image generation.'); + } + + /** + * Image editing is not supported by Ollama + */ + public async imageEdit(optionsArg: ImageEditOptions): Promise { + throw new Error('Image editing is not supported by Ollama. Please use OpenAI provider for image editing.'); + } } \ No newline at end of file diff --git a/ts/provider.openai.ts b/ts/provider.openai.ts index 4c2f28b..47a1614 100644 --- a/ts/provider.openai.ts +++ b/ts/provider.openai.ts @@ -9,7 +9,13 @@ export type TChatCompletionRequestMessage = { }; import { MultiModalModel } from './abstract.classes.multimodal.js'; -import type { ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js'; +import type { + ResearchOptions, + ResearchResponse, + ImageGenerateOptions, + ImageEditOptions, + ImageResponse +} from './abstract.classes.multimodal.js'; export interface IOpenaiProviderOptions { openaiToken: string; @@ -17,6 +23,7 @@ export interface IOpenaiProviderOptions { audioModel?: string; visionModel?: string; researchModel?: string; + imageModel?: string; enableWebSearch?: boolean; } @@ -328,4 +335,121 @@ export class OpenAiProvider extends MultiModalModel { throw new Error(`Failed to perform research: ${error.message}`); } } + + /** + * Image generation using OpenAI's gpt-image-1 or DALL-E models + */ + public async imageGenerate(optionsArg: ImageGenerateOptions): Promise { + const model = optionsArg.model || this.options.imageModel || 'gpt-image-1'; + + try { + const requestParams: any = { + model, + prompt: optionsArg.prompt, + n: optionsArg.n || 1, + }; + + // Add gpt-image-1 specific parameters + if (model === 'gpt-image-1') { + if (optionsArg.quality) requestParams.quality = optionsArg.quality; + if (optionsArg.size) requestParams.size = optionsArg.size; + if (optionsArg.background) requestParams.background = optionsArg.background; + if (optionsArg.outputFormat) requestParams.output_format = optionsArg.outputFormat; + if (optionsArg.outputCompression !== undefined) requestParams.output_compression = optionsArg.outputCompression; + if (optionsArg.moderation) requestParams.moderation = optionsArg.moderation; + if (optionsArg.stream !== undefined) requestParams.stream = optionsArg.stream; + if (optionsArg.partialImages !== undefined) requestParams.partial_images = optionsArg.partialImages; + } else if 
(model === 'dall-e-3') { + // DALL-E 3 specific parameters + if (optionsArg.quality) requestParams.quality = optionsArg.quality; + if (optionsArg.size) requestParams.size = optionsArg.size; + if (optionsArg.style) requestParams.style = optionsArg.style; + requestParams.response_format = 'b64_json'; // Always use base64 for consistency + } else if (model === 'dall-e-2') { + // DALL-E 2 specific parameters + if (optionsArg.size) requestParams.size = optionsArg.size; + requestParams.response_format = 'b64_json'; + } + + const result = await this.openAiApiClient.images.generate(requestParams); + + const images = (result.data || []).map(img => ({ + b64_json: img.b64_json, + url: img.url, + revisedPrompt: img.revised_prompt + })); + + return { + images, + metadata: { + model, + quality: result.quality, + size: result.size, + outputFormat: result.output_format, + tokensUsed: result.usage?.total_tokens + } + }; + } catch (error) { + console.error('Image generation error:', error); + throw new Error(`Failed to generate image: ${error.message}`); + } + } + + /** + * Image editing using OpenAI's gpt-image-1 or DALL-E 2 models + */ + public async imageEdit(optionsArg: ImageEditOptions): Promise { + const model = optionsArg.model || this.options.imageModel || 'gpt-image-1'; + + try { + const requestParams: any = { + model, + image: optionsArg.image, + prompt: optionsArg.prompt, + n: optionsArg.n || 1, + }; + + // Add mask if provided + if (optionsArg.mask) { + requestParams.mask = optionsArg.mask; + } + + // Add gpt-image-1 specific parameters + if (model === 'gpt-image-1') { + if (optionsArg.quality) requestParams.quality = optionsArg.quality; + if (optionsArg.size) requestParams.size = optionsArg.size; + if (optionsArg.background) requestParams.background = optionsArg.background; + if (optionsArg.outputFormat) requestParams.output_format = optionsArg.outputFormat; + if (optionsArg.outputCompression !== undefined) requestParams.output_compression = optionsArg.outputCompression; + if (optionsArg.stream !== undefined) requestParams.stream = optionsArg.stream; + if (optionsArg.partialImages !== undefined) requestParams.partial_images = optionsArg.partialImages; + } else if (model === 'dall-e-2') { + // DALL-E 2 specific parameters + if (optionsArg.size) requestParams.size = optionsArg.size; + requestParams.response_format = 'b64_json'; + } + + const result = await this.openAiApiClient.images.edit(requestParams); + + const images = (result.data || []).map(img => ({ + b64_json: img.b64_json, + url: img.url, + revisedPrompt: img.revised_prompt + })); + + return { + images, + metadata: { + model, + quality: result.quality, + size: result.size, + outputFormat: result.output_format, + tokensUsed: result.usage?.total_tokens + } + }; + } catch (error) { + console.error('Image edit error:', error); + throw new Error(`Failed to edit image: ${error.message}`); + } + } } \ No newline at end of file diff --git a/ts/provider.perplexity.ts b/ts/provider.perplexity.ts index 049138d..8a08960 100644 --- a/ts/provider.perplexity.ts +++ b/ts/provider.perplexity.ts @@ -1,7 +1,16 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import { MultiModalModel } from './abstract.classes.multimodal.js'; -import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js'; +import type { + ChatOptions, + ChatResponse, + ChatMessage, + ResearchOptions, + ResearchResponse, + ImageGenerateOptions, + ImageEditOptions, + ImageResponse +} from 
'./abstract.classes.multimodal.js'; export interface IPerplexityProviderOptions { perplexityToken: string; @@ -233,4 +242,18 @@ export class PerplexityProvider extends MultiModalModel { throw new Error(`Failed to perform research: ${error.message}`); } } + + /** + * Image generation is not supported by Perplexity + */ + public async imageGenerate(optionsArg: ImageGenerateOptions): Promise { + throw new Error('Image generation is not supported by Perplexity. Please use OpenAI provider for image generation.'); + } + + /** + * Image editing is not supported by Perplexity + */ + public async imageEdit(optionsArg: ImageEditOptions): Promise { + throw new Error('Image editing is not supported by Perplexity. Please use OpenAI provider for image editing.'); + } } \ No newline at end of file diff --git a/ts/provider.xai.ts b/ts/provider.xai.ts index 8801ada..ea6c0f3 100644 --- a/ts/provider.xai.ts +++ b/ts/provider.xai.ts @@ -1,7 +1,16 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import { MultiModalModel } from './abstract.classes.multimodal.js'; -import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js'; +import type { + ChatOptions, + ChatResponse, + ChatMessage, + ResearchOptions, + ResearchResponse, + ImageGenerateOptions, + ImageEditOptions, + ImageResponse +} from './abstract.classes.multimodal.js'; import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions'; export interface IXAIProviderOptions { @@ -185,4 +194,18 @@ export class XAIProvider extends MultiModalModel { public async research(optionsArg: ResearchOptions): Promise { throw new Error('Research capabilities are not yet supported by xAI provider.'); } + + /** + * Image generation is not supported by xAI + */ + public async imageGenerate(optionsArg: ImageGenerateOptions): Promise { + throw new Error('Image generation is not supported by xAI. Please use OpenAI provider for image generation.'); + } + + /** + * Image editing is not supported by xAI + */ + public async imageEdit(optionsArg: ImageEditOptions): Promise { + throw new Error('Image editing is not supported by xAI. Please use OpenAI provider for image editing.'); + } }
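
For reviewers trying the new surface end to end, here is a minimal consumer-side sketch of the image API introduced in this diff. It assumes the package index re-exports `OpenAiProvider` (the new tests already use `smartai.OpenAiProvider`) and that `OPENAI_TOKEN` is set; the prompts and file names are illustrative only.

```typescript
import * as smartai from '@push.rocks/smartai';
import { promises as fs } from 'fs';

const openai = new smartai.OpenAiProvider({
  openaiToken: process.env.OPENAI_TOKEN!,
  imageModel: 'gpt-image-1', // new provider option added in this change
});
await openai.start();

// Generate: gpt-image-1 returns base64 payloads in images[].b64_json.
const generated = await openai.imageGenerate({
  prompt: 'A watercolor lighthouse at dawn',
  quality: 'medium',
  size: '1024x1024',
  outputFormat: 'png',
});
await fs.writeFile('lighthouse.png', Buffer.from(generated.images[0].b64_json!, 'base64'));

// Edit: ImageEditOptions takes the source image (and an optional mask) as Buffers.
const edited = await openai.imageEdit({
  image: await fs.readFile('lighthouse.png'),
  prompt: 'Add a flock of gulls circling the tower',
  model: 'gpt-image-1',
  quality: 'medium',
});
await fs.writeFile('lighthouse-gulls.png', Buffer.from(edited.images[0].b64_json!, 'base64'));

await openai.stop();
```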
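Because the non-OpenAI providers throw informative errors rather than silently no-op, callers can feature-detect image support with a plain try/catch. Below is a sketch of that pattern under the assumption that providers are already constructed and started; the `imageWithFallback` helper is hypothetical, not part of this change.

```typescript
import type { OpenAiProvider } from '@push.rocks/smartai';

// Hypothetical helper: try providers in order until one supports image generation.
async function imageWithFallback(
  providers: Array<Pick<OpenAiProvider, 'imageGenerate'>>,
  prompt: string,
) {
  for (const provider of providers) {
    try {
      return await provider.imageGenerate({ prompt });
    } catch (error) {
      // Unsupported providers throw, e.g. "Image generation is not supported by Groq. ..."
      console.warn((error as Error).message);
    }
  }
  throw new Error('No configured provider supports image generation.');
}

// Usage sketch: prefer an xAI provider, fall back to OpenAI.
// const result = await imageWithFallback([xaiProvider, openaiProvider], 'A minimal poster');
```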