feat(providers): Add research API and image generation/editing support; extend providers and tests

2025-10-03 13:43:29 +00:00
parent fe8540c8ba
commit f0556e89f3
13 changed files with 612 additions and 194 deletions
--- a/ts/00_commitinfo_data.ts
+++ b/ts/00_commitinfo_data.ts
@@ -3,6 +3,6 @@
 */
 export const commitinfo = {
  name: '@push.rocks/smartai',
-  version: '0.6.1',
+  version: '0.7.0',
  description: 'SmartAi is a versatile TypeScript library designed to facilitate integration and interaction with various AI models, offering functionalities for chat, audio generation, document processing, and vision tasks.'
 }
--- a/ts/abstract.classes.multimodal.ts
+++ b/ts/abstract.classes.multimodal.ts
@@ -50,6 +50,60 @@ export interface ResearchResponse {
  metadata?: any;
 }

+/**
+ * Options for image generation; the OpenAI provider only sends the optional fields its per-model branch handles
+ */
+export interface ImageGenerateOptions {
+  prompt: string; // Text prompt sent with the request
+  model?: 'gpt-image-1' | 'dall-e-3' | 'dall-e-2'; // OpenAI provider falls back to its imageModel option, then 'gpt-image-1'
+  quality?: 'low' | 'medium' | 'high' | 'standard' | 'hd' | 'auto'; // Sent for gpt-image-1 and dall-e-3 only
+  size?: '256x256' | '512x512' | '1024x1024' | '1536x1024' | '1024x1536' | '1792x1024' | '1024x1792' | 'auto'; // Sent for all three models
+  style?: 'vivid' | 'natural'; // Sent for dall-e-3 only
+  background?: 'transparent' | 'opaque' | 'auto'; // Sent for gpt-image-1 only
+  outputFormat?: 'png' | 'jpeg' | 'webp'; // Sent for gpt-image-1 only (as output_format)
+  outputCompression?: number; // 0-100 for webp/jpeg; sent for gpt-image-1 only (as output_compression)
+  moderation?: 'low' | 'auto'; // Sent for gpt-image-1 only
+  n?: number; // Number of images to generate; the OpenAI provider uses 1 when unset or 0
+  stream?: boolean;
+  partialImages?: number; // 0-3 for streaming
+}
+
+/**
+ * Options for image editing; the OpenAI provider only sends the optional fields its per-model branch handles
+ */
+export interface ImageEditOptions {
+  image: Buffer; // Bytes of the source image to edit
+  prompt: string; // Text prompt sent with the request
+  mask?: Buffer; // Optional mask bytes; attached to the request only when provided
+  model?: 'gpt-image-1' | 'dall-e-2'; // OpenAI provider falls back to its imageModel option, then 'gpt-image-1'
+  quality?: 'low' | 'medium' | 'high' | 'standard' | 'auto'; // Sent for gpt-image-1 only
+  size?: '256x256' | '512x512' | '1024x1024' | '1536x1024' | '1024x1536' | 'auto'; // Sent for gpt-image-1 and dall-e-2
+  background?: 'transparent' | 'opaque' | 'auto'; // Sent for gpt-image-1 only
+  outputFormat?: 'png' | 'jpeg' | 'webp'; // Sent for gpt-image-1 only (as output_format)
+  outputCompression?: number; // Sent for gpt-image-1 only (as output_compression)
+  n?: number; // Number of images requested; the OpenAI provider uses 1 when unset or 0
+  stream?: boolean;
+  partialImages?: number;
+}
+
+/**
+ * Response format for image operations (shared by imageGenerate and imageEdit)
+ */
+export interface ImageResponse {
+  images: Array<{ // One entry per image in the provider response; empty when the response carries no data
+    b64_json?: string; // Copied from the response entry's b64_json field when present
+    url?: string; // Copied from the response entry's url field when present
+    revisedPrompt?: string; // Copied from the response entry's revised_prompt field when present
+  }>;
+  metadata?: {
+    model: string; // Model name the request was issued with
+    quality?: string; // Taken from the response's quality field when present
+    size?: string; // Taken from the response's size field when present
+    outputFormat?: string; // Taken from the response's output_format field when present
+    tokensUsed?: number; // Taken from the response's usage.total_tokens when usage is reported
+  };
+}
+
 /**
 * Abstract base class for multi-modal AI models.
 * Provides a common interface for different AI providers (OpenAI, Anthropic, Perplexity, Ollama)
@@ -131,4 +185,20 @@ export abstract class MultiModalModel {
   * @throws Error if the provider doesn't support research capabilities
   */
  public abstract research(optionsArg: ResearchOptions): Promise<ResearchResponse>;
+
+  /**
+   * Image generation from text prompts
+   * @param optionsArg Options containing the prompt and generation parameters
+   * @returns Promise resolving to the generated image(s) and optional response metadata
+   * @throws Error (as a promise rejection in async implementations) if the provider doesn't support image generation
+   */
+  public abstract imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse>;
+
+  /**
+   * Image editing and inpainting
+   * @param optionsArg Options containing the image, prompt, optional mask, and editing parameters
+   * @returns Promise resolving to the edited image(s) and optional response metadata
+   * @throws Error (as a promise rejection in async implementations) if the provider doesn't support image editing
+   */
+  public abstract imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse>;
 }
--- a/ts/provider.anthropic.ts
+++ b/ts/provider.anthropic.ts
@@ -1,7 +1,16 @@
 import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
-import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js';
+import type {
+  ChatOptions,
+  ChatResponse,
+  ChatMessage,
+  ResearchOptions,
+  ResearchResponse,
+  ImageGenerateOptions,
+  ImageEditOptions,
+  ImageResponse
+} from './abstract.classes.multimodal.js';
 import type { ImageBlockParam, TextBlockParam } from '@anthropic-ai/sdk/resources/messages';

 type ContentBlock = ImageBlockParam | TextBlockParam;
@@ -379,4 +388,18 @@ export class AnthropicProvider extends MultiModalModel {
      throw new Error(`Failed to perform research: ${error.message}`);
    }
  }
+
+  /**
+   * Image generation is not supported by Anthropic: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse> {
+    throw new Error('Image generation is not supported by Anthropic. Claude can only analyze images, not generate them. Please use OpenAI provider for image generation.');
+  }
+
+  /**
+   * Image editing is not supported by Anthropic: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse> {
+    throw new Error('Image editing is not supported by Anthropic. Claude can only analyze images, not edit them. Please use OpenAI provider for image editing.');
+  }
 }
--- a/ts/provider.exo.ts
+++ b/ts/provider.exo.ts
@@ -1,7 +1,16 @@
 import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
-import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js';
+import type {
+  ChatOptions,
+  ChatResponse,
+  ChatMessage,
+  ResearchOptions,
+  ResearchResponse,
+  ImageGenerateOptions,
+  ImageEditOptions,
+  ImageResponse
+} from './abstract.classes.multimodal.js';
 import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions';

 export interface IExoProviderOptions {
@@ -129,4 +138,18 @@ export class ExoProvider extends MultiModalModel {
  public async research(optionsArg: ResearchOptions): Promise<ResearchResponse> {
    throw new Error('Research capabilities are not yet supported by Exo provider.');
  }
+
+  /**
+   * Image generation is not supported by Exo: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse> {
+    throw new Error('Image generation is not supported by Exo. Please use OpenAI provider for image generation.');
+  }
+
+  /**
+   * Image editing is not supported by Exo: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse> {
+    throw new Error('Image editing is not supported by Exo. Please use OpenAI provider for image editing.');
+  }
 }
--- a/ts/provider.groq.ts
+++ b/ts/provider.groq.ts
@@ -1,7 +1,16 @@
 import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
-import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js';
+import type {
+  ChatOptions,
+  ChatResponse,
+  ChatMessage,
+  ResearchOptions,
+  ResearchResponse,
+  ImageGenerateOptions,
+  ImageEditOptions,
+  ImageResponse
+} from './abstract.classes.multimodal.js';

 export interface IGroqProviderOptions {
  groqToken: string;
@@ -193,4 +202,18 @@ export class GroqProvider extends MultiModalModel {
  public async research(optionsArg: ResearchOptions): Promise<ResearchResponse> {
    throw new Error('Research capabilities are not yet supported by Groq provider.');
  }
+
+  /**
+   * Image generation is not supported by Groq: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse> {
+    throw new Error('Image generation is not supported by Groq. Please use OpenAI provider for image generation.');
+  }
+
+  /**
+   * Image editing is not supported by Groq: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse> {
+    throw new Error('Image editing is not supported by Groq. Please use OpenAI provider for image editing.');
+  }
 }
--- a/ts/provider.ollama.ts
+++ b/ts/provider.ollama.ts
@@ -1,7 +1,16 @@
 import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
-import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js';
+import type {
+  ChatOptions,
+  ChatResponse,
+  ChatMessage,
+  ResearchOptions,
+  ResearchResponse,
+  ImageGenerateOptions,
+  ImageEditOptions,
+  ImageResponse
+} from './abstract.classes.multimodal.js';

 export interface IOllamaProviderOptions {
  baseUrl?: string;
@@ -255,4 +264,18 @@ export class OllamaProvider extends MultiModalModel {
  public async research(optionsArg: ResearchOptions): Promise<ResearchResponse> {
    throw new Error('Research capabilities are not yet supported by Ollama provider.');
  }
+
+  /**
+   * Image generation is not supported by Ollama: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse> {
+    throw new Error('Image generation is not supported by Ollama. Please use OpenAI provider for image generation.');
+  }
+
+  /**
+   * Image editing is not supported by Ollama: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse> {
+    throw new Error('Image editing is not supported by Ollama. Please use OpenAI provider for image editing.');
+  }
 }
--- a/ts/provider.openai.ts
+++ b/ts/provider.openai.ts
@@ -9,7 +9,13 @@ export type TChatCompletionRequestMessage = {
 };

 import { MultiModalModel } from './abstract.classes.multimodal.js';
-import type { ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js';
+import type {
+  ResearchOptions,
+  ResearchResponse,
+  ImageGenerateOptions,
+  ImageEditOptions,
+  ImageResponse
+} from './abstract.classes.multimodal.js';

 export interface IOpenaiProviderOptions {
  openaiToken: string;
@@ -17,6 +23,7 @@ export interface IOpenaiProviderOptions {
  audioModel?: string;
  visionModel?: string;
  researchModel?: string;
+  imageModel?: string;
  enableWebSearch?: boolean;
 }

@@ -328,4 +335,121 @@ export class OpenAiProvider extends MultiModalModel {
      throw new Error(`Failed to perform research: ${error.message}`);
    }
  }
+
+  /**
+   * Generates images with OpenAI's gpt-image-1 or DALL-E models.
+   * The model is optionsArg.model, else this.options.imageModel, else 'gpt-image-1'.
+   * @param optionsArg Prompt plus optional parameters; only those handled by the chosen model's branch are sent
+   * @returns The images from the response plus metadata (model, quality, size, output format, token usage)
+   * @throws Error prefixed with "Failed to generate image: " when the request fails
+   */
+  public async imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse> {
+    const model = optionsArg.model || this.options.imageModel || 'gpt-image-1';
+    try {
+      const requestParams: any = {
+        model,
+        prompt: optionsArg.prompt,
+        n: optionsArg.n || 1,
+      };
+      if (model === 'gpt-image-1') {
+        if (optionsArg.quality) requestParams.quality = optionsArg.quality;
+        if (optionsArg.size) requestParams.size = optionsArg.size;
+        if (optionsArg.background) requestParams.background = optionsArg.background;
+        if (optionsArg.outputFormat) requestParams.output_format = optionsArg.outputFormat;
+        if (optionsArg.outputCompression !== undefined) requestParams.output_compression = optionsArg.outputCompression;
+        if (optionsArg.moderation) requestParams.moderation = optionsArg.moderation;
+        // stream / partialImages are deliberately not sent: this method resolves to one ImageResponse built
+        // from result.data, and a streamed result has no data array to map (images would silently be empty).
+      } else if (model === 'dall-e-3') {
+        if (optionsArg.quality) requestParams.quality = optionsArg.quality;
+        if (optionsArg.size) requestParams.size = optionsArg.size;
+        if (optionsArg.style) requestParams.style = optionsArg.style;
+        requestParams.response_format = 'b64_json'; // Always use base64 for consistency
+      } else if (model === 'dall-e-2') {
+        if (optionsArg.size) requestParams.size = optionsArg.size;
+        requestParams.response_format = 'b64_json';
+      }
+
+      const result = await this.openAiApiClient.images.generate(requestParams);
+      const images = (result.data || []).map(img => ({
+        b64_json: img.b64_json,
+        url: img.url,
+        revisedPrompt: img.revised_prompt
+      }));
+
+      return {
+        images,
+        metadata: {
+          model,
+          quality: result.quality,
+          size: result.size,
+          outputFormat: result.output_format,
+          tokensUsed: result.usage?.total_tokens
+        }
+      };
+    } catch (error) {
+      console.error('Image generation error:', error);
+      // Narrow before reading .message: the caught value is not guaranteed to be an Error instance.
+      const reason = error instanceof Error ? error.message : String(error);
+      throw new Error(`Failed to generate image: ${reason}`);
+    }
+  }
+
+  /**
+   * Edits or inpaints an image with OpenAI's gpt-image-1 or DALL-E 2 models.
+   * @param optionsArg Source image buffer, prompt, optional mask buffer and model-specific parameters
+   * @returns The images from the response plus metadata (model, quality, size, output format, token usage)
+   * @throws Error prefixed with "Failed to edit image: " when upload preparation or the request fails
+   */
+  public async imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse> {
+    const model = optionsArg.model || this.options.imageModel || 'gpt-image-1';
+    try {
+      // Raw Buffers are not a file-like upload for the SDK's multipart encoder, so wrap them with toFile first.
+      // NOTE(review): assumes the 'openai' package resolves from this module and that inputs are PNG — confirm.
+      const { toFile } = await import('openai');
+      const requestParams: any = {
+        model,
+        image: await toFile(optionsArg.image, 'image.png', { type: 'image/png' }),
+        prompt: optionsArg.prompt,
+        n: optionsArg.n || 1,
+      };
+      if (optionsArg.mask) {
+        requestParams.mask = await toFile(optionsArg.mask, 'mask.png', { type: 'image/png' });
+      }
+      if (model === 'gpt-image-1') {
+        if (optionsArg.quality) requestParams.quality = optionsArg.quality;
+        if (optionsArg.size) requestParams.size = optionsArg.size;
+        if (optionsArg.background) requestParams.background = optionsArg.background;
+        if (optionsArg.outputFormat) requestParams.output_format = optionsArg.outputFormat;
+        if (optionsArg.outputCompression !== undefined) requestParams.output_compression = optionsArg.outputCompression;
+        // stream / partialImages are deliberately not sent: a streamed result has no data array to map below.
+      } else if (model === 'dall-e-2') {
+        if (optionsArg.size) requestParams.size = optionsArg.size;
+        requestParams.response_format = 'b64_json';
+      }
+
+      const result = await this.openAiApiClient.images.edit(requestParams);
+
+      const images = (result.data || []).map(img => ({
+        b64_json: img.b64_json,
+        url: img.url,
+        revisedPrompt: img.revised_prompt
+      }));
+
+      return {
+        images,
+        metadata: {
+          model,
+          quality: result.quality,
+          size: result.size,
+          outputFormat: result.output_format,
+          tokensUsed: result.usage?.total_tokens
+        }
+      };
+    } catch (error) {
+      console.error('Image edit error:', error);
+      const reason = error instanceof Error ? error.message : String(error); // caught value may not be an Error
+      throw new Error(`Failed to edit image: ${reason}`);
+    }
+  }
 }
--- a/ts/provider.perplexity.ts
+++ b/ts/provider.perplexity.ts
@@ -1,7 +1,16 @@
 import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
-import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js';
+import type {
+  ChatOptions,
+  ChatResponse,
+  ChatMessage,
+  ResearchOptions,
+  ResearchResponse,
+  ImageGenerateOptions,
+  ImageEditOptions,
+  ImageResponse
+} from './abstract.classes.multimodal.js';

 export interface IPerplexityProviderOptions {
  perplexityToken: string;
@@ -233,4 +242,18 @@ export class PerplexityProvider extends MultiModalModel {
      throw new Error(`Failed to perform research: ${error.message}`);
    }
  }
+
+  /**
+   * Image generation is not supported by Perplexity: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse> {
+    throw new Error('Image generation is not supported by Perplexity. Please use OpenAI provider for image generation.');
+  }
+
+  /**
+   * Image editing is not supported by Perplexity: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse> {
+    throw new Error('Image editing is not supported by Perplexity. Please use OpenAI provider for image editing.');
+  }
 }
--- a/ts/provider.xai.ts
+++ b/ts/provider.xai.ts
@@ -1,7 +1,16 @@
 import * as plugins from './plugins.js';
 import * as paths from './paths.js';
 import { MultiModalModel } from './abstract.classes.multimodal.js';
-import type { ChatOptions, ChatResponse, ChatMessage, ResearchOptions, ResearchResponse } from './abstract.classes.multimodal.js';
+import type {
+  ChatOptions,
+  ChatResponse,
+  ChatMessage,
+  ResearchOptions,
+  ResearchResponse,
+  ImageGenerateOptions,
+  ImageEditOptions,
+  ImageResponse
+} from './abstract.classes.multimodal.js';
 import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions';

 export interface IXAIProviderOptions {
@@ -185,4 +194,18 @@ export class XAIProvider extends MultiModalModel {
  public async research(optionsArg: ResearchOptions): Promise<ResearchResponse> {
    throw new Error('Research capabilities are not yet supported by xAI provider.');
  }
+
+  /**
+   * Image generation is not supported by xAI: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageGenerate(optionsArg: ImageGenerateOptions): Promise<ImageResponse> {
+    throw new Error('Image generation is not supported by xAI. Please use OpenAI provider for image generation.');
+  }
+
+  /**
+   * Image editing is not supported by xAI: optionsArg is ignored and the call always rejects with an Error
+   */
+  public async imageEdit(optionsArg: ImageEditOptions): Promise<ImageResponse> {
+    throw new Error('Image editing is not supported by xAI. Please use OpenAI provider for image editing.');
+  }
 }