Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a7ae676184 | |||
| 269e948453 | |||
| 1d64ee3edb | |||
| 4725b55566 | |||
| 0e2053f538 | |||
| c8f98b3364 |
@@ -3,6 +3,37 @@
|
|||||||
## Pending
|
## Pending
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## 2026-05-18 - 4.1.0
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
- add Mistral OCR engine with package export, tests, and documentation (ocr)
|
||||||
|
- introduces a new `@push.rocks/smartai/ocr` subpath export with `createMistralOcrEngine()` for image OCR via Mistral's Document AI endpoint
|
||||||
|
- adds OCR request/response types, configurable transport and options, and normalized page/confidence results
|
||||||
|
- includes mocked transport tests for OCR requests and input validation
|
||||||
|
- updates package metadata and README content to document the new OCR module
|
||||||
|
|
||||||
|
## 2026-05-14 - 4.0.2
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
- strip unsupported ChatGPT prompt cache retention options while preserving prompt cache keys (openai)
|
||||||
|
- Removes promptCacheRetention values before sending requests to the ChatGPT Codex backend.
|
||||||
|
- Keeps prompt_cache_key forwarding intact for OpenAI provider options.
|
||||||
|
- Only rewrites system prompts into top-level instructions when needed, avoiding unnecessary prompt changes.
|
||||||
|
|
||||||
|
## 2026-05-14 - 4.0.1
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
- map system prompts to top-level instructions for ChatGPT auth requests (openai)
|
||||||
|
- wrap OpenAI models using ChatGPT auth with middleware that extracts system messages into provider instructions
|
||||||
|
- remove system messages from the serialized prompt payload to match the ChatGPT Codex backend expectations
|
||||||
|
- add test coverage to verify authorization headers, workspace routing, and instruction payload mapping
|
||||||
|
|
||||||
## 2026-05-14 - 4.0.0
|
## 2026-05-14 - 4.0.0
|
||||||
|
|
||||||
### Breaking Changes
|
### Breaking Changes
|
||||||
|
|||||||
+7
-2
@@ -1,8 +1,8 @@
|
|||||||
{
|
{
|
||||||
"name": "@push.rocks/smartai",
|
"name": "@push.rocks/smartai",
|
||||||
"version": "4.0.0",
|
"version": "4.1.0",
|
||||||
"private": false,
|
"private": false,
|
||||||
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.",
|
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.",
|
||||||
"main": "dist_ts/index.js",
|
"main": "dist_ts/index.js",
|
||||||
"typings": "dist_ts/index.d.ts",
|
"typings": "dist_ts/index.d.ts",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
@@ -27,6 +27,10 @@
|
|||||||
"import": "./dist_ts_document/index.js",
|
"import": "./dist_ts_document/index.js",
|
||||||
"types": "./dist_ts_document/index.d.ts"
|
"types": "./dist_ts_document/index.d.ts"
|
||||||
},
|
},
|
||||||
|
"./ocr": {
|
||||||
|
"import": "./dist_ts_ocr/index.js",
|
||||||
|
"types": "./dist_ts_ocr/index.d.ts"
|
||||||
|
},
|
||||||
"./research": {
|
"./research": {
|
||||||
"import": "./dist_ts_research/index.js",
|
"import": "./dist_ts_research/index.js",
|
||||||
"types": "./dist_ts_research/index.d.ts"
|
"types": "./dist_ts_research/index.d.ts"
|
||||||
@@ -87,6 +91,7 @@
|
|||||||
"ts_audio/**/*",
|
"ts_audio/**/*",
|
||||||
"ts_image/**/*",
|
"ts_image/**/*",
|
||||||
"ts_document/**/*",
|
"ts_document/**/*",
|
||||||
|
"ts_ocr/**/*",
|
||||||
"ts_research/**/*",
|
"ts_research/**/*",
|
||||||
"ts_openai_chatgpt_auth/**/*",
|
"ts_openai_chatgpt_auth/**/*",
|
||||||
"dist_*/**/*",
|
"dist_*/**/*",
|
||||||
|
|||||||
+2
-1
@@ -18,6 +18,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
|
|||||||
- `@push.rocks/smartai/audio` — `textToSpeech()` using OpenAI SDK directly
|
- `@push.rocks/smartai/audio` — `textToSpeech()` using OpenAI SDK directly
|
||||||
- `@push.rocks/smartai/image` — `generateImage()`, `editImage()` using OpenAI SDK directly
|
- `@push.rocks/smartai/image` — `generateImage()`, `editImage()` using OpenAI SDK directly
|
||||||
- `@push.rocks/smartai/document` — `analyzeDocuments()` using SmartPdf + `generateText`
|
- `@push.rocks/smartai/document` — `analyzeDocuments()` using SmartPdf + `generateText`
|
||||||
|
- `@push.rocks/smartai/ocr` — `createMistralOcrEngine()` using Mistral Document AI OCR endpoint
|
||||||
- `@push.rocks/smartai/research` — `research()` using `@anthropic-ai/sdk` web_search tool
|
- `@push.rocks/smartai/research` — `research()` using `@anthropic-ai/sdk` web_search tool
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
@@ -32,7 +33,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
|
|||||||
## Build
|
## Build
|
||||||
|
|
||||||
- `pnpm build` → `tsbuild tsfolders --allowimplicitany`
|
- `pnpm build` → `tsbuild tsfolders --allowimplicitany`
|
||||||
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_research/
|
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_ocr/, ts_research/
|
||||||
|
|
||||||
## Important Notes
|
## Important Notes
|
||||||
|
|
||||||
|
|||||||
@@ -479,6 +479,38 @@ console.log(analysis);
|
|||||||
await stopSmartpdf();
|
await stopSmartpdf();
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### 🔎 OCR — `@push.rocks/smartai/ocr`
|
||||||
|
|
||||||
|
Extract text from images using Mistral Document AI OCR. This uses the documented `https://api.mistral.ai/v1/ocr` endpoint with `mistral-ocr-latest` and returns normalized text plus page-level confidence when requested.
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { createMistralOcrEngine } from '@push.rocks/smartai/ocr';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
|
||||||
|
const ocr = createMistralOcrEngine({
|
||||||
|
apiKey: process.env.MISTRAL_API_KEY,
|
||||||
|
confidenceScoresGranularity: 'page',
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await ocr.recognizeImage({
|
||||||
|
dataBase64: fs.readFileSync('screenshot.png').toString('base64'),
|
||||||
|
mimeType: 'image/png',
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(result.text);
|
||||||
|
console.log(result.confidence);
|
||||||
|
```
|
||||||
|
|
||||||
|
**`createMistralOcrEngine(options)`** accepts:
|
||||||
|
|
||||||
|
- `apiKey` — Mistral API key, required unless a custom `transport` is supplied
|
||||||
|
- `model` — defaults to `mistral-ocr-latest`
|
||||||
|
- `endpointUrl` — defaults to `https://api.mistral.ai/v1/ocr`
|
||||||
|
- `confidenceScoresGranularity` — `'page'` | `'word'`
|
||||||
|
- `tableFormat` — `'markdown'` | `'html'`
|
||||||
|
- `extractHeader` / `extractFooter` — optional document OCR flags
|
||||||
|
- `transport` — injectable transport for tests or custom HTTP clients
|
||||||
|
|
||||||
### 🔬 Research — `@push.rocks/smartai/research`
|
### 🔬 Research — `@push.rocks/smartai/research`
|
||||||
|
|
||||||
Perform web-search-powered research using Anthropic's `web_search_20250305` tool.
|
Perform web-search-powered research using Anthropic's `web_search_20250305` tool.
|
||||||
@@ -514,6 +546,7 @@ tstest test/test.image.ts --verbose # Image generation
|
|||||||
tstest test/test.research.ts --verbose # Web research
|
tstest test/test.research.ts --verbose # Web research
|
||||||
tstest test/test.audio.ts --verbose # Text-to-speech
|
tstest test/test.audio.ts --verbose # Text-to-speech
|
||||||
tstest test/test.document.ts --verbose # Document analysis (needs Chromium)
|
tstest test/test.document.ts --verbose # Document analysis (needs Chromium)
|
||||||
|
tstest test/test.ocr.ts --verbose # Mistral OCR transport (mocked)
|
||||||
```
|
```
|
||||||
|
|
||||||
Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services.
|
Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services.
|
||||||
@@ -533,6 +566,7 @@ Most tests skip gracefully when API keys are not set. The Ollama tests are fully
|
|||||||
├── ts_audio/ # @push.rocks/smartai/audio
|
├── ts_audio/ # @push.rocks/smartai/audio
|
||||||
├── ts_image/ # @push.rocks/smartai/image
|
├── ts_image/ # @push.rocks/smartai/image
|
||||||
├── ts_document/ # @push.rocks/smartai/document
|
├── ts_document/ # @push.rocks/smartai/document
|
||||||
|
├── ts_ocr/ # @push.rocks/smartai/ocr
|
||||||
└── ts_research/ # @push.rocks/smartai/research
|
└── ts_research/ # @push.rocks/smartai/research
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,77 @@
|
|||||||
|
import { tap, expect } from '@git.zone/tstest/tapbundle';
|
||||||
|
import { createMistralOcrEngine, type ISmartAiMistralOcrTransport } from '../ts_ocr/index.js';
|
||||||
|
|
||||||
|
// Runs the engine against a recording mock transport and checks both the
// request handed to the transport and the normalized result returned to the caller.
tap.test('createMistralOcrEngine should call Mistral OCR with image data URLs', async () => {
  const recordedRequests: unknown[] = [];
  const recordingTransport: ISmartAiMistralOcrTransport = {
    async process(request) {
      recordedRequests.push(request);
      return {
        pages: [
          {
            index: 0,
            markdown: 'hello terminal',
            confidence_scores: {
              average_page_confidence_score: 0.91,
              minimum_page_confidence_score: 0.8,
            },
          },
        ],
        model: 'mistral-ocr-latest',
        usage_info: {
          pages_processed: 1,
          doc_size_bytes: 12,
        },
      };
    },
  };

  const engineUnderTest = createMistralOcrEngine({
    transport: recordingTransport,
    confidenceScoresGranularity: 'page',
  });

  const result = await engineUnderTest.recognizeImage({
    dataBase64: 'iVBORw0KGgo=',
    mimeType: 'image/png',
  });

  // Exactly one request must have reached the transport.
  expect(recordedRequests.length).toEqual(1);
  const sentRequest = recordedRequests[0] as any;
  expect(sentRequest.model).toEqual('mistral-ocr-latest');
  expect(sentRequest.document.type).toEqual('image_url');
  expect(sentRequest.document.image_url).toEqual('data:image/png;base64,iVBORw0KGgo=');
  expect(sentRequest.confidence_scores_granularity).toEqual('page');

  // The response page is normalized into text plus the average page confidence.
  expect(result.text).toEqual('hello terminal');
  expect(result.confidence).toEqual(0.91);
  expect(result.pages).toEqual([
    {
      index: 0,
      text: 'hello terminal',
      confidence: 0.91,
    },
  ]);
});
|
||||||
|
|
||||||
|
// An empty base64 payload must be rejected before the transport is ever invoked;
// the mock transport throws a different error so a wrong code path is detectable.
tap.test('createMistralOcrEngine should validate image input', async () => {
  const engineUnderTest = createMistralOcrEngine({
    transport: {
      async process() {
        throw new Error('should not call OCR');
      },
    },
  });

  // Resolve to the rejection reason (coerced to Error), or undefined when nothing was thrown.
  const failure: Error | undefined = await engineUnderTest
    .recognizeImage({
      dataBase64: '',
      mimeType: 'image/png',
    })
    .then(
      () => undefined,
      (reason) => (reason instanceof Error ? reason : new Error(String(reason))),
    );

  expect(failure?.message).toEqual('Mistral OCR image input requires dataBase64.');
});
|
||||||
|
|
||||||
|
export default tap.start();
|
||||||
@@ -199,15 +199,29 @@ tap.test('getModel uses ChatGPT Codex backend for OpenAI ChatGPT auth', async ()
|
|||||||
};
|
};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await model.doGenerate({
|
await smartai.generateText({
|
||||||
prompt: [{ role: 'user', content: [{ type: 'text', text: 'hello' }] }],
|
model,
|
||||||
inputFormat: 'prompt',
|
system: 'system prompt',
|
||||||
} as any);
|
prompt: 'hello',
|
||||||
|
providerOptions: {
|
||||||
|
openai: {
|
||||||
|
promptCacheKey: 'session-1',
|
||||||
|
promptCacheRetention: 'in_memory',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
expect(capturedRequest?.url).toEqual('https://chatgpt.com/backend-api/codex/responses');
|
expect(capturedRequest?.url).toEqual('https://chatgpt.com/backend-api/codex/responses');
|
||||||
expect(getHeader(capturedRequest?.init, 'authorization')).toEqual(`Bearer ${tokenData.accessToken}`);
|
expect(getHeader(capturedRequest?.init, 'authorization')).toEqual(`Bearer ${tokenData.accessToken}`);
|
||||||
expect(getHeader(capturedRequest?.init, 'chatgpt-account-id')).toEqual('workspace-1');
|
expect(getHeader(capturedRequest?.init, 'chatgpt-account-id')).toEqual('workspace-1');
|
||||||
expect(getHeader(capturedRequest?.init, 'originator')).toEqual('smartai');
|
expect(getHeader(capturedRequest?.init, 'originator')).toEqual('smartai');
|
||||||
|
const capturedBody = JSON.parse(String(capturedRequest?.init?.body));
|
||||||
|
expect(capturedBody.instructions).toEqual('system prompt');
|
||||||
|
expect(capturedBody.input).toEqual([
|
||||||
|
{ role: 'user', content: [{ type: 'input_text', text: 'hello' }] },
|
||||||
|
]);
|
||||||
|
expect(capturedBody.prompt_cache_key).toEqual('session-1');
|
||||||
|
expect(capturedBody.prompt_cache_retention).toEqual(undefined);
|
||||||
} finally {
|
} finally {
|
||||||
globalThis.fetch = originalFetch;
|
globalThis.fetch = originalFetch;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,6 @@
|
|||||||
*/
|
*/
|
||||||
export const commitinfo = {
|
export const commitinfo = {
|
||||||
name: '@push.rocks/smartai',
|
name: '@push.rocks/smartai',
|
||||||
version: '4.0.0',
|
version: '4.1.0',
|
||||||
description: 'Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.'
|
description: 'Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.'
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import * as plugins from './plugins.js';
|
|||||||
import type { ISmartAiModelSetup, ISmartAiOptions, LanguageModelV3 } from './smartai.interfaces.js';
|
import type { ISmartAiModelSetup, ISmartAiOptions, LanguageModelV3 } from './smartai.interfaces.js';
|
||||||
import { createOllamaModel } from './smartai.provider.ollama.js';
|
import { createOllamaModel } from './smartai.provider.ollama.js';
|
||||||
import { createAnthropicCachingMiddleware } from './smartai.middleware.anthropic.js';
|
import { createAnthropicCachingMiddleware } from './smartai.middleware.anthropic.js';
|
||||||
|
import { createOpenAiChatGptInstructionsMiddleware } from './smartai.middleware.openai.js';
|
||||||
import { createOpenAiChatGptProviderSettings } from './smartai.auth.openai.js';
|
import { createOpenAiChatGptProviderSettings } from './smartai.auth.openai.js';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -28,7 +29,13 @@ export function getModel(options: ISmartAiOptions): LanguageModelV3 {
|
|||||||
? createOpenAiChatGptProviderSettings(options.openAiChatGptAuth)
|
? createOpenAiChatGptProviderSettings(options.openAiChatGptAuth)
|
||||||
: { apiKey: options.apiKey },
|
: { apiKey: options.apiKey },
|
||||||
);
|
);
|
||||||
return p(options.model) as LanguageModelV3;
|
const base = p(options.model) as LanguageModelV3;
|
||||||
|
return options.openAiChatGptAuth
|
||||||
|
? plugins.wrapLanguageModel({
|
||||||
|
model: base,
|
||||||
|
middleware: createOpenAiChatGptInstructionsMiddleware(),
|
||||||
|
}) as unknown as LanguageModelV3
|
||||||
|
: base;
|
||||||
}
|
}
|
||||||
case 'google': {
|
case 'google': {
|
||||||
const p = plugins.createGoogleGenerativeAI({ apiKey: options.apiKey });
|
const p = plugins.createGoogleGenerativeAI({ apiKey: options.apiKey });
|
||||||
|
|||||||
@@ -0,0 +1,55 @@
|
|||||||
|
import type { JSONObject, LanguageModelV3CallOptions, LanguageModelV3Middleware } from '@ai-sdk/provider';
|
||||||
|
|
||||||
|
/** Type guard: true only for strings that still contain characters after trimming whitespace. */
const isNonEmptyString = (value: unknown): value is string => {
  if (typeof value !== 'string') {
    return false;
  }
  return value.trim() !== '';
};
|
||||||
|
|
||||||
|
/**
 * Gathers the content of every `system` message in the prompt.
 *
 * Messages whose content is not a non-empty string (after trimming) are ignored.
 *
 * @param prompt - the prompt messages of the model call
 * @returns the collected contents joined with a newline, or `undefined` when none qualify
 */
const getSystemInstructions = (prompt: LanguageModelV3CallOptions['prompt']): string | undefined => {
  const collected: string[] = [];
  for (const message of prompt) {
    if (message.role === 'system' && isNonEmptyString(message.content)) {
      collected.push(message.content);
    }
  }

  if (collected.length === 0) {
    return undefined;
  }
  return collected.join('\n');
};
|
||||||
|
|
||||||
|
/**
 * Reports whether the OpenAI provider options carry a prompt cache retention
 * setting, under either the camelCase or the snake_case key.
 *
 * Any value other than `undefined` counts as present.
 */
const hasUnsupportedChatGptCacheRetention = (options: JSONObject): boolean => {
  const camelCaseValue = options.promptCacheRetention;
  const snakeCaseValue = options.prompt_cache_retention;
  return !(camelCaseValue === undefined && snakeCaseValue === undefined);
};
|
||||||
|
|
||||||
|
/**
 * Creates the middleware used for OpenAI models that authenticate via ChatGPT.
 *
 * ChatGPT's Codex backend requires top-level Responses API instructions.
 * The standard OpenAI provider otherwise serializes system prompts as input items.
 *
 * The returned `transformParams` hook does two things:
 * - when the prompt has system instructions and `providerOptions.openai.instructions`
 *   is not already a non-empty string, it moves them into `instructions` and drops
 *   the system messages from the prompt;
 * - it removes `promptCacheRetention` / `prompt_cache_retention` from the OpenAI
 *   provider options.
 *
 * When neither applies, the incoming params object is returned untouched.
 */
export function createOpenAiChatGptInstructionsMiddleware(): LanguageModelV3Middleware {
  return {
    specificationVersion: 'v3',
    transformParams: async ({ params }) => {
      const allProviderOptions = params.providerOptions ?? {};
      const currentOpenAiOptions = allProviderOptions.openai ?? {};
      const systemInstructions = getSystemInstructions(params.prompt);

      // Caller-supplied instructions win over instructions derived from system messages.
      const mustInjectInstructions =
        Boolean(systemInstructions) && !isNonEmptyString(currentOpenAiOptions.instructions);
      const mustDropCacheRetention = hasUnsupportedChatGptCacheRetention(currentOpenAiOptions);

      if (!(mustInjectInstructions || mustDropCacheRetention)) {
        return params;
      }

      // Work on a shallow copy so the caller's options object is never mutated.
      const nextOpenAiOptions: JSONObject = { ...currentOpenAiOptions };
      delete nextOpenAiOptions.promptCacheRetention;
      delete nextOpenAiOptions.prompt_cache_retention;

      let nextPrompt = params.prompt;
      if (mustInjectInstructions) {
        nextOpenAiOptions.instructions = systemInstructions;
        nextPrompt = params.prompt.filter((message) => message.role !== 'system');
      }

      return {
        ...params,
        prompt: nextPrompt,
        providerOptions: {
          ...allProviderOptions,
          openai: nextOpenAiOptions,
        },
      } satisfies LanguageModelV3CallOptions;
    },
  };
}
|
||||||
+192
@@ -0,0 +1,192 @@
|
|||||||
|
/**
 * MIME type of an image handed to the OCR engine.
 *
 * The listed literals are the suggested values; any other string is still accepted.
 */
export type TSmartAiOcrImageMimeType =
  | 'image/png'
  | 'image/jpeg'
  | 'image/webp'
  | 'image/gif'
  // `string & {}` rather than plain `string`: a bare `string` member absorbs the
  // literals above, collapsing the union to `string` and hiding them from editor
  // completion. Every string remains assignable to this type.
  | (string & {});
|
||||||
|
|
||||||
|
/** Table output format; the engine forwards it as `table_format` in the OCR request. */
export type TSmartAiMistralOcrTableFormat =
  | 'markdown'
  | 'html';
|
||||||
|
|
||||||
|
/** Confidence score granularity; the engine forwards it as `confidence_scores_granularity`. */
export type TSmartAiMistralOcrConfidenceScoresGranularity =
  | 'page'
  | 'word';
|
||||||
|
|
||||||
|
/** Image payload accepted by `recognizeImage`. */
export interface ISmartAiOcrImageInput {
  /**
   * Base64-encoded image data. The Mistral engine rejects an empty value and
   * embeds this string after `;base64,` in a `data:` URL.
   */
  dataBase64: string;

  /** MIME type placed in the `data:` URL; the Mistral engine rejects an empty value. */
  mimeType: TSmartAiOcrImageMimeType;
}
|
||||||
|
|
||||||
|
/** Normalized OCR output for a single page. */
export interface ISmartAiOcrPageResult {
  /** Page index as reported in the OCR response. */
  index: number;

  /** Recognized text; the Mistral engine fills it from the page's `markdown`. */
  text: string;

  /** Average page confidence score, when the response carried one. */
  confidence?: number;
}
|
||||||
|
|
||||||
|
/**
 * Normalized OCR result.
 *
 * @typeParam TRaw - type of the untouched provider response kept in `raw`
 */
export interface ISmartAiOcrResult<TRaw = unknown> {
  /** Text of all pages; the Mistral engine joins pages with a blank line and trims the result. */
  text: string;

  /** Mean of the per-page confidences; absent when no page reported a confidence. */
  confidence?: number;

  /** Per-page results. */
  pages: ISmartAiOcrPageResult[];

  /** The provider response exactly as returned by the transport. */
  raw: TRaw;
}
|
||||||
|
|
||||||
|
/** OCR engine as returned by `createMistralOcrEngine`. */
export interface ISmartAiOcrEngine {
  /**
   * Runs OCR on a single image.
   *
   * @param input - base64 image data plus its MIME type
   * @param options - per-call settings; each one takes precedence over the engine-level setting
   * @returns the normalized result together with the raw Mistral response
   */
  recognizeImage: (
    input: ISmartAiOcrImageInput,
    options?: ISmartAiMistralOcrRecognizeOptions
  ) => Promise<ISmartAiOcrResult<IMistralOcrResponse>>;
}
|
||||||
|
|
||||||
|
/**
 * Page-level confidence scores of an OCR response page.
 *
 * Each score is declared in both a snake_case and a camelCase spelling;
 * when reading the average, the engine prefers the snake_case one.
 */
export interface IMistralOcrPageConfidenceScores {
  // Average confidence of the page.
  average_page_confidence_score?: number;
  averagePageConfidenceScore?: number;

  // Minimum confidence of the page.
  minimum_page_confidence_score?: number;
  minimumPageConfidenceScore?: number;
}
|
||||||
|
|
||||||
|
/** One page of an OCR response. */
export interface IMistralOcrPageResponse {
  /** Page index. */
  index: number;

  /** Page content; the engine exposes it as the page `text`. */
  markdown: string;

  // Confidence scores in either spelling; the engine falls back to the
  // camelCase property when the snake_case one is null or undefined.
  confidence_scores?: IMistralOcrPageConfidenceScores | null;
  confidenceScores?: IMistralOcrPageConfidenceScores | null;
}
|
||||||
|
|
||||||
|
/**
 * Response body returned by an OCR transport.
 *
 * Optional members are declared in both snake_case and camelCase spellings and
 * are typed `unknown`; the engine passes them through untouched via `raw`.
 */
export interface IMistralOcrResponse {
  /** Recognized pages. */
  pages: IMistralOcrPageResponse[];

  /** Model name reported in the response. */
  model: string;

  document_annotation?: unknown;
  documentAnnotation?: unknown;

  usage_info?: unknown;
  usageInfo?: unknown;
}
|
||||||
|
|
||||||
|
/** Request body handed to an OCR transport; the built-in transport sends it as JSON. */
export interface IMistralOcrRequest {
  /** OCR model name. */
  model: string;

  /** Document to process; the engine always sends an image as a `data:` URL. */
  document: {
    type: 'image_url';
    image_url: string;
  };

  /** The engine sends `false` unless configured otherwise. */
  include_image_base64?: boolean;

  // The remaining settings are left undefined unless configured
  // per call or at engine level.
  table_format?: TSmartAiMistralOcrTableFormat;
  extract_header?: boolean;
  extract_footer?: boolean;
  confidence_scores_granularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
|
||||||
|
|
||||||
|
/**
 * Transport that delivers an OCR request and resolves with the response.
 * Supplying one through the engine options replaces the built-in HTTP transport.
 */
export interface ISmartAiMistralOcrTransport {
  /** Sends `request` and resolves with the OCR response. */
  process: (request: IMistralOcrRequest) => Promise<IMistralOcrResponse>;
}
|
||||||
|
|
||||||
|
/** Engine-level options for `createMistralOcrEngine`. */
export interface ISmartAiMistralOcrOptions {
  /**
   * API key sent as a Bearer token by the built-in HTTP transport.
   * Without a custom `transport`, a missing key makes each request fail.
   */
  apiKey?: string;

  /** OCR model name; defaults to `mistral-ocr-latest`. */
  model?: string;

  /** Endpoint for the built-in HTTP transport; defaults to `https://api.mistral.ai/v1/ocr`. */
  endpointUrl?: string;

  /** Custom transport used instead of the built-in HTTP transport. */
  transport?: ISmartAiMistralOcrTransport;

  // Request defaults; the same-named per-call option takes precedence.

  /** Sent as `include_image_base64`; `false` when set nowhere. */
  includeImageBase64?: boolean;

  /** Sent as `table_format`. */
  tableFormat?: TSmartAiMistralOcrTableFormat;

  /** Sent as `extract_header`. */
  extractHeader?: boolean;

  /** Sent as `extract_footer`. */
  extractFooter?: boolean;

  /** Sent as `confidence_scores_granularity`. */
  confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
|
||||||
|
|
||||||
|
/**
 * Per-call options for `recognizeImage`.
 * A value set here takes precedence over the same-named engine-level option.
 */
export interface ISmartAiMistralOcrRecognizeOptions {
  /** Sent as `include_image_base64`. */
  includeImageBase64?: boolean;

  /** Sent as `table_format`. */
  tableFormat?: TSmartAiMistralOcrTableFormat;

  /** Sent as `extract_header`. */
  extractHeader?: boolean;

  /** Sent as `extract_footer`. */
  extractFooter?: boolean;

  /** Sent as `confidence_scores_granularity`. */
  confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
|
||||||
|
|
||||||
|
// Model name used when the engine options do not specify `model`.
const defaultMistralOcrModel = 'mistral-ocr-latest';
|
||||||
|
// Endpoint the built-in HTTP transport posts to when no `endpointUrl` is given.
const defaultMistralOcrEndpointUrl = 'https://api.mistral.ai/v1/ocr';
|
||||||
|
|
||||||
|
/**
 * Builds the built-in transport that POSTs OCR requests as JSON via `fetch`.
 *
 * The API key is checked when a request is processed, not when the transport
 * is created.
 *
 * @param options - `apiKey` for the Bearer token and an optional `endpointUrl` override
 * @returns a transport whose `process` resolves with the parsed JSON response
 * @throws (from `process`) Error when no API key is configured, or when the
 *   HTTP response is not ok; the latter message carries the status and response body
 */
const createMistralOcrHttpTransport = (options: {
  apiKey?: string;
  endpointUrl?: string;
}): ISmartAiMistralOcrTransport => {
  const process: ISmartAiMistralOcrTransport['process'] = async (request) => {
    const { apiKey, endpointUrl } = options;
    if (!apiKey) {
      throw new Error('Mistral OCR requires an apiKey when no custom transport is provided.');
    }

    const targetUrl = endpointUrl ?? defaultMistralOcrEndpointUrl;
    const httpResponse = await fetch(targetUrl, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(request),
    });

    if (httpResponse.ok) {
      return (await httpResponse.json()) as IMistralOcrResponse;
    }

    // Surface the response body so the caller can see why the API refused the request.
    const errorBody = await httpResponse.text();
    throw new Error(`Mistral OCR request failed with status ${httpResponse.status}: ${errorBody}`);
  };

  return { process };
};
|
||||||
|
|
||||||
|
/**
 * Reads the average confidence score of a response page.
 *
 * Both the scores container and the score itself are looked up in snake_case
 * first, falling back to the camelCase spelling when that value is null or undefined.
 *
 * @param page - a page of the OCR response
 * @returns the average page confidence, or `undefined` when the page has no scores
 */
const getPageConfidence = (page: IMistralOcrPageResponse): number | undefined => {
  const scores = page.confidence_scores ?? page.confidenceScores;
  if (scores === null || scores === undefined) {
    return undefined;
  }

  const {
    average_page_confidence_score: snakeCaseAverage,
    averagePageConfidenceScore: camelCaseAverage,
  } = scores;
  return snakeCaseAverage ?? camelCaseAverage;
};
|
||||||
|
|
||||||
|
/**
 * Creates an OCR engine backed by Mistral's OCR endpoint.
 *
 * Requests go through `options.transport` when supplied, otherwise through the
 * built-in HTTP transport configured with `options.apiKey` and `options.endpointUrl`.
 *
 * @param options - engine-level configuration; every field is optional
 * @returns an engine whose `recognizeImage` sends the image as a base64 `data:`
 *   URL and normalizes the response into text, pages and an averaged confidence
 * @throws (from `recognizeImage`) Error when `dataBase64` or `mimeType` is empty;
 *   errors raised by the transport propagate unchanged
 */
export const createMistralOcrEngine = (
  options: ISmartAiMistralOcrOptions = {}
): ISmartAiOcrEngine => {
  const model = options.model ?? defaultMistralOcrModel;
  const transport =
    options.transport ??
    createMistralOcrHttpTransport({
      apiKey: options.apiKey,
      endpointUrl: options.endpointUrl,
    });

  const recognizeImage: ISmartAiOcrEngine['recognizeImage'] = async (
    input,
    recognizeOptions = {}
  ) => {
    const { dataBase64, mimeType } = input;
    if (!dataBase64) {
      throw new Error('Mistral OCR image input requires dataBase64.');
    }
    if (!mimeType) {
      throw new Error('Mistral OCR image input requires mimeType.');
    }

    // Per-call settings take precedence over engine-level settings.
    const includeImageBase64 =
      recognizeOptions.includeImageBase64 ?? options.includeImageBase64 ?? false;
    const tableFormat = recognizeOptions.tableFormat ?? options.tableFormat;
    const extractHeader = recognizeOptions.extractHeader ?? options.extractHeader;
    const extractFooter = recognizeOptions.extractFooter ?? options.extractFooter;
    const confidenceScoresGranularity =
      recognizeOptions.confidenceScoresGranularity ?? options.confidenceScoresGranularity;

    const response = await transport.process({
      model,
      document: {
        type: 'image_url',
        image_url: `data:${mimeType};base64,${dataBase64}`,
      },
      include_image_base64: includeImageBase64,
      table_format: tableFormat,
      extract_header: extractHeader,
      extract_footer: extractFooter,
      confidence_scores_granularity: confidenceScoresGranularity,
    });

    const pages = response.pages.map((page) => ({
      index: page.index,
      text: page.markdown,
      confidence: getPageConfidence(page),
    }));

    // Average only over pages that actually reported a numeric confidence.
    let confidenceSum = 0;
    let scoredPageCount = 0;
    for (const page of pages) {
      if (typeof page.confidence === 'number') {
        confidenceSum += page.confidence;
        scoredPageCount += 1;
      }
    }
    const confidence = scoredPageCount > 0 ? confidenceSum / scoredPageCount : undefined;

    const text = pages
      .map((page) => page.text)
      .join('\n\n')
      .trim();

    return {
      text,
      confidence,
      pages,
      raw: response,
    };
  };

  return { recognizeImage };
};
|
||||||
Reference in New Issue
Block a user