feat(ocr): add Mistral OCR engine with package export, tests, and documentation
This commit is contained in:
@@ -5,6 +5,14 @@
|
||||
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
- add Mistral OCR engine with package export, tests, and documentation (ocr)
|
||||
- introduces a new `@push.rocks/smartai/ocr` subpath export with `createMistralOcrEngine()` for image OCR via Mistral's Document AI endpoint
|
||||
- adds OCR request/response types, configurable transport and options, and normalized page/confidence results
|
||||
- includes mocked transport tests for OCR requests and input validation
|
||||
- updates package metadata and README content to document the new OCR module
|
||||
|
||||
## 2026-05-14 - 4.0.2
|
||||
|
||||
### Fixes
|
||||
|
||||
+6
-1
@@ -2,7 +2,7 @@
|
||||
"name": "@push.rocks/smartai",
|
||||
"version": "4.0.2",
|
||||
"private": false,
|
||||
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.",
|
||||
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.",
|
||||
"main": "dist_ts/index.js",
|
||||
"typings": "dist_ts/index.d.ts",
|
||||
"type": "module",
|
||||
@@ -27,6 +27,10 @@
|
||||
"import": "./dist_ts_document/index.js",
|
||||
"types": "./dist_ts_document/index.d.ts"
|
||||
},
|
||||
"./ocr": {
|
||||
"import": "./dist_ts_ocr/index.js",
|
||||
"types": "./dist_ts_ocr/index.d.ts"
|
||||
},
|
||||
"./research": {
|
||||
"import": "./dist_ts_research/index.js",
|
||||
"types": "./dist_ts_research/index.d.ts"
|
||||
@@ -87,6 +91,7 @@
|
||||
"ts_audio/**/*",
|
||||
"ts_image/**/*",
|
||||
"ts_document/**/*",
|
||||
"ts_ocr/**/*",
|
||||
"ts_research/**/*",
|
||||
"ts_openai_chatgpt_auth/**/*",
|
||||
"dist_*/**/*",
|
||||
|
||||
+2
-1
@@ -18,6 +18,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
|
||||
- `@push.rocks/smartai/audio` — `textToSpeech()` using OpenAI SDK directly
|
||||
- `@push.rocks/smartai/image` — `generateImage()`, `editImage()` using OpenAI SDK directly
|
||||
- `@push.rocks/smartai/document` — `analyzeDocuments()` using SmartPdf + `generateText`
|
||||
- `@push.rocks/smartai/ocr` — `createMistralOcrEngine()` using Mistral Document AI OCR endpoint
|
||||
- `@push.rocks/smartai/research` — `research()` using `@anthropic-ai/sdk` web_search tool
|
||||
|
||||
## Dependencies
|
||||
@@ -32,7 +33,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
|
||||
## Build
|
||||
|
||||
- `pnpm build` → `tsbuild tsfolders --allowimplicitany`
|
||||
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_research/
|
||||
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_ocr/, ts_research/
|
||||
|
||||
## Important Notes
|
||||
|
||||
|
||||
@@ -479,6 +479,38 @@ console.log(analysis);
|
||||
await stopSmartpdf();
|
||||
```
|
||||
|
||||
### 🔎 OCR — `@push.rocks/smartai/ocr`
|
||||
|
||||
Extract text from images using Mistral Document AI OCR. This uses the documented `https://api.mistral.ai/v1/ocr` endpoint with `mistral-ocr-latest` and returns normalized text plus page-level confidence when requested.
|
||||
|
||||
```typescript
|
||||
import { createMistralOcrEngine } from '@push.rocks/smartai/ocr';
|
||||
import * as fs from 'fs';
|
||||
|
||||
const ocr = createMistralOcrEngine({
|
||||
apiKey: process.env.MISTRAL_API_KEY,
|
||||
confidenceScoresGranularity: 'page',
|
||||
});
|
||||
|
||||
const result = await ocr.recognizeImage({
|
||||
dataBase64: fs.readFileSync('screenshot.png').toString('base64'),
|
||||
mimeType: 'image/png',
|
||||
});
|
||||
|
||||
console.log(result.text);
|
||||
console.log(result.confidence);
|
||||
```
|
||||
|
||||
**`createMistralOcrEngine(options)`** accepts:
|
||||
|
||||
- `apiKey` — Mistral API key, required unless a custom `transport` is supplied
|
||||
- `model` — defaults to `mistral-ocr-latest`
|
||||
- `endpointUrl` — defaults to `https://api.mistral.ai/v1/ocr`
|
||||
- `confidenceScoresGranularity` — `'page'` | `'word'`
|
||||
- `tableFormat` — `'markdown'` | `'html'`
|
||||
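- `extractHeader` / `extractFooter` — optional flags sent to the API as `extract_header` / `extract_footer`
- `includeImageBase64` — sent to the API as `include_image_base64`; defaults to `false`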
|
||||
- `transport` — injectable transport for tests or custom HTTP clients
|
||||
|
||||
### 🔬 Research — `@push.rocks/smartai/research`
|
||||
|
||||
Perform web-search-powered research using Anthropic's `web_search_20250305` tool.
|
||||
@@ -514,6 +546,7 @@ tstest test/test.image.ts --verbose # Image generation
|
||||
tstest test/test.research.ts --verbose # Web research
|
||||
tstest test/test.audio.ts --verbose # Text-to-speech
|
||||
tstest test/test.document.ts --verbose # Document analysis (needs Chromium)
|
||||
tstest test/test.ocr.ts --verbose # Mistral OCR transport (mocked)
|
||||
```
|
||||
|
||||
Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services.
|
||||
@@ -533,6 +566,7 @@ Most tests skip gracefully when API keys are not set. The Ollama tests are fully
|
||||
├── ts_audio/ # @push.rocks/smartai/audio
|
||||
├── ts_image/ # @push.rocks/smartai/image
|
||||
├── ts_document/ # @push.rocks/smartai/document
|
||||
├── ts_ocr/ # @push.rocks/smartai/ocr
|
||||
└── ts_research/ # @push.rocks/smartai/research
|
||||
```
|
||||
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
import { tap, expect } from '@git.zone/tstest/tapbundle';
|
||||
import { createMistralOcrEngine, type ISmartAiMistralOcrTransport } from '../ts_ocr/index.js';
|
||||
|
||||
// Verifies that recognizeImage() turns the image input into a Mistral OCR request
// (data URL document, model, confidence granularity) and normalizes the mocked
// response into text, overall confidence and per-page results.
tap.test('createMistralOcrEngine should call Mistral OCR with image data URLs', async () => {
  // Derive the request type from the transport contract so the assertions below are
  // type-checked instead of being routed through `as any`.
  type TOcrRequest = Parameters<ISmartAiMistralOcrTransport['process']>[0];

  const calls: TOcrRequest[] = [];
  const mockTransport: ISmartAiMistralOcrTransport = {
    process: async (request) => {
      calls.push(request);
      return {
        pages: [
          {
            index: 0,
            markdown: 'hello terminal',
            confidence_scores: {
              average_page_confidence_score: 0.91,
              minimum_page_confidence_score: 0.8,
            },
          },
        ],
        model: 'mistral-ocr-latest',
        usage_info: {
          pages_processed: 1,
          doc_size_bytes: 12,
        },
      };
    },
  };

  const ocrEngine = createMistralOcrEngine({
    transport: mockTransport,
    confidenceScoresGranularity: 'page',
  });

  const result = await ocrEngine.recognizeImage({
    dataBase64: 'iVBORw0KGgo=',
    mimeType: 'image/png',
  });

  // Exactly one request must reach the transport, carrying the expected payload.
  expect(calls.length).toEqual(1);
  const request = calls[0];
  expect(request?.model).toEqual('mistral-ocr-latest');
  expect(request?.document.type).toEqual('image_url');
  expect(request?.document.image_url).toEqual('data:image/png;base64,iVBORw0KGgo=');
  expect(request?.confidence_scores_granularity).toEqual('page');

  // The response is normalized: markdown becomes text, the page average becomes confidence.
  expect(result.text).toEqual('hello terminal');
  expect(result.confidence).toEqual(0.91);
  expect(result.pages).toEqual([
    {
      index: 0,
      text: 'hello terminal',
      confidence: 0.91,
    },
  ]);
});
|
||||
|
||||
// Verifies that recognizeImage() rejects incomplete image input before the
// transport is ever invoked. Both guards (dataBase64 and mimeType) are covered.
tap.test('createMistralOcrEngine should validate image input', async () => {
  let transportCalls = 0;
  const ocrEngine = createMistralOcrEngine({
    transport: {
      process: async () => {
        transportCalls++;
        throw new Error('should not call OCR');
      },
    },
  });

  // Runs recognizeImage() and hands back the rejection (normalized to an Error),
  // or undefined when the call unexpectedly succeeds.
  const captureError = async (
    input: Parameters<typeof ocrEngine.recognizeImage>[0]
  ): Promise<Error | undefined> => {
    try {
      await ocrEngine.recognizeImage(input);
      return undefined;
    } catch (caughtError) {
      return caughtError instanceof Error ? caughtError : new Error(String(caughtError));
    }
  };

  const missingDataError = await captureError({
    dataBase64: '',
    mimeType: 'image/png',
  });
  expect(missingDataError?.message).toEqual('Mistral OCR image input requires dataBase64.');

  const missingMimeTypeError = await captureError({
    dataBase64: 'iVBORw0KGgo=',
    mimeType: '',
  });
  expect(missingMimeTypeError?.message).toEqual('Mistral OCR image input requires mimeType.');

  // Validation must short-circuit before any OCR request is attempted.
  expect(transportCalls).toEqual(0);
});

export default tap.start();
|
||||
+192
@@ -0,0 +1,192 @@
|
||||
/**
 * MIME type of an image passed to the OCR engine.
 *
 * The listed literals exist for editor auto-completion; any other string is still
 * accepted. The open-ended member is written as `string & Record<never, never>`
 * rather than plain `string`, because a bare `string` absorbs the literal members
 * and collapses the whole alias to `string`.
 */
export type TSmartAiOcrImageMimeType =
  | 'image/png'
  | 'image/jpeg'
  | 'image/webp'
  | 'image/gif'
  | (string & Record<never, never>);

/** Values accepted for the `table_format` field of a Mistral OCR request. */
export type TSmartAiMistralOcrTableFormat = 'markdown' | 'html';

/** Values accepted for the `confidence_scores_granularity` field of a Mistral OCR request. */
export type TSmartAiMistralOcrConfidenceScoresGranularity = 'page' | 'word';
|
||||
|
||||
/** A single image to run OCR on, supplied as base64 data plus its MIME type. */
export interface ISmartAiOcrImageInput {
  /** Base64-encoded image bytes; the engine rejects an empty value. */
  dataBase64: string;
  /** MIME type of the image; the engine rejects an empty value. */
  mimeType: TSmartAiOcrImageMimeType;
}

/** Normalized OCR output for one page of the response. */
export interface ISmartAiOcrPageResult {
  /** Page index as reported in the OCR response. */
  index: number;
  /** Recognized page text (taken from the response page's `markdown`). */
  text: string;
  /** Average page confidence, present only when the response carried one. */
  confidence?: number;
}

/**
 * Normalized OCR result.
 *
 * @typeParam TRaw - type of the untouched provider response exposed as `raw`.
 */
export interface ISmartAiOcrResult<TRaw = unknown> {
  /** Text of all pages joined with blank lines and trimmed. */
  text: string;
  /** Mean of the per-page confidences; absent when no page reported one. */
  confidence?: number;
  /** Per-page results in response order. */
  pages: Array<ISmartAiOcrPageResult>;
  /** The provider response exactly as returned by the transport. */
  raw: TRaw;
}

/** OCR engine contract as returned by `createMistralOcrEngine()`. */
export interface ISmartAiOcrEngine {
  /**
   * Runs OCR on one image.
   *
   * @param input - image data and MIME type
   * @param options - optional per-call settings
   * @returns the normalized result together with the raw Mistral response
   */
  recognizeImage: (
    input: ISmartAiOcrImageInput,
    options?: ISmartAiMistralOcrRecognizeOptions
  ) => Promise<ISmartAiOcrResult<IMistralOcrResponse>>;
}
|
||||
|
||||
/**
 * Page-level confidence figures attached to a Mistral OCR page.
 *
 * Each figure is declared in a snake_case and a camelCase spelling. When reading
 * the average, this module prefers the snake_case field and falls back to the
 * camelCase one.
 */
export interface IMistralOcrPageConfidenceScores {
  /** Average confidence for the page (snake_case spelling). */
  average_page_confidence_score?: number;
  /** Average confidence for the page (camelCase spelling). */
  averagePageConfidenceScore?: number;
  /** Minimum confidence for the page (snake_case spelling). */
  minimum_page_confidence_score?: number;
  /** Minimum confidence for the page (camelCase spelling). */
  minimumPageConfidenceScore?: number;
}

/** One page of a Mistral OCR response. */
export interface IMistralOcrPageResponse {
  /** Page index within the response. */
  index: number;
  /** Recognized page content; surfaced as the page `text` in the normalized result. */
  markdown: string;
  /** Confidence scores (snake_case spelling); may be missing or `null`. */
  confidence_scores?: IMistralOcrPageConfidenceScores | null;
  /** Confidence scores (camelCase spelling); may be missing or `null`. */
  confidenceScores?: IMistralOcrPageConfidenceScores | null;
}

/**
 * Response body of a Mistral OCR call as consumed by this module.
 *
 * Only `pages` is interpreted by the engine; the remaining fields are typed
 * loosely and passed through via the result's `raw` property.
 */
export interface IMistralOcrResponse {
  /** Pages in the order returned by the transport. */
  pages: Array<IMistralOcrPageResponse>;
  /** Model name reported in the response. */
  model: string;
  /** Uninterpreted annotation payload (snake_case spelling). */
  document_annotation?: unknown;
  /** Uninterpreted annotation payload (camelCase spelling). */
  documentAnnotation?: unknown;
  /** Uninterpreted usage payload (snake_case spelling). */
  usage_info?: unknown;
  /** Uninterpreted usage payload (camelCase spelling). */
  usageInfo?: unknown;
}
|
||||
|
||||
/**
 * Request payload handed to an OCR transport. The built-in HTTP transport
 * serializes it as the JSON body of the POST request.
 */
export interface IMistralOcrRequest {
  /** Name of the OCR model to use. */
  model: string;
  /** The document to process; this module always sends an image reference. */
  document: {
    type: 'image_url';
    /** Image location; the engine fills this with a `data:<mime>;base64,<data>` URL. */
    image_url: string;
  };
  /** Request field `include_image_base64`. */
  include_image_base64?: boolean;
  /** Request field `table_format`. */
  table_format?: TSmartAiMistralOcrTableFormat;
  /** Request field `extract_header`. */
  extract_header?: boolean;
  /** Request field `extract_footer`. */
  extract_footer?: boolean;
  /** Request field `confidence_scores_granularity`. */
  confidence_scores_granularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}

/**
 * Pluggable transport that performs the actual OCR call. Supply a custom one to
 * `createMistralOcrEngine()` for tests or to use a different HTTP client.
 */
export interface ISmartAiMistralOcrTransport {
  /**
   * Sends one OCR request.
   *
   * @param request - the fully assembled request payload
   * @returns the OCR response
   */
  process: (request: IMistralOcrRequest) => Promise<IMistralOcrResponse>;
}
|
||||
|
||||
/**
 * Settings that can be supplied per `recognizeImage()` call. A value given here
 * takes precedence over the same setting on the engine options.
 */
export interface ISmartAiMistralOcrRecognizeOptions {
  /** Sent as `include_image_base64`; the engine sends `false` when set on neither level. */
  includeImageBase64?: boolean;
  /** Sent as `table_format`. */
  tableFormat?: TSmartAiMistralOcrTableFormat;
  /** Sent as `extract_header`. */
  extractHeader?: boolean;
  /** Sent as `extract_footer`. */
  extractFooter?: boolean;
  /** Sent as `confidence_scores_granularity`. */
  confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}

/**
 * Options for `createMistralOcrEngine()`.
 *
 * Extends the per-call options, so every per-call setting can also be given once
 * as an engine-wide default. Extending keeps the two option sets from drifting apart.
 */
export interface ISmartAiMistralOcrOptions extends ISmartAiMistralOcrRecognizeOptions {
  /** Mistral API key; required at request time unless a custom `transport` is supplied. */
  apiKey?: string;
  /** OCR model name; defaults to `mistral-ocr-latest`. */
  model?: string;
  /** OCR endpoint URL; defaults to `https://api.mistral.ai/v1/ocr`. */
  endpointUrl?: string;
  /** Custom transport; when set it replaces the built-in HTTP transport. */
  transport?: ISmartAiMistralOcrTransport;
}
|
||||
|
||||
/** Model used when the engine options do not name one. */
const defaultMistralOcrModel = 'mistral-ocr-latest';

/** Endpoint used when the transport options do not override it. */
const defaultMistralOcrEndpointUrl = 'https://api.mistral.ai/v1/ocr';

/**
 * Builds the default transport, which POSTs the request as JSON using `fetch`.
 *
 * The API key is checked on each request rather than at construction, so a
 * transport can be created without a key and only fails once it is used.
 *
 * @param options - API key and optional endpoint override
 * @returns a transport whose `process()` resolves with the parsed JSON response
 * @throws Error from `process()` when no API key is configured, or when the
 *   endpoint answers with a non-OK status (status code and body are included)
 */
const createMistralOcrHttpTransport = (options: {
  apiKey?: string;
  endpointUrl?: string;
}): ISmartAiMistralOcrTransport => ({
  process: async (request) => {
    const { apiKey, endpointUrl } = options;
    if (!apiKey) {
      throw new Error('Mistral OCR requires an apiKey when no custom transport is provided.');
    }

    const targetUrl = endpointUrl ?? defaultMistralOcrEndpointUrl;
    const httpResponse = await fetch(targetUrl, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(request),
    });

    if (httpResponse.ok) {
      return (await httpResponse.json()) as IMistralOcrResponse;
    }

    // Surface the response body so API error details are not lost.
    const errorBody = await httpResponse.text();
    throw new Error(`Mistral OCR request failed with status ${httpResponse.status}: ${errorBody}`);
  },
});
|
||||
|
||||
/**
 * Reads the average confidence of a response page.
 *
 * The snake_case `confidence_scores` object is preferred and `confidenceScores`
 * is used only when the former is null or undefined. Within the chosen object
 * the snake_case average wins over the camelCase one in the same way.
 *
 * @param page - one page of a Mistral OCR response
 * @returns the average page confidence, or `undefined` when none is present
 */
const getPageConfidence = (page: IMistralOcrPageResponse): number | undefined => {
  const scores = page.confidence_scores ?? page.confidenceScores;
  if (scores == null) {
    return undefined;
  }

  const {
    average_page_confidence_score: snakeCaseAverage,
    averagePageConfidenceScore: camelCaseAverage,
  } = scores;
  return snakeCaseAverage ?? camelCaseAverage;
};
|
||||
|
||||
/**
 * Creates an OCR engine backed by Mistral OCR.
 *
 * A custom `options.transport` is used when supplied; otherwise the built-in HTTP
 * transport is created from `apiKey` and `endpointUrl`. Engine-level settings act
 * as defaults that individual `recognizeImage()` calls may override.
 *
 * @param options - engine configuration; every field is optional
 * @returns an engine whose `recognizeImage()` resolves with normalized text, an
 *   overall confidence (mean of the page averages, if any) and the raw response
 * @throws Error from `recognizeImage()` when `dataBase64` or `mimeType` is empty;
 *   transport errors propagate unchanged
 */
export const createMistralOcrEngine = (
  options: ISmartAiMistralOcrOptions = {}
): ISmartAiOcrEngine => {
  const transport =
    options.transport ??
    createMistralOcrHttpTransport({
      apiKey: options.apiKey,
      endpointUrl: options.endpointUrl,
    });
  const model = options.model ?? defaultMistralOcrModel;

  return {
    recognizeImage: async (input, recognizeOptions = {}) => {
      // Reject incomplete input before anything is sent to the transport.
      if (!input.dataBase64) {
        throw new Error('Mistral OCR image input requires dataBase64.');
      }
      if (!input.mimeType) {
        throw new Error('Mistral OCR image input requires mimeType.');
      }

      // Per-call settings win over engine-level settings.
      const ocrRequest = {
        model,
        document: {
          type: 'image_url' as const,
          image_url: `data:${input.mimeType};base64,${input.dataBase64}`,
        },
        include_image_base64:
          recognizeOptions.includeImageBase64 ?? options.includeImageBase64 ?? false,
        table_format: recognizeOptions.tableFormat ?? options.tableFormat,
        extract_header: recognizeOptions.extractHeader ?? options.extractHeader,
        extract_footer: recognizeOptions.extractFooter ?? options.extractFooter,
        confidence_scores_granularity:
          recognizeOptions.confidenceScoresGranularity ?? options.confidenceScoresGranularity,
      };
      const response = await transport.process(ocrRequest);

      const pages = response.pages.map((ocrPage) => {
        const pageConfidence = getPageConfidence(ocrPage);
        return {
          index: ocrPage.index,
          text: ocrPage.markdown,
          confidence: pageConfidence,
        };
      });

      // Average only over pages that actually reported a numeric confidence.
      let confidenceSum = 0;
      let scoredPageCount = 0;
      for (const page of pages) {
        if (typeof page.confidence === 'number') {
          confidenceSum += page.confidence;
          scoredPageCount += 1;
        }
      }
      const confidence = scoredPageCount > 0 ? confidenceSum / scoredPageCount : undefined;

      const pageTexts = pages.map(({ text }) => text);
      const text = pageTexts.join('\n\n').trim();

      return {
        text,
        confidence,
        pages,
        raw: response,
      };
    },
  };
};
|
||||
Reference in New Issue
Block a user