feat(ocr): add Mistral OCR engine with package export, tests, and documentation

This commit is contained in:
2026-05-18 00:13:27 +00:00
parent 1d64ee3edb
commit 269e948453
6 changed files with 319 additions and 2 deletions
+8
View File
@@ -5,6 +5,14 @@
### Features
- add Mistral OCR engine with package export, tests, and documentation (ocr)
- introduces a new `@push.rocks/smartai/ocr` subpath export with `createMistralOcrEngine()` for image OCR via Mistral's Document AI endpoint
- adds OCR request/response types, configurable transport and options, and normalized page/confidence results
- includes mocked transport tests for OCR requests and input validation
- updates package metadata and README content to document the new OCR module
## 2026-05-14 - 4.0.2 ## 2026-05-14 - 4.0.2
### Fixes ### Fixes
+6 -1
View File
@@ -2,7 +2,7 @@
"name": "@push.rocks/smartai", "name": "@push.rocks/smartai",
"version": "4.0.2", "version": "4.0.2",
"private": false, "private": false,
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.", "description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.",
"main": "dist_ts/index.js", "main": "dist_ts/index.js",
"typings": "dist_ts/index.d.ts", "typings": "dist_ts/index.d.ts",
"type": "module", "type": "module",
@@ -27,6 +27,10 @@
"import": "./dist_ts_document/index.js", "import": "./dist_ts_document/index.js",
"types": "./dist_ts_document/index.d.ts" "types": "./dist_ts_document/index.d.ts"
}, },
"./ocr": {
"import": "./dist_ts_ocr/index.js",
"types": "./dist_ts_ocr/index.d.ts"
},
"./research": { "./research": {
"import": "./dist_ts_research/index.js", "import": "./dist_ts_research/index.js",
"types": "./dist_ts_research/index.d.ts" "types": "./dist_ts_research/index.d.ts"
@@ -87,6 +91,7 @@
"ts_audio/**/*", "ts_audio/**/*",
"ts_image/**/*", "ts_image/**/*",
"ts_document/**/*", "ts_document/**/*",
"ts_ocr/**/*",
"ts_research/**/*", "ts_research/**/*",
"ts_openai_chatgpt_auth/**/*", "ts_openai_chatgpt_auth/**/*",
"dist_*/**/*", "dist_*/**/*",
+2 -1
View File
@@ -18,6 +18,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
- `@push.rocks/smartai/audio``textToSpeech()` using OpenAI SDK directly - `@push.rocks/smartai/audio``textToSpeech()` using OpenAI SDK directly
- `@push.rocks/smartai/image``generateImage()`, `editImage()` using OpenAI SDK directly - `@push.rocks/smartai/image``generateImage()`, `editImage()` using OpenAI SDK directly
- `@push.rocks/smartai/document``analyzeDocuments()` using SmartPdf + `generateText` - `@push.rocks/smartai/document``analyzeDocuments()` using SmartPdf + `generateText`
- `@push.rocks/smartai/ocr` → `createMistralOcrEngine()` using Mistral Document AI OCR endpoint
- `@push.rocks/smartai/research``research()` using `@anthropic-ai/sdk` web_search tool - `@push.rocks/smartai/research``research()` using `@anthropic-ai/sdk` web_search tool
## Dependencies ## Dependencies
@@ -32,7 +33,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
## Build ## Build
- `pnpm build``tsbuild tsfolders --allowimplicitany` - `pnpm build``tsbuild tsfolders --allowimplicitany`
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_research/ - Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_ocr/, ts_research/
## Important Notes ## Important Notes
+34
View File
@@ -479,6 +479,38 @@ console.log(analysis);
await stopSmartpdf(); await stopSmartpdf();
``` ```
### 🔎 OCR — `@push.rocks/smartai/ocr`
Extract text from images using Mistral Document AI OCR. This uses the documented `https://api.mistral.ai/v1/ocr` endpoint with `mistral-ocr-latest` and returns normalized text plus page-level confidence when requested.
```typescript
import { createMistralOcrEngine } from '@push.rocks/smartai/ocr';
import * as fs from 'fs';
const ocr = createMistralOcrEngine({
apiKey: process.env.MISTRAL_API_KEY,
confidenceScoresGranularity: 'page',
});
const result = await ocr.recognizeImage({
dataBase64: fs.readFileSync('screenshot.png').toString('base64'),
mimeType: 'image/png',
});
console.log(result.text);
console.log(result.confidence);
```
**`createMistralOcrEngine(options)`** accepts:
- `apiKey` — Mistral API key, required unless a custom `transport` is supplied
- `model` — defaults to `mistral-ocr-latest`
- `endpointUrl` — defaults to `https://api.mistral.ai/v1/ocr`
- `confidenceScoresGranularity` — `'page'` | `'word'`
- `tableFormat` — `'markdown'` | `'html'`
- `extractHeader` / `extractFooter` — optional document OCR flags
- `transport` — injectable transport for tests or custom HTTP clients
### 🔬 Research — `@push.rocks/smartai/research` ### 🔬 Research — `@push.rocks/smartai/research`
Perform web-search-powered research using Anthropic's `web_search_20250305` tool. Perform web-search-powered research using Anthropic's `web_search_20250305` tool.
@@ -514,6 +546,7 @@ tstest test/test.image.ts --verbose # Image generation
tstest test/test.research.ts --verbose # Web research tstest test/test.research.ts --verbose # Web research
tstest test/test.audio.ts --verbose # Text-to-speech tstest test/test.audio.ts --verbose # Text-to-speech
tstest test/test.document.ts --verbose # Document analysis (needs Chromium) tstest test/test.document.ts --verbose # Document analysis (needs Chromium)
tstest test/test.ocr.ts --verbose # Mistral OCR transport (mocked)
``` ```
Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services. Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services.
@@ -533,6 +566,7 @@ Most tests skip gracefully when API keys are not set. The Ollama tests are fully
├── ts_audio/ # @push.rocks/smartai/audio ├── ts_audio/ # @push.rocks/smartai/audio
├── ts_image/ # @push.rocks/smartai/image ├── ts_image/ # @push.rocks/smartai/image
├── ts_document/ # @push.rocks/smartai/document ├── ts_document/ # @push.rocks/smartai/document
├── ts_ocr/ # @push.rocks/smartai/ocr
└── ts_research/ # @push.rocks/smartai/research └── ts_research/ # @push.rocks/smartai/research
``` ```
+77
View File
@@ -0,0 +1,77 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import { createMistralOcrEngine, type ISmartAiMistralOcrTransport } from '../ts_ocr/index.js';
// Verifies the request sent to the transport and the normalized result built
// from a canned single-page response. No network access is involved.
tap.test('createMistralOcrEngine should call Mistral OCR with image data URLs', async () => {
  const calls: unknown[] = [];

  // Transport double: records every request and answers with one OCR page.
  const mockTransport: ISmartAiMistralOcrTransport = {
    process: async (request) => {
      calls.push(request);
      const confidenceScores = {
        average_page_confidence_score: 0.91,
        minimum_page_confidence_score: 0.8,
      };
      const usageInfo = {
        pages_processed: 1,
        doc_size_bytes: 12,
      };
      return {
        pages: [{ index: 0, markdown: 'hello terminal', confidence_scores: confidenceScores }],
        model: 'mistral-ocr-latest',
        usage_info: usageInfo,
      };
    },
  };

  const ocrEngine = createMistralOcrEngine({
    transport: mockTransport,
    confidenceScoresGranularity: 'page',
  });
  const imageInput = { dataBase64: 'iVBORw0KGgo=', mimeType: 'image/png' };
  const result = await ocrEngine.recognizeImage(imageInput);

  // Exactly one request must have reached the transport.
  expect(calls.length).toEqual(1);
  const sentRequest = calls[0] as any;
  expect(sentRequest.model).toEqual('mistral-ocr-latest');
  expect(sentRequest.document.type).toEqual('image_url');
  expect(sentRequest.document.image_url).toEqual('data:image/png;base64,iVBORw0KGgo=');
  expect(sentRequest.confidence_scores_granularity).toEqual('page');

  // The result exposes the page markdown as text and the average page confidence.
  expect(result.text).toEqual('hello terminal');
  expect(result.confidence).toEqual(0.91);
  const expectedPages = [{ index: 0, text: 'hello terminal', confidence: 0.91 }];
  expect(result.pages).toEqual(expectedPages);
});
// Verifies that an empty base64 payload is rejected before the transport is used.
tap.test('createMistralOcrEngine should validate image input', async () => {
  // Any call reaching this transport fails with a message different from the
  // expected validation error, which makes the final assertion fail.
  const failingTransport = {
    process: async () => {
      throw new Error('should not call OCR');
    },
  };
  const ocrEngine = createMistralOcrEngine({ transport: failingTransport });

  let failureMessage: string | undefined;
  try {
    await ocrEngine.recognizeImage({
      dataBase64: '',
      mimeType: 'image/png',
    });
  } catch (thrown) {
    // Non-Error throwables are reduced to their string form.
    failureMessage = thrown instanceof Error ? thrown.message : String(thrown);
  }

  expect(failureMessage).toEqual('Mistral OCR image input requires dataBase64.');
});
// Start the tap tests registered above and default-export whatever tap.start() returns.
export default tap.start();
+192
View File
@@ -0,0 +1,192 @@
/**
 * MIME type of an image handed to the OCR engine.
 *
 * The listed literals are offered as editor suggestions; any other string is
 * still accepted. `string & {}` is used instead of a bare `string` member
 * because a bare `string` absorbs the literals and collapses the whole union
 * to plain `string`.
 */
export type TSmartAiOcrImageMimeType =
  | 'image/png'
  | 'image/jpeg'
  | 'image/webp'
  | 'image/gif'
  | (string & {});
/** Table output formats that can be requested; forwarded in the request as `table_format`. */
export type TSmartAiMistralOcrTableFormat = 'markdown' | 'html';
/** Confidence score granularities that can be requested; forwarded in the request as `confidence_scores_granularity`. */
export type TSmartAiMistralOcrConfidenceScoresGranularity = 'page' | 'word';
/** Image payload accepted by `recognizeImage`. */
export interface ISmartAiOcrImageInput {
  /**
   * Base64-encoded image data without a `data:` prefix; the engine builds the
   * `data:<mimeType>;base64,<dataBase64>` URL itself. An empty value is rejected.
   */
  dataBase64: string;
  /** MIME type inserted into the data URL. An empty value is rejected. */
  mimeType: TSmartAiOcrImageMimeType;
}
/** Normalized OCR output for a single page of the provider response. */
export interface ISmartAiOcrPageResult {
  /** Page index as reported by the provider response. */
  index: number;
  /** Recognized content of the page, taken from the page's `markdown` field. */
  text: string;
  /** Average page confidence, present only when the response carried confidence scores. */
  confidence?: number;
}
/**
 * Normalized OCR output for a whole request.
 *
 * @typeParam TRaw - type of the untouched provider response kept in `raw`.
 */
export interface ISmartAiOcrResult<TRaw = unknown> {
  /** Page texts joined with a blank line between pages, trimmed at both ends. */
  text: string;
  /** Arithmetic mean of the numeric page confidences; `undefined` when no page has one. */
  confidence?: number;
  /** Per-page results in the order the provider returned them. */
  pages: ISmartAiOcrPageResult[];
  /** The provider response exactly as returned by the transport. */
  raw: TRaw;
}
/** OCR engine contract returned by `createMistralOcrEngine`. */
export interface ISmartAiOcrEngine {
  /**
   * Runs OCR on a single image.
   *
   * @param input - base64 image data plus its MIME type
   * @param options - optional per-call flags; each one that is set takes
   *   precedence over the corresponding engine-level option
   * @returns the normalized result, with the raw Mistral response in `raw`
   */
  recognizeImage: (
    input: ISmartAiOcrImageInput,
    options?: ISmartAiMistralOcrRecognizeOptions
  ) => Promise<ISmartAiOcrResult<IMistralOcrResponse>>;
}
/**
 * Page-level confidence scores of a Mistral OCR page.
 *
 * Each score is declared in snake_case and camelCase; the engine reads the
 * snake_case average first and falls back to the camelCase one.
 * NOTE(review): presumably the camelCase variants cover SDK-shaped payloads
 * from custom transports — confirm.
 */
export interface IMistralOcrPageConfidenceScores {
  average_page_confidence_score?: number;
  averagePageConfidenceScore?: number;
  // The minimum scores are part of the payload shape but are not read by the engine in this file.
  minimum_page_confidence_score?: number;
  minimumPageConfidenceScore?: number;
}
/** One page of a Mistral OCR response. */
export interface IMistralOcrPageResponse {
  /** Page index as reported by the provider. */
  index: number;
  /** Recognized page content; becomes `text` in the normalized page result. */
  markdown: string;
  // Confidence container in snake_case or camelCase; snake_case wins when both are set.
  confidence_scores?: IMistralOcrPageConfidenceScores | null;
  confidenceScores?: IMistralOcrPageConfidenceScores | null;
}
/** Top-level Mistral OCR response as returned by a transport. */
export interface IMistralOcrResponse {
  /** OCR pages; the engine maps each one to a normalized page result. */
  pages: IMistralOcrPageResponse[];
  /** Model name reported by the provider. */
  model: string;
  // The fields below are not interpreted by the engine; they are only
  // reachable through the `raw` property of the normalized result.
  document_annotation?: unknown;
  documentAnnotation?: unknown;
  usage_info?: unknown;
  usageInfo?: unknown;
}
/** Request body handed to the transport; the default HTTP transport sends it as JSON. */
export interface IMistralOcrRequest {
  /** OCR model name. */
  model: string;
  /** Image document; `image_url` holds the `data:<mime>;base64,<data>` URL built by the engine. */
  document: {
    type: 'image_url';
    image_url: string;
  };
  /** Set to `false` by the engine unless enabled per call or at engine level. */
  include_image_base64?: boolean;
  // The remaining flags are left `undefined` when neither the call nor the engine options set them.
  table_format?: TSmartAiMistralOcrTableFormat;
  extract_header?: boolean;
  extract_footer?: boolean;
  confidence_scores_granularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
/** Pluggable transport that turns an OCR request into an OCR response (HTTP by default, mockable in tests). */
export interface ISmartAiMistralOcrTransport {
  process: (request: IMistralOcrRequest) => Promise<IMistralOcrResponse>;
}
/**
 * OCR flags that can be supplied per `recognizeImage` call.
 *
 * The same flags can be set once at engine level through
 * {@link ISmartAiMistralOcrOptions}; a per-call value takes precedence.
 */
export interface ISmartAiMistralOcrRecognizeOptions {
  /** Forwarded as `include_image_base64`; the engine sends `false` when unset everywhere. */
  includeImageBase64?: boolean;
  /** Forwarded as `table_format`. */
  tableFormat?: TSmartAiMistralOcrTableFormat;
  /** Forwarded as `extract_header`. */
  extractHeader?: boolean;
  /** Forwarded as `extract_footer`. */
  extractFooter?: boolean;
  /** Forwarded as `confidence_scores_granularity`. */
  confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}

/**
 * Engine-level configuration for `createMistralOcrEngine`.
 *
 * Extends the per-call flags so both option sets share one declaration; the
 * flags given here act as defaults for every `recognizeImage` call.
 */
export interface ISmartAiMistralOcrOptions extends ISmartAiMistralOcrRecognizeOptions {
  /** Mistral API key; required by the built-in HTTP transport, unused when `transport` is set. */
  apiKey?: string;
  /** OCR model name; the engine falls back to its default model when omitted. */
  model?: string;
  /** Endpoint for the built-in HTTP transport; falls back to the default endpoint when omitted. */
  endpointUrl?: string;
  /** Custom transport that replaces the built-in HTTP transport (e.g. for tests). */
  transport?: ISmartAiMistralOcrTransport;
}
// Model name used when the engine options do not specify `model`.
const defaultMistralOcrModel = 'mistral-ocr-latest';
// Endpoint used by the built-in HTTP transport when no `endpointUrl` is given.
const defaultMistralOcrEndpointUrl = 'https://api.mistral.ai/v1/ocr';
/**
 * Builds the default transport, which POSTs the OCR request as JSON via `fetch`.
 *
 * The API key is checked when a request is processed, not when the transport
 * is created.
 *
 * @param options - `apiKey` for the bearer token and an optional `endpointUrl`
 *   overriding the default endpoint
 * @returns a transport whose `process` resolves with the parsed JSON response
 * @throws Error from `process` when `apiKey` is missing or the HTTP status is not ok
 */
const createMistralOcrHttpTransport = (options: {
  apiKey?: string;
  endpointUrl?: string;
}): ISmartAiMistralOcrTransport => ({
  async process(request) {
    const { apiKey, endpointUrl } = options;
    if (!apiKey) {
      throw new Error('Mistral OCR requires an apiKey when no custom transport is provided.');
    }

    const targetUrl = endpointUrl ?? defaultMistralOcrEndpointUrl;
    const httpResponse = await fetch(targetUrl, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(request),
    });

    if (httpResponse.ok) {
      return (await httpResponse.json()) as IMistralOcrResponse;
    }

    // Surface the response body so the caller can see why the request was refused.
    const failureDetails = await httpResponse.text();
    throw new Error(
      `Mistral OCR request failed with status ${httpResponse.status}: ${failureDetails}`
    );
  },
});
/**
 * Extracts the average page confidence from an OCR page.
 *
 * The snake_case container and field are preferred; the camelCase variants
 * are used as fallbacks.
 *
 * @param page - page object of the OCR response
 * @returns the average confidence, or `undefined` when the page carries none
 */
const getPageConfidence = (page: IMistralOcrPageResponse): number | undefined => {
  const scores = page.confidence_scores ?? page.confidenceScores;
  if (scores == null) {
    return undefined;
  }

  const {
    average_page_confidence_score: snakeCaseAverage,
    averagePageConfidenceScore: camelCaseAverage,
  } = scores;
  return snakeCaseAverage ?? camelCaseAverage;
};
/**
 * Creates an OCR engine backed by Mistral's OCR endpoint.
 *
 * A custom `options.transport` replaces the built-in HTTP transport; otherwise
 * requests are sent with `options.apiKey` to `options.endpointUrl` (or the
 * default endpoint). OCR flags given here act as defaults that individual
 * `recognizeImage` calls can override.
 *
 * @param options - engine-level configuration; every field is optional
 * @returns an engine whose `recognizeImage` resolves with the normalized result
 *   (`text`, aggregate `confidence`, per-page results and the raw response)
 * @throws Error from `recognizeImage` when `dataBase64` or `mimeType` is
 *   missing or empty, or when the transport response has no `pages` array;
 *   transport errors propagate unchanged
 */
export const createMistralOcrEngine = (
  options: ISmartAiMistralOcrOptions = {}
): ISmartAiOcrEngine => {
  const transport =
    options.transport ??
    createMistralOcrHttpTransport({
      apiKey: options.apiKey,
      endpointUrl: options.endpointUrl,
    });
  const model = options.model ?? defaultMistralOcrModel;

  return {
    recognizeImage: async (input, recognizeOptions = {}) => {
      // Optional chaining turns a missing `input` into the validation error
      // below instead of an opaque TypeError.
      if (!input?.dataBase64) {
        throw new Error('Mistral OCR image input requires dataBase64.');
      }
      if (!input.mimeType) {
        throw new Error('Mistral OCR image input requires mimeType.');
      }

      // Per-call flags win over engine-level flags.
      const response = await transport.process({
        model,
        document: {
          type: 'image_url',
          image_url: `data:${input.mimeType};base64,${input.dataBase64}`,
        },
        include_image_base64:
          recognizeOptions.includeImageBase64 ?? options.includeImageBase64 ?? false,
        table_format: recognizeOptions.tableFormat ?? options.tableFormat,
        extract_header: recognizeOptions.extractHeader ?? options.extractHeader,
        extract_footer: recognizeOptions.extractFooter ?? options.extractFooter,
        confidence_scores_granularity:
          recognizeOptions.confidenceScoresGranularity ?? options.confidenceScoresGranularity,
      });

      // The response type is only a cast of parsed JSON (or whatever a custom
      // transport returns), so verify the one field the mapping depends on.
      if (!Array.isArray(response?.pages)) {
        throw new Error('Mistral OCR response did not include a pages array.');
      }

      const pages = response.pages.map((page) => ({
        index: page.index,
        // Keep `text` a string even when a page arrives without markdown.
        text: typeof page.markdown === 'string' ? page.markdown : '',
        confidence: getPageConfidence(page),
      }));

      // Aggregate confidence is the mean over pages that reported one.
      const pageConfidences = pages
        .map((page) => page.confidence)
        .filter((confidence): confidence is number => typeof confidence === 'number');
      const confidence = pageConfidences.length
        ? pageConfidences.reduce((sum, value) => sum + value, 0) / pageConfidences.length
        : undefined;

      return {
        text: pages.map((page) => page.text).join('\n\n').trim(),
        confidence,
        pages,
        raw: response,
      };
    },
  };
};