Compare commits

..

2 Commits

Author SHA1 Message Date
jkunz a7ae676184 v4.1.0
Default (tags) / security (push) Failing after 1s
Default (tags) / test (push) Failing after 1s
Default (tags) / release (push) Has been skipped
Default (tags) / metadata (push) Has been skipped
2026-05-18 00:13:37 +00:00
jkunz 269e948453 feat(ocr): add Mistral OCR engine with package export, tests, and documentation 2026-05-18 00:13:27 +00:00
7 changed files with 325 additions and 5 deletions
+11
View File
@@ -5,6 +5,17 @@
## 2026-05-18 - 4.1.0
### Features
- add Mistral OCR engine with package export, tests, and documentation (ocr)
- introduces a new `@push.rocks/smartai/ocr` subpath export with `createMistralOcrEngine()` for image OCR via Mistral's Document AI endpoint
- adds OCR request/response types, configurable transport and options, and normalized page/confidence results
- includes mocked transport tests for OCR requests and input validation
- updates package metadata and README content to document the new OCR module
## 2026-05-14 - 4.0.2
### Fixes
+7 -2
View File
@@ -1,8 +1,8 @@
{
"name": "@push.rocks/smartai",
"version": "4.0.2",
"version": "4.1.0",
"private": false,
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.",
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.",
"main": "dist_ts/index.js",
"typings": "dist_ts/index.d.ts",
"type": "module",
@@ -27,6 +27,10 @@
"import": "./dist_ts_document/index.js",
"types": "./dist_ts_document/index.d.ts"
},
"./ocr": {
"import": "./dist_ts_ocr/index.js",
"types": "./dist_ts_ocr/index.d.ts"
},
"./research": {
"import": "./dist_ts_research/index.js",
"types": "./dist_ts_research/index.d.ts"
@@ -87,6 +91,7 @@
"ts_audio/**/*",
"ts_image/**/*",
"ts_document/**/*",
"ts_ocr/**/*",
"ts_research/**/*",
"ts_openai_chatgpt_auth/**/*",
"dist_*/**/*",
+2 -1
View File
@@ -18,6 +18,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
- `@push.rocks/smartai/audio` → `textToSpeech()` using OpenAI SDK directly
- `@push.rocks/smartai/image` → `generateImage()`, `editImage()` using OpenAI SDK directly
- `@push.rocks/smartai/document` → `analyzeDocuments()` using SmartPdf + `generateText`
- `@push.rocks/smartai/ocr` → `createMistralOcrEngine()` using Mistral Document AI OCR endpoint
- `@push.rocks/smartai/research` → `research()` using `@anthropic-ai/sdk` web_search tool
## Dependencies
@@ -32,7 +33,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
## Build
- `pnpm build` → `tsbuild tsfolders --allowimplicitany`
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_research/
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_ocr/, ts_research/
## Important Notes
+34
View File
@@ -479,6 +479,38 @@ console.log(analysis);
await stopSmartpdf();
```
### 🔎 OCR — `@push.rocks/smartai/ocr`
Extract text from images using Mistral Document AI OCR. This uses the documented `https://api.mistral.ai/v1/ocr` endpoint with `mistral-ocr-latest` and returns normalized text plus page-level confidence when requested.
```typescript
import { createMistralOcrEngine } from '@push.rocks/smartai/ocr';
import * as fs from 'fs';
const ocr = createMistralOcrEngine({
apiKey: process.env.MISTRAL_API_KEY,
confidenceScoresGranularity: 'page',
});
const result = await ocr.recognizeImage({
dataBase64: fs.readFileSync('screenshot.png').toString('base64'),
mimeType: 'image/png',
});
console.log(result.text);
console.log(result.confidence);
```
**`createMistralOcrEngine(options)`** accepts:
- `apiKey` — Mistral API key, required unless a custom `transport` is supplied
- `model` — defaults to `mistral-ocr-latest`
- `endpointUrl` — defaults to `https://api.mistral.ai/v1/ocr`
- `confidenceScoresGranularity` — `'page'` | `'word'`
- `tableFormat` — `'markdown'` | `'html'`
- `extractHeader` / `extractFooter` — optional document OCR flags
- `transport` — injectable transport for tests or custom HTTP clients
### 🔬 Research — `@push.rocks/smartai/research`
Perform web-search-powered research using Anthropic's `web_search_20250305` tool.
@@ -514,6 +546,7 @@ tstest test/test.image.ts --verbose # Image generation
tstest test/test.research.ts --verbose # Web research
tstest test/test.audio.ts --verbose # Text-to-speech
tstest test/test.document.ts --verbose # Document analysis (needs Chromium)
tstest test/test.ocr.ts --verbose # Mistral OCR transport (mocked)
```
Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services.
@@ -533,6 +566,7 @@ Most tests skip gracefully when API keys are not set. The Ollama tests are fully
├── ts_audio/ # @push.rocks/smartai/audio
├── ts_image/ # @push.rocks/smartai/image
├── ts_document/ # @push.rocks/smartai/document
├── ts_ocr/ # @push.rocks/smartai/ocr
└── ts_research/ # @push.rocks/smartai/research
```
+77
View File
@@ -0,0 +1,77 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import { createMistralOcrEngine, type ISmartAiMistralOcrTransport } from '../ts_ocr/index.js';
// Verifies that recognizeImage() sends a well-formed request to the transport and
// normalizes the mocked response into text, overall confidence and page results.
tap.test('createMistralOcrEngine should call Mistral OCR with image data URLs', async () => {
  // Derive the request type from the imported transport contract so the request
  // assertions below are type-checked instead of going through `as any`.
  type TOcrRequest = Parameters<ISmartAiMistralOcrTransport['process']>[0];
  const calls: TOcrRequest[] = [];

  const mockTransport: ISmartAiMistralOcrTransport = {
    process: async (request) => {
      calls.push(request);
      return {
        pages: [
          {
            index: 0,
            markdown: 'hello terminal',
            confidence_scores: {
              average_page_confidence_score: 0.91,
              minimum_page_confidence_score: 0.8,
            },
          },
        ],
        model: 'mistral-ocr-latest',
        usage_info: {
          pages_processed: 1,
          doc_size_bytes: 12,
        },
      };
    },
  };

  const ocrEngine = createMistralOcrEngine({
    transport: mockTransport,
    confidenceScoresGranularity: 'page',
  });

  const result = await ocrEngine.recognizeImage({
    dataBase64: 'iVBORw0KGgo=',
    mimeType: 'image/png',
  });

  // Exactly one request, carrying the default model and a data URL built from the input.
  expect(calls.length).toEqual(1);
  const request = calls[0];
  expect(request?.model).toEqual('mistral-ocr-latest');
  expect(request?.document.type).toEqual('image_url');
  expect(request?.document.image_url).toEqual('data:image/png;base64,iVBORw0KGgo=');
  expect(request?.confidence_scores_granularity).toEqual('page');

  // The response is flattened into text plus the average page confidence.
  expect(result.text).toEqual('hello terminal');
  expect(result.confidence).toEqual(0.91);
  expect(result.pages).toEqual([
    {
      index: 0,
      text: 'hello terminal',
      confidence: 0.91,
    },
  ]);
});
// Confirms that an empty base64 payload is rejected before the transport is invoked.
tap.test('createMistralOcrEngine should validate image input', async () => {
  // A transport that fails loudly if it is ever reached.
  const unreachableTransport: ISmartAiMistralOcrTransport = {
    process: async () => {
      throw new Error('should not call OCR');
    },
  };
  const ocrEngine = createMistralOcrEngine({ transport: unreachableTransport });

  // Normalizes whatever was thrown into an Error so its message can be inspected.
  const toError = (reason: unknown): Error =>
    reason instanceof Error ? reason : new Error(String(reason));

  let rejection: Error | undefined;
  try {
    await ocrEngine.recognizeImage({ dataBase64: '', mimeType: 'image/png' });
  } catch (reason) {
    rejection = toError(reason);
  }

  expect(rejection?.message).toEqual('Mistral OCR image input requires dataBase64.');
});

export default tap.start();
+2 -2
View File
@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: '@push.rocks/smartai',
// NOTE(review): this view is a diff with its +/- markers stripped; the next two lines appear to be the removed 4.0.2 values — confirm.
version: '4.0.2',
description: 'Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.'
// NOTE(review): the next two lines appear to be the added 4.1.0 values that replace the pair above — confirm.
version: '4.1.0',
description: 'Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.'
}
+192
View File
@@ -0,0 +1,192 @@
/**
 * MIME type of an image submitted for OCR.
 *
 * The listed literals are the common cases; any other string is still accepted.
 * `string & {}` is used instead of a bare `string` because a bare `string`
 * member absorbs the literal members, collapsing the alias to plain `string`
 * and losing the literal suggestions in editors.
 */
export type TSmartAiOcrImageMimeType =
  | 'image/png'
  | 'image/jpeg'
  | 'image/webp'
  | 'image/gif'
  | (string & {});
/** Values that may be sent in the `table_format` field of an OCR request. */
export type TSmartAiMistralOcrTableFormat = 'markdown' | 'html';
/** Values that may be sent in the `confidence_scores_granularity` field of an OCR request. */
export type TSmartAiMistralOcrConfidenceScoresGranularity = 'page' | 'word';
/** Image payload accepted by `recognizeImage`. */
export interface ISmartAiOcrImageInput {
/** Base64 image data; the engine prepends `data:<mimeType>;base64,` to it and rejects an empty value. */
dataBase64: string;
/** MIME type inserted into the generated data URL; an empty value is rejected. */
mimeType: TSmartAiOcrImageMimeType;
}
/** Normalized OCR result for one page of the response. */
export interface ISmartAiOcrPageResult {
/** Page index copied from the response page. */
index: number;
/** The page's `markdown` content from the response. */
text: string;
/** Average page confidence score, when the response page carries one. */
confidence?: number;
}
/**
 * Normalized OCR result returned by an engine.
 * `TRaw` is the type of the untouched provider response kept in `raw`.
 */
export interface ISmartAiOcrResult<TRaw = unknown> {
/** All page texts joined with a blank line between pages, then trimmed. */
text: string;
/** Arithmetic mean of the per-page confidences that are numbers; absent when no page has one. */
confidence?: number;
/** Per-page results in the order the response listed them. */
pages: ISmartAiOcrPageResult[];
/** The response object exactly as returned by the transport. */
raw: TRaw;
}
/** OCR engine as returned by `createMistralOcrEngine`. */
export interface ISmartAiOcrEngine {
/**
 * Runs OCR on a single image.
 * `options` are per-call settings that take precedence over the engine-level options.
 */
recognizeImage: (
input: ISmartAiOcrImageInput,
options?: ISmartAiMistralOcrRecognizeOptions
) => Promise<ISmartAiOcrResult<IMistralOcrResponse>>;
}
/**
 * Page-level confidence scores of a response page.
 * Each score is declared in both snake_case and camelCase spelling; `getPageConfidence`
 * reads the snake_case average first and falls back to the camelCase one.
 */
export interface IMistralOcrPageConfidenceScores {
average_page_confidence_score?: number;
averagePageConfidenceScore?: number;
// The minimum scores are declared for typing only; nothing in this module reads them.
minimum_page_confidence_score?: number;
minimumPageConfidenceScore?: number;
}
/** One page entry of an OCR response, as consumed by the engine. */
export interface IMistralOcrPageResponse {
/** Page index; copied unchanged into the normalized page result. */
index: number;
/** Recognized page content; becomes the normalized page `text`. */
markdown: string;
/** Confidence scores in snake_case spelling; preferred over `confidenceScores` when not null/undefined. */
confidence_scores?: IMistralOcrPageConfidenceScores | null;
/** Confidence scores in camelCase spelling; used only when `confidence_scores` is null/undefined. */
confidenceScores?: IMistralOcrPageConfidenceScores | null;
}
/**
 * OCR response as returned by a transport.
 * Only `pages` is read by the engine; the whole object is exposed unchanged as `raw`.
 */
export interface IMistralOcrResponse {
pages: IMistralOcrPageResponse[];
model: string;
// The fields below are typed as `unknown` and are not interpreted by this module;
// each is declared in both snake_case and camelCase spelling.
document_annotation?: unknown;
documentAnnotation?: unknown;
usage_info?: unknown;
usageInfo?: unknown;
}
/** Request object handed to the transport; the HTTP transport sends it as the JSON body. */
export interface IMistralOcrRequest {
/** Model identifier; the engine uses `mistral-ocr-latest` unless configured otherwise. */
model: string;
/** Image to process; the engine always sends it as an `image_url` document holding a base64 data URL. */
document: {
type: 'image_url';
image_url: string;
};
/** The engine sends `false` unless enabled via options. */
include_image_base64?: boolean;
// The remaining fields are forwarded from the engine/per-call options and stay
// undefined when the option is not set.
table_format?: TSmartAiMistralOcrTableFormat;
extract_header?: boolean;
extract_footer?: boolean;
confidence_scores_granularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
/**
 * Pluggable transport that executes an OCR request.
 * Supplying one via `ISmartAiMistralOcrOptions.transport` replaces the built-in
 * `fetch`-based transport (e.g. for tests or custom HTTP clients).
 */
export interface ISmartAiMistralOcrTransport {
/** Sends `request` and resolves with the provider response. */
process: (request: IMistralOcrRequest) => Promise<IMistralOcrResponse>;
}
/** Engine-level configuration for `createMistralOcrEngine`. */
export interface ISmartAiMistralOcrOptions {
/** API key for the built-in HTTP transport; required at call time when no `transport` is supplied. */
apiKey?: string;
/** Model identifier; defaults to `mistral-ocr-latest`. */
model?: string;
/** Endpoint for the built-in HTTP transport; defaults to `https://api.mistral.ai/v1/ocr`. */
endpointUrl?: string;
/** Custom transport; when set, `apiKey` and `endpointUrl` are not used. */
transport?: ISmartAiMistralOcrTransport;
// The options below are engine-wide defaults; the same-named per-call option wins when provided.
/** Sent as `include_image_base64`; `false` when set neither here nor per call. */
includeImageBase64?: boolean;
tableFormat?: TSmartAiMistralOcrTableFormat;
extractHeader?: boolean;
extractFooter?: boolean;
confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
/**
 * Per-call options for `recognizeImage`.
 * Each field, when not null/undefined, takes precedence over the same-named engine-level option.
 */
export interface ISmartAiMistralOcrRecognizeOptions {
includeImageBase64?: boolean;
tableFormat?: TSmartAiMistralOcrTableFormat;
extractHeader?: boolean;
extractFooter?: boolean;
confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
// Model sent in requests when `options.model` is not provided.
const defaultMistralOcrModel = 'mistral-ocr-latest';
// URL the built-in HTTP transport posts to when `options.endpointUrl` is not provided.
const defaultMistralOcrEndpointUrl = 'https://api.mistral.ai/v1/ocr';
/**
 * Builds the built-in transport, which POSTs OCR requests as JSON via `fetch`.
 *
 * The API key is checked on every `process()` call rather than here, so creating
 * a transport (and therefore an engine) without a key does not throw by itself.
 *
 * @param options.apiKey - Sent as `Authorization: Bearer <apiKey>`.
 * @param options.endpointUrl - Target URL; falls back to the default OCR endpoint.
 * @returns A transport whose `process()` resolves with the parsed JSON response.
 * @throws Error from `process()` when the API key is missing or the HTTP status is not ok.
 */
const createMistralOcrHttpTransport = (options: {
  apiKey?: string;
  endpointUrl?: string;
}): ISmartAiMistralOcrTransport => {
  return {
    process: async (request) => {
      if (!options.apiKey) {
        throw new Error('Mistral OCR requires an apiKey when no custom transport is provided.');
      }

      const response = await fetch(options.endpointUrl ?? defaultMistralOcrEndpointUrl, {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${options.apiKey}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify(request),
      });

      if (!response.ok) {
        // Reading the error body is best-effort: if the read itself fails, the
        // HTTP status must still be reported instead of the secondary read error.
        let errorBody = '';
        try {
          errorBody = await response.text();
        } catch {
          errorBody = '';
        }
        throw new Error(`Mistral OCR request failed with status ${response.status}: ${errorBody}`);
      }

      // The body is trusted to match the response shape; it is not validated here.
      return (await response.json()) as IMistralOcrResponse;
    },
  };
};
/**
 * Extracts the average confidence score of a response page.
 *
 * The snake_case `confidence_scores` object is preferred; the camelCase
 * `confidenceScores` object is used only when the former is null/undefined.
 * Within the chosen object the snake_case average wins over the camelCase one.
 *
 * @param page - Response page to inspect.
 * @returns The average page confidence, or `undefined` when none is present.
 */
const getPageConfidence = (page: IMistralOcrPageResponse): number | undefined => {
  const { confidence_scores: snakeScores, confidenceScores: camelScores } = page;
  const scores = snakeScores ?? camelScores;
  if (scores == null) {
    return undefined;
  }

  const { average_page_confidence_score: snakeAverage, averagePageConfidenceScore: camelAverage } =
    scores;
  return snakeAverage ?? camelAverage;
};
/**
 * Creates an OCR engine that sends images to the Mistral OCR endpoint.
 *
 * When no `transport` is supplied, a `fetch`-based transport is built from
 * `apiKey` and `endpointUrl`; a missing API key only surfaces once
 * `recognizeImage` is called.
 *
 * @param options - Engine-level configuration; per-call options passed to
 *   `recognizeImage` take precedence over the same-named fields here.
 * @returns An engine whose `recognizeImage` resolves with normalized text,
 *   the mean page confidence, the per-page results and the raw response.
 * @throws Error from `recognizeImage` when `dataBase64` or `mimeType` is empty,
 *   or when the transport result has no `pages` array.
 */
export const createMistralOcrEngine = (
  options: ISmartAiMistralOcrOptions = {}
): ISmartAiOcrEngine => {
  const transport =
    options.transport ??
    createMistralOcrHttpTransport({
      apiKey: options.apiKey,
      endpointUrl: options.endpointUrl,
    });
  const model = options.model ?? defaultMistralOcrModel;

  return {
    recognizeImage: async (input, recognizeOptions = {}) => {
      if (!input.dataBase64) {
        throw new Error('Mistral OCR image input requires dataBase64.');
      }
      if (!input.mimeType) {
        throw new Error('Mistral OCR image input requires mimeType.');
      }

      const response = await transport.process({
        model,
        document: {
          type: 'image_url',
          // The image is sent inline as a base64 data URL.
          image_url: `data:${input.mimeType};base64,${input.dataBase64}`,
        },
        include_image_base64:
          recognizeOptions.includeImageBase64 ?? options.includeImageBase64 ?? false,
        table_format: recognizeOptions.tableFormat ?? options.tableFormat,
        extract_header: recognizeOptions.extractHeader ?? options.extractHeader,
        extract_footer: recognizeOptions.extractFooter ?? options.extractFooter,
        confidence_scores_granularity:
          recognizeOptions.confidenceScoresGranularity ?? options.confidenceScoresGranularity,
      });

      // The transport result is external data (unvalidated HTTP JSON or a custom
      // transport), so check the one field normalization depends on and fail
      // with a descriptive error instead of an opaque TypeError.
      const responsePages = response?.pages;
      if (!Array.isArray(responsePages)) {
        throw new Error('Mistral OCR response did not include a pages array.');
      }

      const pages = responsePages.map((page) => ({
        index: page.index,
        // Keep `text` a string even when a page arrives without markdown.
        text: page.markdown ?? '',
        confidence: getPageConfidence(page),
      }));

      // Overall confidence is the mean over pages that actually report one.
      const pageConfidences = pages
        .map((page) => page.confidence)
        .filter((confidence): confidence is number => typeof confidence === 'number');
      const confidence = pageConfidences.length
        ? pageConfidences.reduce((sum, value) => sum + value, 0) / pageConfidences.length
        : undefined;

      return {
        text: pages.map((page) => page.text).join('\n\n').trim(),
        confidence,
        pages,
        raw: response,
      };
    },
  };
};