diff --git a/changelog.md b/changelog.md index 695a756..78c084f 100644 --- a/changelog.md +++ b/changelog.md @@ -5,6 +5,14 @@ +### Features + +- add Mistral OCR engine with package export, tests, and documentation (ocr) + - introduces a new `@push.rocks/smartai/ocr` subpath export with `createMistralOcrEngine()` for image OCR via Mistral's Document AI endpoint + - adds OCR request/response types, configurable transport and options, and normalized page/confidence results + - includes mocked transport tests for OCR requests and input validation + - updates package metadata and README content to document the new OCR module + ## 2026-05-14 - 4.0.2 ### Fixes diff --git a/package.json b/package.json index f4f4023..294b840 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "@push.rocks/smartai", "version": "4.0.2", "private": false, - "description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.", + "description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.", "main": "dist_ts/index.js", "typings": "dist_ts/index.d.ts", "type": "module", @@ -27,6 +27,10 @@ "import": "./dist_ts_document/index.js", "types": "./dist_ts_document/index.d.ts" }, + "./ocr": { + "import": "./dist_ts_ocr/index.js", + "types": "./dist_ts_ocr/index.d.ts" + }, "./research": { "import": "./dist_ts_research/index.js", "types": "./dist_ts_research/index.d.ts" @@ -87,6 +91,7 @@ "ts_audio/**/*", "ts_image/**/*", "ts_document/**/*", + "ts_ocr/**/*", "ts_research/**/*", "ts_openai_chatgpt_auth/**/*", "dist_*/**/*", diff --git a/readme.hints.md b/readme.hints.md index 5d2d377..e8b4174 100644 --- a/readme.hints.md +++ b/readme.hints.md @@ -18,6 +18,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The - `@push.rocks/smartai/audio` — `textToSpeech()` using OpenAI SDK directly - `@push.rocks/smartai/image` — `generateImage()`, `editImage()` using OpenAI SDK directly - `@push.rocks/smartai/document` — `analyzeDocuments()` using SmartPdf + `generateText` +- `@push.rocks/smartai/ocr` — `createMistralOcrEngine()` using Mistral Document AI OCR endpoint - `@push.rocks/smartai/research` — `research()` using `@anthropic-ai/sdk` web_search tool ## Dependencies @@ -32,7 +33,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The ## Build - `pnpm build` → `tsbuild tsfolders --allowimplicitany` -- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_research/ +- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_ocr/, ts_research/ ## Important Notes diff --git a/readme.md b/readme.md index 0418961..70a89eb 100644 --- a/readme.md +++ b/readme.md @@ -479,6 +479,38 @@ console.log(analysis); await stopSmartpdf(); ``` +### 🔎 OCR — `@push.rocks/smartai/ocr` + +Extract text from images using Mistral Document AI OCR. This uses the documented `https://api.mistral.ai/v1/ocr` endpoint with `mistral-ocr-latest` and returns normalized text plus page-level confidence when requested. + +```typescript +import { createMistralOcrEngine } from '@push.rocks/smartai/ocr'; +import * as fs from 'fs'; + +const ocr = createMistralOcrEngine({ + apiKey: process.env.MISTRAL_API_KEY, + confidenceScoresGranularity: 'page', +}); + +const result = await ocr.recognizeImage({ + dataBase64: fs.readFileSync('screenshot.png').toString('base64'), + mimeType: 'image/png', +}); + +console.log(result.text); +console.log(result.confidence); +``` + +**`createMistralOcrEngine(options)`** accepts: + +- `apiKey` — Mistral API key, required unless a custom `transport` is supplied +- `model` — defaults to `mistral-ocr-latest` +- `endpointUrl` — defaults to `https://api.mistral.ai/v1/ocr` +- `confidenceScoresGranularity` — `'page'` | `'word'` +- `tableFormat` — `'markdown'` | `'html'` +- `extractHeader` / `extractFooter` — optional document OCR flags +- `transport` — injectable transport for tests or custom HTTP clients + ### 🔬 Research — `@push.rocks/smartai/research` Perform web-search-powered research using Anthropic's `web_search_20250305` tool. @@ -514,6 +546,7 @@ tstest test/test.image.ts --verbose # Image generation tstest test/test.research.ts --verbose # Web research tstest test/test.audio.ts --verbose # Text-to-speech tstest test/test.document.ts --verbose # Document analysis (needs Chromium) +tstest test/test.ocr.ts --verbose # Mistral OCR transport (mocked) ``` Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services. @@ -533,6 +566,7 @@ Most tests skip gracefully when API keys are not set. The Ollama tests are fully ├── ts_audio/ # @push.rocks/smartai/audio ├── ts_image/ # @push.rocks/smartai/image ├── ts_document/ # @push.rocks/smartai/document +├── ts_ocr/ # @push.rocks/smartai/ocr └── ts_research/ # @push.rocks/smartai/research ``` diff --git a/test/test.ocr.ts b/test/test.ocr.ts new file mode 100644 index 0000000..a65dfc5 --- /dev/null +++ b/test/test.ocr.ts @@ -0,0 +1,77 @@ +import { tap, expect } from '@git.zone/tstest/tapbundle'; +import { createMistralOcrEngine, type ISmartAiMistralOcrTransport } from '../ts_ocr/index.js'; + +tap.test('createMistralOcrEngine should call Mistral OCR with image data URLs', async () => { + const calls: unknown[] = []; + const mockTransport: ISmartAiMistralOcrTransport = { + process: async (request) => { + calls.push(request); + return { + pages: [ + { + index: 0, + markdown: 'hello terminal', + confidence_scores: { + average_page_confidence_score: 0.91, + minimum_page_confidence_score: 0.8, + }, + }, + ], + model: 'mistral-ocr-latest', + usage_info: { + pages_processed: 1, + doc_size_bytes: 12, + }, + }; + }, + }; + + const ocrEngine = createMistralOcrEngine({ + transport: mockTransport, + confidenceScoresGranularity: 'page', + }); + + const result = await ocrEngine.recognizeImage({ + dataBase64: 'iVBORw0KGgo=', + mimeType: 'image/png', + }); + + expect(calls.length).toEqual(1); + expect((calls[0] as any).model).toEqual('mistral-ocr-latest'); + expect((calls[0] as any).document.type).toEqual('image_url'); + expect((calls[0] as any).document.image_url).toEqual('data:image/png;base64,iVBORw0KGgo='); + expect((calls[0] as any).confidence_scores_granularity).toEqual('page'); + expect(result.text).toEqual('hello terminal'); + expect(result.confidence).toEqual(0.91); + expect(result.pages).toEqual([ + { + index: 0, + text: 'hello terminal', + confidence: 0.91, + }, + ]); +}); + +tap.test('createMistralOcrEngine should validate image input', async () => { + const ocrEngine = createMistralOcrEngine({ + transport: { + process: async () => { + throw new Error('should not call OCR'); + }, + }, + }); + + let error: Error | undefined; + try { + await ocrEngine.recognizeImage({ + dataBase64: '', + mimeType: 'image/png', + }); + } catch (caughtError) { + error = caughtError instanceof Error ? caughtError : new Error(String(caughtError)); + } + + expect(error?.message).toEqual('Mistral OCR image input requires dataBase64.'); +}); + +export default tap.start(); diff --git a/ts_ocr/index.ts b/ts_ocr/index.ts new file mode 100644 index 0000000..f851c76 --- /dev/null +++ b/ts_ocr/index.ts @@ -0,0 +1,192 @@ +export type TSmartAiOcrImageMimeType = + | 'image/png' + | 'image/jpeg' + | 'image/webp' + | 'image/gif' + | string; + +export type TSmartAiMistralOcrTableFormat = 'markdown' | 'html'; + +export type TSmartAiMistralOcrConfidenceScoresGranularity = 'page' | 'word'; + +export interface ISmartAiOcrImageInput { + dataBase64: string; + mimeType: TSmartAiOcrImageMimeType; +} + +export interface ISmartAiOcrPageResult { + index: number; + text: string; + confidence?: number; +} + +export interface ISmartAiOcrResult { + text: string; + confidence?: number; + pages: ISmartAiOcrPageResult[]; + raw: TRaw; +} + +export interface ISmartAiOcrEngine { + recognizeImage: ( + input: ISmartAiOcrImageInput, + options?: ISmartAiMistralOcrRecognizeOptions + ) => Promise>; +} + +export interface IMistralOcrPageConfidenceScores { + average_page_confidence_score?: number; + averagePageConfidenceScore?: number; + minimum_page_confidence_score?: number; + minimumPageConfidenceScore?: number; +} + +export interface IMistralOcrPageResponse { + index: number; + markdown: string; + confidence_scores?: IMistralOcrPageConfidenceScores | null; + confidenceScores?: IMistralOcrPageConfidenceScores | null; +} + +export interface IMistralOcrResponse { + pages: IMistralOcrPageResponse[]; + model: string; + document_annotation?: unknown; + documentAnnotation?: unknown; + usage_info?: unknown; + usageInfo?: unknown; +} + +export interface IMistralOcrRequest { + model: string; + document: { + type: 'image_url'; + image_url: string; + }; + include_image_base64?: boolean; + table_format?: TSmartAiMistralOcrTableFormat; + extract_header?: boolean; + extract_footer?: boolean; + confidence_scores_granularity?: TSmartAiMistralOcrConfidenceScoresGranularity; +} + +export interface ISmartAiMistralOcrTransport { + process: (request: IMistralOcrRequest) => Promise; +} + +export interface ISmartAiMistralOcrOptions { + apiKey?: string; + model?: string; + endpointUrl?: string; + transport?: ISmartAiMistralOcrTransport; + includeImageBase64?: boolean; + tableFormat?: TSmartAiMistralOcrTableFormat; + extractHeader?: boolean; + extractFooter?: boolean; + confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity; +} + +export interface ISmartAiMistralOcrRecognizeOptions { + includeImageBase64?: boolean; + tableFormat?: TSmartAiMistralOcrTableFormat; + extractHeader?: boolean; + extractFooter?: boolean; + confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity; +} + +const defaultMistralOcrModel = 'mistral-ocr-latest'; +const defaultMistralOcrEndpointUrl = 'https://api.mistral.ai/v1/ocr'; + +const createMistralOcrHttpTransport = (options: { + apiKey?: string; + endpointUrl?: string; +}): ISmartAiMistralOcrTransport => { + return { + process: async (request) => { + if (!options.apiKey) { + throw new Error('Mistral OCR requires an apiKey when no custom transport is provided.'); + } + + const response = await fetch(options.endpointUrl ?? defaultMistralOcrEndpointUrl, { + method: 'POST', + headers: { + Authorization: `Bearer ${options.apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(request), + }); + + if (!response.ok) { + const errorBody = await response.text(); + throw new Error(`Mistral OCR request failed with status ${response.status}: ${errorBody}`); + } + + return (await response.json()) as IMistralOcrResponse; + }, + }; +}; + +const getPageConfidence = (page: IMistralOcrPageResponse): number | undefined => { + const confidenceScores = page.confidence_scores ?? page.confidenceScores; + return ( + confidenceScores?.average_page_confidence_score ?? + confidenceScores?.averagePageConfidenceScore + ); +}; + +export const createMistralOcrEngine = ( + options: ISmartAiMistralOcrOptions = {} +): ISmartAiOcrEngine => { + const transport = + options.transport ?? + createMistralOcrHttpTransport({ + apiKey: options.apiKey, + endpointUrl: options.endpointUrl, + }); + const model = options.model ?? defaultMistralOcrModel; + + return { + recognizeImage: async (input, recognizeOptions = {}) => { + if (!input.dataBase64) { + throw new Error('Mistral OCR image input requires dataBase64.'); + } + if (!input.mimeType) { + throw new Error('Mistral OCR image input requires mimeType.'); + } + + const response = await transport.process({ + model, + document: { + type: 'image_url', + image_url: `data:${input.mimeType};base64,${input.dataBase64}`, + }, + include_image_base64: + recognizeOptions.includeImageBase64 ?? options.includeImageBase64 ?? false, + table_format: recognizeOptions.tableFormat ?? options.tableFormat, + extract_header: recognizeOptions.extractHeader ?? options.extractHeader, + extract_footer: recognizeOptions.extractFooter ?? options.extractFooter, + confidence_scores_granularity: + recognizeOptions.confidenceScoresGranularity ?? options.confidenceScoresGranularity, + }); + + const pages = response.pages.map((page) => ({ + index: page.index, + text: page.markdown, + confidence: getPageConfidence(page), + })); + const pageConfidences = pages + .map((page) => page.confidence) + .filter((confidence): confidence is number => typeof confidence === 'number'); + const confidence = pageConfidences.length + ? pageConfidences.reduce((sum, value) => sum + value, 0) / pageConfidences.length + : undefined; + + return { + text: pages.map((page) => page.text).join('\n\n').trim(), + confidence, + pages, + raw: response, + }; + }, + }; +};