Compare commits

...

6 Commits

Author SHA1 Message Date
jkunz a7ae676184 v4.1.0
Default (tags) / security (push) Failing after 1s
Default (tags) / test (push) Failing after 1s
Default (tags) / release (push) Has been skipped
Default (tags) / metadata (push) Has been skipped
2026-05-18 00:13:37 +00:00
jkunz 269e948453 feat(ocr): add Mistral OCR engine with package export, tests, and documentation 2026-05-18 00:13:27 +00:00
jkunz 1d64ee3edb v4.0.2
Default (tags) / security (push) Failing after 1s
Default (tags) / test (push) Failing after 1s
Default (tags) / release (push) Has been skipped
Default (tags) / metadata (push) Has been skipped
2026-05-14 19:59:36 +00:00
jkunz 4725b55566 fix(openai): strip unsupported ChatGPT prompt cache retention options while preserving prompt cache keys 2026-05-14 19:59:30 +00:00
jkunz 0e2053f538 v4.0.1
Default (tags) / security (push) Failing after 1s
Default (tags) / test (push) Failing after 1s
Default (tags) / release (push) Has been skipped
Default (tags) / metadata (push) Has been skipped
2026-05-14 19:47:26 +00:00
jkunz c8f98b3364 fix(openai): map system prompts to top-level instructions for ChatGPT auth requests 2026-05-14 19:47:17 +00:00
10 changed files with 426 additions and 10 deletions
+31
View File
@@ -3,6 +3,37 @@
## Pending ## Pending
## 2026-05-18 - 4.1.0
### Features
- add Mistral OCR engine with package export, tests, and documentation (ocr)
- introduces a new `@push.rocks/smartai/ocr` subpath export with `createMistralOcrEngine()` for image OCR via Mistral's Document AI endpoint
- adds OCR request/response types, configurable transport and options, and normalized page/confidence results
- includes mocked transport tests for OCR requests and input validation
- updates package metadata and README content to document the new OCR module
## 2026-05-14 - 4.0.2
### Fixes
- strip unsupported ChatGPT prompt cache retention options while preserving prompt cache keys (openai)
- Removes promptCacheRetention values before sending requests to the ChatGPT Codex backend.
- Keeps prompt_cache_key forwarding intact for OpenAI provider options.
- Only rewrites system prompts into top-level instructions when needed, avoiding unnecessary prompt changes.
## 2026-05-14 - 4.0.1
### Fixes
- map system prompts to top-level instructions for ChatGPT auth requests (openai)
- wrap OpenAI models using ChatGPT auth with middleware that extracts system messages into provider instructions
- remove system messages from the serialized prompt payload to match the ChatGPT Codex backend expectations
- add test coverage to verify authorization headers, workspace routing, and instruction payload mapping
## 2026-05-14 - 4.0.0 ## 2026-05-14 - 4.0.0
### Breaking Changes ### Breaking Changes
+7 -2
View File
@@ -1,8 +1,8 @@
{ {
"name": "@push.rocks/smartai", "name": "@push.rocks/smartai",
"version": "4.0.0", "version": "4.1.0",
"private": false, "private": false,
"description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.", "description": "Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.",
"main": "dist_ts/index.js", "main": "dist_ts/index.js",
"typings": "dist_ts/index.d.ts", "typings": "dist_ts/index.d.ts",
"type": "module", "type": "module",
@@ -27,6 +27,10 @@
"import": "./dist_ts_document/index.js", "import": "./dist_ts_document/index.js",
"types": "./dist_ts_document/index.d.ts" "types": "./dist_ts_document/index.d.ts"
}, },
"./ocr": {
"import": "./dist_ts_ocr/index.js",
"types": "./dist_ts_ocr/index.d.ts"
},
"./research": { "./research": {
"import": "./dist_ts_research/index.js", "import": "./dist_ts_research/index.js",
"types": "./dist_ts_research/index.d.ts" "types": "./dist_ts_research/index.d.ts"
@@ -87,6 +91,7 @@
"ts_audio/**/*", "ts_audio/**/*",
"ts_image/**/*", "ts_image/**/*",
"ts_document/**/*", "ts_document/**/*",
"ts_ocr/**/*",
"ts_research/**/*", "ts_research/**/*",
"ts_openai_chatgpt_auth/**/*", "ts_openai_chatgpt_auth/**/*",
"dist_*/**/*", "dist_*/**/*",
+2 -1
View File
@@ -18,6 +18,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
- `@push.rocks/smartai/audio``textToSpeech()` using OpenAI SDK directly - `@push.rocks/smartai/audio``textToSpeech()` using OpenAI SDK directly
- `@push.rocks/smartai/image``generateImage()`, `editImage()` using OpenAI SDK directly - `@push.rocks/smartai/image``generateImage()`, `editImage()` using OpenAI SDK directly
- `@push.rocks/smartai/document``analyzeDocuments()` using SmartPdf + `generateText` - `@push.rocks/smartai/document``analyzeDocuments()` using SmartPdf + `generateText`
- `@push.rocks/smartai/ocr` — `createMistralOcrEngine()` using the Mistral Document AI OCR endpoint
- `@push.rocks/smartai/research``research()` using `@anthropic-ai/sdk` web_search tool - `@push.rocks/smartai/research``research()` using `@anthropic-ai/sdk` web_search tool
## Dependencies ## Dependencies
@@ -32,7 +33,7 @@ The package is a **provider registry** built on the Vercel AI SDK (`ai` v6). The
## Build ## Build
- `pnpm build``tsbuild tsfolders --allowimplicitany` - `pnpm build``tsbuild tsfolders --allowimplicitany`
- Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_research/ - Compiles: ts/, ts_vision/, ts_audio/, ts_image/, ts_document/, ts_ocr/, ts_research/
## Important Notes ## Important Notes
+34
View File
@@ -479,6 +479,38 @@ console.log(analysis);
await stopSmartpdf(); await stopSmartpdf();
``` ```
### 🔎 OCR — `@push.rocks/smartai/ocr`
Extract text from images using Mistral Document AI OCR. This uses the documented `https://api.mistral.ai/v1/ocr` endpoint with `mistral-ocr-latest` and returns normalized text plus page-level confidence when requested.
```typescript
import { createMistralOcrEngine } from '@push.rocks/smartai/ocr';
import * as fs from 'fs';
const ocr = createMistralOcrEngine({
apiKey: process.env.MISTRAL_API_KEY,
confidenceScoresGranularity: 'page',
});
const result = await ocr.recognizeImage({
dataBase64: fs.readFileSync('screenshot.png').toString('base64'),
mimeType: 'image/png',
});
console.log(result.text);
console.log(result.confidence);
```
**`createMistralOcrEngine(options)`** accepts:
- `apiKey` — Mistral API key, required unless a custom `transport` is supplied
- `model` — defaults to `mistral-ocr-latest`
- `endpointUrl` — defaults to `https://api.mistral.ai/v1/ocr`
- `confidenceScoresGranularity` — `'page'` | `'word'`
- `tableFormat` — `'markdown'` | `'html'`
- `extractHeader` / `extractFooter` — optional document OCR flags
- `transport` — injectable transport for tests or custom HTTP clients
### 🔬 Research — `@push.rocks/smartai/research` ### 🔬 Research — `@push.rocks/smartai/research`
Perform web-search-powered research using Anthropic's `web_search_20250305` tool. Perform web-search-powered research using Anthropic's `web_search_20250305` tool.
@@ -514,6 +546,7 @@ tstest test/test.image.ts --verbose # Image generation
tstest test/test.research.ts --verbose # Web research tstest test/test.research.ts --verbose # Web research
tstest test/test.audio.ts --verbose # Text-to-speech tstest test/test.audio.ts --verbose # Text-to-speech
tstest test/test.document.ts --verbose # Document analysis (needs Chromium) tstest test/test.document.ts --verbose # Document analysis (needs Chromium)
tstest test/test.ocr.ts --verbose # Mistral OCR transport (mocked)
``` ```
Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services. Most tests skip gracefully when API keys are not set. The Ollama tests are fully mocked and require no external services.
@@ -533,6 +566,7 @@ Most tests skip gracefully when API keys are not set. The Ollama tests are fully
├── ts_audio/ # @push.rocks/smartai/audio ├── ts_audio/ # @push.rocks/smartai/audio
├── ts_image/ # @push.rocks/smartai/image ├── ts_image/ # @push.rocks/smartai/image
├── ts_document/ # @push.rocks/smartai/document ├── ts_document/ # @push.rocks/smartai/document
├── ts_ocr/ # @push.rocks/smartai/ocr
└── ts_research/ # @push.rocks/smartai/research └── ts_research/ # @push.rocks/smartai/research
``` ```
+77
View File
@@ -0,0 +1,77 @@
import { tap, expect } from '@git.zone/tstest/tapbundle';
import { createMistralOcrEngine, type ISmartAiMistralOcrTransport } from '../ts_ocr/index.js';
tap.test('createMistralOcrEngine should call Mistral OCR with image data URLs', async () => {
  // Derive the request type from the transport contract so the request
  // assertions below are type-checked instead of going through `as any`.
  type TCapturedRequest = Parameters<ISmartAiMistralOcrTransport['process']>[0];
  const calls: TCapturedRequest[] = [];

  // Mock transport: records every request and answers with one fixed page.
  const mockTransport: ISmartAiMistralOcrTransport = {
    process: async (request) => {
      calls.push(request);
      return {
        pages: [
          {
            index: 0,
            markdown: 'hello terminal',
            confidence_scores: {
              average_page_confidence_score: 0.91,
              minimum_page_confidence_score: 0.8,
            },
          },
        ],
        model: 'mistral-ocr-latest',
        usage_info: {
          pages_processed: 1,
          doc_size_bytes: 12,
        },
      };
    },
  };

  const ocrEngine = createMistralOcrEngine({
    transport: mockTransport,
    confidenceScoresGranularity: 'page',
  });

  const result = await ocrEngine.recognizeImage({
    dataBase64: 'iVBORw0KGgo=',
    mimeType: 'image/png',
  });

  // Exactly one request, carrying the default model, the image as a data URL
  // and the engine-level granularity option.
  expect(calls.length).toEqual(1);
  const sentRequest = calls[0];
  expect(sentRequest?.model).toEqual('mistral-ocr-latest');
  expect(sentRequest?.document.type).toEqual('image_url');
  expect(sentRequest?.document.image_url).toEqual('data:image/png;base64,iVBORw0KGgo=');
  expect(sentRequest?.confidence_scores_granularity).toEqual('page');

  // The response is normalized to text plus the average page confidence.
  expect(result.text).toEqual('hello terminal');
  expect(result.confidence).toEqual(0.91);
  expect(result.pages).toEqual([
    {
      index: 0,
      text: 'hello terminal',
      confidence: 0.91,
    },
  ]);
});
tap.test('createMistralOcrEngine should validate image input', async () => {
  // The transport throws if it is ever reached: validation must reject first.
  const ocrEngine = createMistralOcrEngine({
    transport: {
      process: async () => {
        throw new Error('should not call OCR');
      },
    },
  });

  // Turn the outcome into "the Error that was thrown, or undefined on success".
  const error: Error | undefined = await ocrEngine
    .recognizeImage({
      dataBase64: '',
      mimeType: 'image/png',
    })
    .then(
      () => undefined,
      (caughtError: unknown) =>
        caughtError instanceof Error ? caughtError : new Error(String(caughtError)),
    );

  expect(error?.message).toEqual('Mistral OCR image input requires dataBase64.');
});
export default tap.start();
+18 -4
View File
@@ -199,15 +199,29 @@ tap.test('getModel uses ChatGPT Codex backend for OpenAI ChatGPT auth', async ()
}; };
try { try {
await model.doGenerate({ await smartai.generateText({
prompt: [{ role: 'user', content: [{ type: 'text', text: 'hello' }] }], model,
inputFormat: 'prompt', system: 'system prompt',
} as any); prompt: 'hello',
providerOptions: {
openai: {
promptCacheKey: 'session-1',
promptCacheRetention: 'in_memory',
},
},
});
expect(capturedRequest?.url).toEqual('https://chatgpt.com/backend-api/codex/responses'); expect(capturedRequest?.url).toEqual('https://chatgpt.com/backend-api/codex/responses');
expect(getHeader(capturedRequest?.init, 'authorization')).toEqual(`Bearer ${tokenData.accessToken}`); expect(getHeader(capturedRequest?.init, 'authorization')).toEqual(`Bearer ${tokenData.accessToken}`);
expect(getHeader(capturedRequest?.init, 'chatgpt-account-id')).toEqual('workspace-1'); expect(getHeader(capturedRequest?.init, 'chatgpt-account-id')).toEqual('workspace-1');
expect(getHeader(capturedRequest?.init, 'originator')).toEqual('smartai'); expect(getHeader(capturedRequest?.init, 'originator')).toEqual('smartai');
const capturedBody = JSON.parse(String(capturedRequest?.init?.body));
expect(capturedBody.instructions).toEqual('system prompt');
expect(capturedBody.input).toEqual([
{ role: 'user', content: [{ type: 'input_text', text: 'hello' }] },
]);
expect(capturedBody.prompt_cache_key).toEqual('session-1');
expect(capturedBody.prompt_cache_retention).toEqual(undefined);
} finally { } finally {
globalThis.fetch = originalFetch; globalThis.fetch = originalFetch;
} }
+2 -2
View File
@@ -3,6 +3,6 @@
*/ */
export const commitinfo = { export const commitinfo = {
name: '@push.rocks/smartai', name: '@push.rocks/smartai',
version: '4.0.0', version: '4.1.0',
description: 'Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document and research capabilities.' description: 'Provider registry and capability utilities for ai-sdk (Vercel AI SDK). Core export returns LanguageModel; subpath exports provide vision, audio, image, document, OCR and research capabilities.'
} }
+8 -1
View File
@@ -2,6 +2,7 @@ import * as plugins from './plugins.js';
import type { ISmartAiModelSetup, ISmartAiOptions, LanguageModelV3 } from './smartai.interfaces.js'; import type { ISmartAiModelSetup, ISmartAiOptions, LanguageModelV3 } from './smartai.interfaces.js';
import { createOllamaModel } from './smartai.provider.ollama.js'; import { createOllamaModel } from './smartai.provider.ollama.js';
import { createAnthropicCachingMiddleware } from './smartai.middleware.anthropic.js'; import { createAnthropicCachingMiddleware } from './smartai.middleware.anthropic.js';
import { createOpenAiChatGptInstructionsMiddleware } from './smartai.middleware.openai.js';
import { createOpenAiChatGptProviderSettings } from './smartai.auth.openai.js'; import { createOpenAiChatGptProviderSettings } from './smartai.auth.openai.js';
/** /**
@@ -28,7 +29,13 @@ export function getModel(options: ISmartAiOptions): LanguageModelV3 {
? createOpenAiChatGptProviderSettings(options.openAiChatGptAuth) ? createOpenAiChatGptProviderSettings(options.openAiChatGptAuth)
: { apiKey: options.apiKey }, : { apiKey: options.apiKey },
); );
return p(options.model) as LanguageModelV3; const base = p(options.model) as LanguageModelV3;
return options.openAiChatGptAuth
? plugins.wrapLanguageModel({
model: base,
middleware: createOpenAiChatGptInstructionsMiddleware(),
}) as unknown as LanguageModelV3
: base;
} }
case 'google': { case 'google': {
const p = plugins.createGoogleGenerativeAI({ apiKey: options.apiKey }); const p = plugins.createGoogleGenerativeAI({ apiKey: options.apiKey });
+55
View File
@@ -0,0 +1,55 @@
import type { JSONObject, LanguageModelV3CallOptions, LanguageModelV3Middleware } from '@ai-sdk/provider';
/** Type guard: true only for strings that still have content after trimming whitespace. */
const isNonEmptyString = (value: unknown): value is string => {
  if (typeof value !== 'string') {
    return false;
  }
  return value.trim().length > 0;
};
/**
 * Collects the content of every system message in the prompt.
 *
 * @returns the non-blank system contents joined with a newline, in prompt
 * order, or `undefined` when the prompt has no non-blank system message.
 */
const getSystemInstructions = (prompt: LanguageModelV3CallOptions['prompt']): string | undefined => {
  const systemTexts: string[] = [];
  for (const message of prompt) {
    if (message.role === 'system' && isNonEmptyString(message.content)) {
      systemTexts.push(message.content);
    }
  }
  if (systemTexts.length === 0) {
    return undefined;
  }
  return systemTexts.join('\n');
};
/**
 * Reports whether the OpenAI provider options carry a prompt cache retention
 * setting, under either the camelCase or the snake_case key.
 */
const hasUnsupportedChatGptCacheRetention = (options: JSONObject): boolean => {
  const retentionKeys = ['promptCacheRetention', 'prompt_cache_retention'];
  return retentionKeys.some((key) => options[key] !== undefined);
};
/**
 * Builds the middleware applied to OpenAI models that use ChatGPT auth.
 *
 * ChatGPT's Codex backend requires top-level Responses API instructions,
 * whereas the standard OpenAI provider serializes system prompts as input
 * items. On each call the middleware:
 * - moves the system messages into `providerOptions.openai.instructions`
 *   (and drops them from the prompt) unless the caller already supplied a
 *   non-blank `instructions` option;
 * - removes `promptCacheRetention` / `prompt_cache_retention` from the OpenAI
 *   provider options, leaving every other option untouched.
 *
 * When neither change is needed the params object is returned as-is.
 */
export function createOpenAiChatGptInstructionsMiddleware(): LanguageModelV3Middleware {
  return {
    specificationVersion: 'v3',
    transformParams: async ({ params }) => {
      const allProviderOptions = params.providerOptions ?? {};
      const currentOpenAiOptions = allProviderOptions.openai ?? {};
      const systemInstructions = getSystemInstructions(params.prompt);

      // Caller-provided instructions win over instructions derived from system messages.
      const mustInjectInstructions =
        !!systemInstructions && !isNonEmptyString(currentOpenAiOptions.instructions);
      const mustDropCacheRetention = hasUnsupportedChatGptCacheRetention(currentOpenAiOptions);

      if (!(mustInjectInstructions || mustDropCacheRetention)) {
        return params;
      }

      // Work on a shallow copy so the caller's options object is never mutated.
      const rewrittenOpenAiOptions: JSONObject = { ...currentOpenAiOptions };
      for (const retentionKey of ['promptCacheRetention', 'prompt_cache_retention']) {
        delete rewrittenOpenAiOptions[retentionKey];
      }

      let rewrittenPrompt = params.prompt;
      if (mustInjectInstructions) {
        rewrittenOpenAiOptions.instructions = systemInstructions;
        rewrittenPrompt = params.prompt.filter((message) => message.role !== 'system');
      }

      return {
        ...params,
        prompt: rewrittenPrompt,
        providerOptions: {
          ...allProviderOptions,
          openai: rewrittenOpenAiOptions,
        },
      } satisfies LanguageModelV3CallOptions;
    },
  };
}
+192
View File
@@ -0,0 +1,192 @@
/**
 * MIME type of an image handed to the OCR engine.
 *
 * The listed literals are the suggested values; any other string is accepted
 * as well. The open-ended member is written as `string & Record<never, never>`
 * rather than plain `string` so the union does not collapse to `string` and
 * editors keep offering the literals as completions.
 */
export type TSmartAiOcrImageMimeType =
  | 'image/png'
  | 'image/jpeg'
  | 'image/webp'
  | 'image/gif'
  | (string & Record<never, never>);
/** Values accepted for the `table_format` field of an OCR request. */
export type TSmartAiMistralOcrTableFormat = 'markdown' | 'html';
/** Values accepted for the `confidence_scores_granularity` field of an OCR request. */
export type TSmartAiMistralOcrConfidenceScoresGranularity = 'page' | 'word';
/** Image handed to `recognizeImage`; both fields must be non-empty. */
export interface ISmartAiOcrImageInput {
// Base64 payload; the engine places it after `;base64,` in the data URL it sends.
dataBase64: string;
// MIME type; the engine places it after `data:` in the data URL it sends.
mimeType: TSmartAiOcrImageMimeType;
}
/** Normalized result for one page of the OCR response. */
export interface ISmartAiOcrPageResult {
// Page index copied from the response page.
index: number;
// The page's `markdown` value from the response.
text: string;
// Average page confidence score, when the response reports one.
confidence?: number;
}
/** Normalized OCR result plus the untouched transport response. */
export interface ISmartAiOcrResult<TRaw = unknown> {
// All page texts joined with a blank line, then trimmed.
text: string;
// Arithmetic mean of the page confidences that are numbers; undefined when there are none.
confidence?: number;
pages: ISmartAiOcrPageResult[];
// The response exactly as returned by the transport.
raw: TRaw;
}
/** OCR engine as returned by `createMistralOcrEngine`. */
export interface ISmartAiOcrEngine {
// Recognizes a single image; `options` override the engine-level options for this call only.
recognizeImage: (
input: ISmartAiOcrImageInput,
options?: ISmartAiMistralOcrRecognizeOptions
) => Promise<ISmartAiOcrResult<IMistralOcrResponse>>;
}
/**
 * Page-level confidence scores of an OCR response page.
 * Each score is declared in both snake_case and camelCase spelling; when
 * normalizing, `getPageConfidence` reads the average score and prefers the
 * snake_case field.
 */
export interface IMistralOcrPageConfidenceScores {
average_page_confidence_score?: number;
averagePageConfidenceScore?: number;
minimum_page_confidence_score?: number;
minimumPageConfidenceScore?: number;
}
/** One page of an OCR response. */
export interface IMistralOcrPageResponse {
index: number;
// Recognized page content; becomes `text` in the normalized page result.
markdown: string;
// Confidence container in snake_case or camelCase spelling; snake_case is checked first.
confidence_scores?: IMistralOcrPageConfidenceScores | null;
confidenceScores?: IMistralOcrPageConfidenceScores | null;
}
/** OCR response as returned by a transport. */
export interface IMistralOcrResponse {
pages: IMistralOcrPageResponse[];
model: string;
// The fields below are typed `unknown` and are only passed through via `raw`.
document_annotation?: unknown;
documentAnnotation?: unknown;
usage_info?: unknown;
usageInfo?: unknown;
}
/** Request body handed to a transport; the built-in HTTP transport serializes it as JSON. */
export interface IMistralOcrRequest {
model: string;
document: {
type: 'image_url';
// The engine fills this with a `data:<mimeType>;base64,<dataBase64>` URL.
image_url: string;
};
// The engine sends `false` unless an option enables it.
include_image_base64?: boolean;
table_format?: TSmartAiMistralOcrTableFormat;
extract_header?: boolean;
extract_footer?: boolean;
confidence_scores_granularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
/** Pluggable transport: takes one OCR request and resolves with the OCR response. */
export interface ISmartAiMistralOcrTransport {
process: (request: IMistralOcrRequest) => Promise<IMistralOcrResponse>;
}
/** Engine-level options for `createMistralOcrEngine`. */
export interface ISmartAiMistralOcrOptions {
// Used by the built-in HTTP transport, which throws on `process` when it is missing.
apiKey?: string;
// Falls back to `defaultMistralOcrModel` when omitted.
model?: string;
// Used by the built-in HTTP transport; falls back to `defaultMistralOcrEndpointUrl`.
endpointUrl?: string;
// When set, replaces the built-in HTTP transport.
transport?: ISmartAiMistralOcrTransport;
// The remaining options are engine-wide defaults that a `recognizeImage` call may override.
includeImageBase64?: boolean;
tableFormat?: TSmartAiMistralOcrTableFormat;
extractHeader?: boolean;
extractFooter?: boolean;
confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
/** Per-call options for `recognizeImage`; each value that is set takes precedence over the engine-level one. */
export interface ISmartAiMistralOcrRecognizeOptions {
includeImageBase64?: boolean;
tableFormat?: TSmartAiMistralOcrTableFormat;
extractHeader?: boolean;
extractFooter?: boolean;
confidenceScoresGranularity?: TSmartAiMistralOcrConfidenceScoresGranularity;
}
// Model sent in the request when the engine options do not name one.
const defaultMistralOcrModel = 'mistral-ocr-latest';
// Endpoint the built-in HTTP transport posts to when no `endpointUrl` is given.
const defaultMistralOcrEndpointUrl = 'https://api.mistral.ai/v1/ocr';
/**
 * Creates the built-in transport, which POSTs the OCR request as JSON with
 * bearer authentication using the global `fetch`.
 *
 * The API key is checked when `process` is called, not when the transport is
 * created.
 *
 * @param options.apiKey API key sent in the `Authorization` header.
 * @param options.endpointUrl Target URL; defaults to `defaultMistralOcrEndpointUrl`.
 * @throws Error from `process` when the API key is missing or the HTTP status
 * is not OK (the message then carries the status and the response body).
 */
const createMistralOcrHttpTransport = (options: {
  apiKey?: string;
  endpointUrl?: string;
}): ISmartAiMistralOcrTransport => {
  const httpTransport: ISmartAiMistralOcrTransport = {
    async process(request) {
      const { apiKey, endpointUrl } = options;
      if (!apiKey) {
        throw new Error('Mistral OCR requires an apiKey when no custom transport is provided.');
      }

      const targetUrl = endpointUrl ?? defaultMistralOcrEndpointUrl;
      const httpResponse = await fetch(targetUrl, {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify(request),
      });

      if (httpResponse.ok) {
        return (await httpResponse.json()) as IMistralOcrResponse;
      }

      // Surface the response body so the failure reason is visible to callers.
      const errorBody = await httpResponse.text();
      throw new Error(`Mistral OCR request failed with status ${httpResponse.status}: ${errorBody}`);
    },
  };
  return httpTransport;
};
/**
 * Reads the average confidence score of a response page.
 *
 * Both the score container and the score itself may be spelled in snake_case
 * or camelCase; the snake_case spelling is preferred at each level.
 *
 * @returns the average page confidence, or `undefined` when none is reported.
 */
const getPageConfidence = (page: IMistralOcrPageResponse): number | undefined => {
  const scores = page.confidence_scores ?? page.confidenceScores;
  if (scores == null) {
    return undefined;
  }
  const { average_page_confidence_score: snakeAverage, averagePageConfidenceScore: camelAverage } = scores;
  return snakeAverage ?? camelAverage;
};
/**
 * Creates an OCR engine that sends images to a Mistral OCR transport and
 * normalizes the response into plain text plus optional confidence values.
 *
 * @param options Engine-level settings. `transport` replaces the built-in
 * HTTP transport; `model` defaults to `defaultMistralOcrModel`; the remaining
 * flags are defaults that individual `recognizeImage` calls may override.
 * @returns an engine whose `recognizeImage` resolves with the joined page
 * text, the mean of the reported page confidences (or `undefined` when no
 * page reports one), the per-page results and the raw transport response.
 * @throws Error from `recognizeImage` when `dataBase64` or `mimeType` is
 * empty, or when the transport response does not contain a `pages` array.
 */
export const createMistralOcrEngine = (
  options: ISmartAiMistralOcrOptions = {}
): ISmartAiOcrEngine => {
  const transport =
    options.transport ??
    createMistralOcrHttpTransport({
      apiKey: options.apiKey,
      endpointUrl: options.endpointUrl,
    });
  const model = options.model ?? defaultMistralOcrModel;

  return {
    recognizeImage: async (input, recognizeOptions = {}) => {
      if (!input.dataBase64) {
        throw new Error('Mistral OCR image input requires dataBase64.');
      }
      if (!input.mimeType) {
        throw new Error('Mistral OCR image input requires mimeType.');
      }

      // Per-call options take precedence over the engine-level defaults.
      const response = await transport.process({
        model,
        document: {
          type: 'image_url',
          image_url: `data:${input.mimeType};base64,${input.dataBase64}`,
        },
        include_image_base64:
          recognizeOptions.includeImageBase64 ?? options.includeImageBase64 ?? false,
        table_format: recognizeOptions.tableFormat ?? options.tableFormat,
        extract_header: recognizeOptions.extractHeader ?? options.extractHeader,
        extract_footer: recognizeOptions.extractFooter ?? options.extractFooter,
        confidence_scores_granularity:
          recognizeOptions.confidenceScoresGranularity ?? options.confidenceScoresGranularity,
      });

      // The response comes from an external service or a user-supplied
      // transport: fail with a clear message instead of a TypeError on `.map`.
      if (!response || !Array.isArray(response.pages)) {
        throw new Error('Mistral OCR response did not contain a pages array.');
      }

      const pages = response.pages.map((page) => ({
        index: page.index,
        text: page.markdown,
        confidence: getPageConfidence(page),
      }));

      // Average only over pages that actually report a numeric confidence.
      const pageConfidences = pages
        .map((page) => page.confidence)
        .filter((confidence): confidence is number => typeof confidence === 'number');
      const confidence = pageConfidences.length
        ? pageConfidences.reduce((sum, value) => sum + value, 0) / pageConfidences.length
        : undefined;

      return {
        text: pages.map((page) => page.text).join('\n\n').trim(),
        confidence,
        pages,
        raw: response,
      };
    },
  };
};