feat(ocr): add smartai extraction support

This commit is contained in:
2026-05-19 06:42:42 +00:00
parent d86a83d515
commit 30780e7514
8 changed files with 9864 additions and 3355 deletions
+5 -3
View File
@@ -21,8 +21,9 @@
"@types/node": "^20.12.7"
},
"dependencies": {
"@push.rocks/smartfile": "^11.0.14",
"@push.rocks/smartpath": "^5.0.14",
"@push.rocks/smartai": "^4.1.0",
"@push.rocks/smartfile": "^13.1.3",
"@push.rocks/smartpath": "^6.0.0",
"@push.rocks/smartpromise": "^4.0.2",
"@push.rocks/smartshell": "^3.0.3",
"@push.rocks/smartunique": "^3.0.3"
@@ -60,5 +61,6 @@
"url": "https://code.foss.global/push.rocks/smartocr/issues"
},
"homepage": "https://code.foss.global/push.rocks/smartocr",
"type": "module"
"type": "module",
"packageManager": "pnpm@10.28.2"
}
+9636 -3316
View File
File diff suppressed because it is too large Load Diff
+53 -4
View File
@@ -1,18 +1,19 @@
# @push.rocks/smartocr
an ocr module using ocrmypdf
OCR utilities for PDF text-layer generation with `ocrmypdf` and optional SmartAI-powered text extraction.
## Install
To install `@push.rocks/smartocr`, use the following command with npm:
To install `@push.rocks/smartocr`, use pnpm:
```bash
npm install @push.rocks/smartocr --save
pnpm install @push.rocks/smartocr
```
This module depends on a few external utilities like `ocrmypdf`, so make sure you have these installed and available in your system's PATH. Consult the `ocrmypdf` documentation for installation instructions suitable for your operating system.
## Usage
This module provides a TypeScript interface for OCR processing of PDF documents using `ocrmypdf`, encapsulated in the `SmartOcr` class. Here's how to leverage it in your TypeScript project.
This module provides a TypeScript interface for OCR processing of PDF documents using `ocrmypdf`, encapsulated in the `SmartOcr` class. It can also call SmartAI OCR for image buffers, or SmartAI document analysis for PDF text extraction with a vision-capable model.
### Preparing Your Project
@@ -45,6 +46,54 @@ await fs.promises.writeFile('./path/to/output/document_ocr.pdf', ocredPdfBuffer)
In the example above, we import the `SmartOcr` class and use it to process a PDF by passing a `Buffer` of the PDF file to the `processPdfBuffer` method. The method returns a `Buffer` of the processed PDF which includes a text layer added by OCR.
### SmartAI Image OCR
For image inputs, use SmartAI's OCR engine. By default this uses the Mistral OCR engine from `@push.rocks/smartai/ocr`; pass `mistralOcrOptions.apiKey`, set `MISTRAL_API_KEY`, or inject a custom `smartAiOcrEngine`.
```typescript
import { SmartOcr } from '@push.rocks/smartocr';
import * as fs from 'fs';
const smartOcr = await SmartOcr.createAndInit({
mistralOcrOptions: {
apiKey: process.env.MISTRAL_API_KEY,
confidenceScoresGranularity: 'page',
},
});
const imageBuffer = await fs.promises.readFile('./scan.png');
const result = await smartOcr.recognizeImageBufferWithSmartAi(imageBuffer, {
mimeType: 'image/png',
});
console.log(result.text);
console.log(result.confidence);
```
### SmartAI PDF Text Extraction
For cases where you want extracted text instead of a searchable PDF, pass a SmartAI model to `extractTextFromPdfBufferWithSmartAi()`. This uses `@push.rocks/smartai/document`, which converts PDF pages to images and asks a vision-capable model to extract text.
```typescript
import { SmartOcr } from '@push.rocks/smartocr';
import { getModel } from '@push.rocks/smartai';
import * as fs from 'fs';
const smartOcr = await SmartOcr.createAndInit();
const model = getModel({
provider: 'anthropic',
model: 'claude-sonnet-4-5-20250929',
apiKey: process.env.ANTHROPIC_TOKEN,
});
const pdfBuffer = await fs.promises.readFile('./scan.pdf');
const extractedText = await smartOcr.extractTextFromPdfBufferWithSmartAi(pdfBuffer, {
model,
});
console.log(extractedText);
```
### Advanced Usage
The `SmartOcr` class maintains an internal `smartshell` instance to interface with the `ocrmypdf` command. This setup is abstracted away, ensuring you don't need to manage or understand the underlying shell commands to use OCR functionality in your application.
+45 -3
View File
@@ -1,5 +1,6 @@
import { expect, tap } from '@push.rocks/tapbundle';
import * as smartocr from '../ts/index.js';
import type { ISmartAiOcrEngine } from '@push.rocks/smartai/ocr';
let testOcrInstance: smartocr.SmartOcr;
@@ -8,11 +9,52 @@ tap.test('should create a valid instance of Smartocr', async () => {
expect(testOcrInstance).toBeInstanceOf(smartocr.SmartOcr);
});
// Checks that recognizeImageBufferWithSmartAi() hands the injected engine a base64 payload
// plus the MIME type, and that the engine's text and confidence come back to the caller.
tap.test('should recognize image buffers through SmartAI OCR', async () => {
  const imageBytes = Buffer.from('image data');
  const recognizedText = 'hello from smartai';
  const recognizedConfidence = 0.92;
  const receivedInputs: unknown[] = [];

  // Stub engine: records every input it is given and answers with a fixed single-page result.
  const stubEngine: ISmartAiOcrEngine = {
    recognizeImage: async (input) => {
      receivedInputs.push(input);
      return {
        text: recognizedText,
        confidence: recognizedConfidence,
        pages: [{ index: 0, text: recognizedText, confidence: recognizedConfidence }],
        raw: {
          pages: [{ index: 0, markdown: recognizedText }],
          model: 'mock-ocr',
        },
      };
    },
  };

  const ocrResult = await testOcrInstance.recognizeImageBufferWithSmartAi(imageBytes, {
    mimeType: 'image/png',
    smartAiOcrEngine: stubEngine,
  });

  expect(ocrResult.text).toEqual(recognizedText);
  expect(ocrResult.confidence).toEqual(recognizedConfidence);
  expect(receivedInputs[0]).toEqual({
    dataBase64: imageBytes.toString('base64'),
    mimeType: 'image/png',
  });
});
// Loads ./test/demo_without_textlayer.pdf via @push.rocks/smartfile, passes its content buffer
// to processPdfBuffer() and logs the returned buffer.
// NOTE(review): the result is only printed; no expectation is made on it — consider asserting.
tap.test('should ocr a pdfBuffer', async () => {
const smartfile = await import('@push.rocks/smartfile');
const pdfBuffer = (await smartfile.SmartFile.fromFilePath('./test/demo_without_textlayer.pdf'))
.contentBuffer;
const resultBuffer = await testOcrInstance.processPdfBuffer(pdfBuffer);
const smartfileFactory = smartfile.SmartFileFactory.nodeFs();
const pdfFile = await smartfileFactory.fromFilePath('./test/demo_without_textlayer.pdf');
const resultBuffer = await testOcrInstance.processPdfBuffer(pdfFile.contentBuffer);
console.log(resultBuffer);
});
+1
View File
@@ -1 +1,2 @@
export * from './smartocr.classes.smartocr.js';
export type * from './smartocr.interfaces.js';
+68 -6
View File
@@ -1,40 +1,102 @@
import * as plugins from './smartocr.plugins.js';
import * as paths from './smartocr.paths.js';
import type {
ISmartOcrConstructorOptions,
ISmartOcrImageAiOptions,
ISmartOcrPdfAiOptions,
TSmartOcrAiResult,
} from './smartocr.interfaces.js';
// Fallback system prompt for extractTextFromPdfBufferWithSmartAi() when the caller passes no systemMessage.
const defaultSmartAiPdfOcrSystemMessage = 'You are a precise OCR engine. Extract text faithfully and do not summarize.';
// Fallback user prompt for extractTextFromPdfBufferWithSmartAi() when the caller passes no userMessage.
const defaultSmartAiPdfOcrUserMessage = 'Extract all readable text from this PDF. Preserve page order, line breaks, table structure, and obvious headings. Return only the extracted text.';
export class SmartOcr {
// STATIC
/**
 * Convenience factory: constructs a SmartOcr instance and awaits its init() before returning it.
 *
 * @param optionsArg constructor options forwarded to `new SmartOcr(...)`; defaults to `{}`
 * @returns the initialized instance
 */
public static async createAndInit() {
const smartocrInstance = new SmartOcr();
public static async createAndInit(optionsArg: ISmartOcrConstructorOptions = {}) {
const smartocrInstance = new SmartOcr(optionsArg);
await smartocrInstance.init();
return smartocrInstance;
}
// INSTANCE
// options handed to the constructor; read by getSmartAiOcrEngine() for instance-level engine/Mistral settings
private options: ISmartOcrConstructorOptions;
// deferred created at construction time
// NOTE(review): nothing in the visible code resolves or awaits it — confirm where it is used
public readyDeferred = plugins.smartpromise.defer();
// node-fs backed SmartFile factory used to write and read the temporary PDFs and to create the working directory
public smartfileFactory = plugins.smartfile.SmartFileFactory.nodeFs();
// shell wrapper (created with the bash executor in the constructor) used to run `ocrmypdf`
public smartshellInstance: plugins.smartshell.Smartshell;
/**
 * Runs `ocrmypdf --rotate-pages` over a PDF and returns the processed PDF.
 *
 * The input buffer is written to a temporary file under paths.noGitDir, the command writes a
 * second temporary file next to it, and both files are deleted before the result is returned.
 *
 * @param pdfBufferArg content of the PDF to process
 * @returns content buffer of the file produced by `ocrmypdf`
 */
public async processPdfBuffer (pdfBufferArg: Buffer): Promise<Buffer> {
// one generated id names both temporary files of this call
const uniqueString = plugins.smartunique.uni('doc_');
const originalPath = plugins.path.join(paths.noGitDir, `${uniqueString}.pdf`);
const processedPath = plugins.path.join(paths.noGitDir, `${uniqueString}_processed.pdf`);
const originalSmartfile = await plugins.smartfile.SmartFile.fromBuffer(originalPath, pdfBufferArg);
const originalSmartfile = this.smartfileFactory.fromBuffer(originalPath, pdfBufferArg, paths.noGitDir);
await originalSmartfile.write();
// NOTE(review): both paths are interpolated unquoted into the command line — a noGitDir containing
// spaces or shell metacharacters would break the call; consider quoting.
await this.smartshellInstance.exec(`ocrmypdf --rotate-pages ${originalPath} ${processedPath}`);
const processedSmartfile = await plugins.smartfile.SmartFile.fromFilePath(processedPath);
const processedSmartfile = await this.smartfileFactory.fromFilePath(processedPath, paths.noGitDir);
// NOTE(review): cleanup is only reached when exec and the read above succeed; there is no
// try/finally, so a failure leaves the temporary file(s) behind.
await originalSmartfile.delete();
await processedSmartfile.delete();
// presumably contentBuffer was loaded by fromFilePath and stays valid after delete() — confirm
return processedSmartfile.contentBuffer;
}
constructor() {
/**
 * Runs OCR on an image through a SmartAI OCR engine.
 *
 * The engine is picked by getSmartAiOcrEngine(); the image is handed to it as a base64 string
 * together with its MIME type, and `recognizeOptions` is passed along as the second argument.
 *
 * @param imageBufferArg raw bytes of the image
 * @param optionsArg MIME type plus optional engine, Mistral and recognize settings
 * @returns whatever the engine's recognizeImage() resolves to
 */
public async recognizeImageBufferWithSmartAi(
  imageBufferArg: Buffer,
  optionsArg: ISmartOcrImageAiOptions,
): Promise<TSmartOcrAiResult> {
  const engine = this.getSmartAiOcrEngine(optionsArg);
  const { mimeType, recognizeOptions } = optionsArg;
  const imageInput = {
    dataBase64: imageBufferArg.toString('base64'),
    mimeType,
  };
  return engine.recognizeImage(imageInput, recognizeOptions);
}
/**
 * Extracts text from a PDF by sending it to SmartAI's analyzeDocuments() with the given model.
 *
 * Missing system/user messages fall back to the module-level default OCR prompts; the PDF is
 * passed as the only entry of `pdfDocuments`, and `messageHistory` is forwarded unchanged.
 *
 * @param pdfBufferArg content of the PDF
 * @param optionsArg model to use, plus optional prompts and message history
 * @returns the string resolved by analyzeDocuments()
 */
public async extractTextFromPdfBufferWithSmartAi(
  pdfBufferArg: Buffer,
  optionsArg: ISmartOcrPdfAiOptions,
): Promise<string> {
  const { model, messageHistory } = optionsArg;
  const systemMessage = optionsArg.systemMessage ?? defaultSmartAiPdfOcrSystemMessage;
  const userMessage = optionsArg.userMessage ?? defaultSmartAiPdfOcrUserMessage;
  const pdfDocuments = [pdfBufferArg];
  return plugins.smartaiDocument.analyzeDocuments({
    model,
    systemMessage,
    userMessage,
    pdfDocuments,
    messageHistory,
  });
}
/**
 * Stores the given options and creates the bash-backed shell instance.
 * No I/O happens here; directory setup is done in init().
 *
 * @param optionsArg instance-level SmartAI OCR settings; defaults to `{}`
 */
constructor(optionsArg: ISmartOcrConstructorOptions = {}) {
  const shellOptions = { executor: 'bash' as const };
  this.options = optionsArg;
  this.smartshellInstance = new plugins.smartshell.Smartshell(shellOptions);
}
/**
 * Prepares the instance for use: makes sure the working directory paths.noGitDir exists and
 * calls smartshell.which('ocrmypdf').
 */
public async init() {
await plugins.smartfile.fs.ensureDir(paths.noGitDir);
await this.smartfileFactory.getSmartFs().directory(paths.noGitDir).recursive().create();
// NOTE(review): the lookup result is assigned but never used, and readyDeferred is not resolved
// in this method — confirm whether a missing `ocrmypdf` should be reported here.
const result = await plugins.smartshell.which('ocrmypdf');
}
/**
 * Picks the OCR engine for one recognition call.
 *
 * Precedence: engine from the per-call options, then engine from the constructor options,
 * then a Mistral engine built from the merged Mistral options (per-call values override
 * constructor values). The API key comes from the merged options or, failing that, from
 * the MISTRAL_API_KEY environment variable.
 *
 * @param optionsArg per-call image OCR options
 * @returns the engine to use
 * @throws Error when no engine is supplied and no API key can be found
 */
private getSmartAiOcrEngine(optionsArg: ISmartOcrImageAiOptions): plugins.smartaiOcr.ISmartAiOcrEngine {
  const injectedEngine = optionsArg.smartAiOcrEngine || this.options.smartAiOcrEngine;
  if (injectedEngine) {
    return injectedEngine;
  }

  const mergedMistralOptions = {
    ...this.options.mistralOcrOptions,
    ...optionsArg.mistralOcrOptions,
  };
  const resolvedApiKey = mergedMistralOptions.apiKey ?? process.env.MISTRAL_API_KEY;

  if (resolvedApiKey) {
    return plugins.smartaiOcr.createMistralOcrEngine({
      ...mergedMistralOptions,
      apiKey: resolvedApiKey,
    });
  }

  throw new Error('SmartAI OCR requires smartAiOcrEngine, mistralOcrOptions.apiKey, or MISTRAL_API_KEY.');
}
}
+29
View File
@@ -0,0 +1,29 @@
import type { ModelMessage, LanguageModelV3 } from '@push.rocks/smartai';
import type {
ISmartAiMistralOcrOptions,
ISmartAiMistralOcrRecognizeOptions,
ISmartAiOcrEngine,
ISmartAiOcrResult,
TSmartAiOcrImageMimeType,
} from '@push.rocks/smartai/ocr';
/**
 * Instance-level options accepted by the SmartOcr constructor and SmartOcr.createAndInit().
 */
export interface ISmartOcrConstructorOptions {
/** Engine used for image OCR when a call does not supply its own. */
smartAiOcrEngine?: ISmartAiOcrEngine;
/** Mistral OCR settings used when no engine is supplied; per-call values override these. */
mistralOcrOptions?: ISmartAiMistralOcrOptions;
}
/**
 * Per-call options for SmartOcr.recognizeImageBufferWithSmartAi().
 */
export interface ISmartOcrImageAiOptions {
/** MIME type of the image; passed to the engine alongside the base64 data. */
mimeType: TSmartAiOcrImageMimeType;
/** Engine to use for this call; takes precedence over the instance-level engine. */
smartAiOcrEngine?: ISmartAiOcrEngine;
/** Mistral OCR settings for this call; merged over the instance-level settings. */
mistralOcrOptions?: ISmartAiMistralOcrOptions;
/** Passed unchanged as the second argument of the engine's recognizeImage(). */
recognizeOptions?: ISmartAiMistralOcrRecognizeOptions;
}
/**
 * Options for SmartOcr.extractTextFromPdfBufferWithSmartAi().
 */
export interface ISmartOcrPdfAiOptions {
/** Model handed to SmartAI's analyzeDocuments(). */
model: LanguageModelV3;
/** System prompt; a default OCR prompt is used when omitted. */
systemMessage?: string;
/** User prompt; a default extraction prompt is used when omitted. */
userMessage?: string;
/** Forwarded unchanged to analyzeDocuments(). */
messageHistory?: ModelMessage[];
}
/**
 * Result type of SmartOcr.recognizeImageBufferWithSmartAi(): an alias of SmartAI's
 * ISmartAiOcrResult whose raw payload type defaults to `unknown`.
 */
export type TSmartOcrAiResult<TRaw = unknown> = ISmartAiOcrResult<TRaw>;
+4
View File
@@ -7,6 +7,8 @@ export {
// @pushrocks scope
import * as smartfile from '@push.rocks/smartfile';
import * as smartaiDocument from '@push.rocks/smartai/document';
import * as smartaiOcr from '@push.rocks/smartai/ocr';
import * as smartshell from '@push.rocks/smartshell';
import * as smartunique from '@push.rocks/smartunique';
import * as smartpath from '@push.rocks/smartpath';
@@ -14,6 +16,8 @@ import * as smartpromise from '@push.rocks/smartpromise';
export {
smartfile,
smartaiDocument,
smartaiOcr,
smartshell,
smartunique,
smartpath,