feat(ocr): add smartai extraction support
This commit is contained in:
+6
-4
@@ -21,8 +21,9 @@
|
||||
"@types/node": "^20.12.7"
|
||||
},
|
||||
"dependencies": {
|
||||
"@push.rocks/smartfile": "^11.0.14",
|
||||
"@push.rocks/smartpath": "^5.0.14",
|
||||
"@push.rocks/smartai": "^4.1.0",
|
||||
"@push.rocks/smartfile": "^13.1.3",
|
||||
"@push.rocks/smartpath": "^6.0.0",
|
||||
"@push.rocks/smartpromise": "^4.0.2",
|
||||
"@push.rocks/smartshell": "^3.0.3",
|
||||
"@push.rocks/smartunique": "^3.0.3"
|
||||
@@ -60,5 +61,6 @@
|
||||
"url": "https://code.foss.global/push.rocks/smartocr/issues"
|
||||
},
|
||||
"homepage": "https://code.foss.global/push.rocks/smartocr",
|
||||
"type": "module"
|
||||
}
|
||||
"type": "module",
|
||||
"packageManager": "pnpm@10.28.2"
|
||||
}
|
||||
|
||||
Generated
+9657
-3337
File diff suppressed because it is too large
Load Diff
@@ -1,18 +1,19 @@
|
||||
# @push.rocks/smartocr
|
||||
an ocr module using ocrmypdf
|
||||
|
||||
OCR utilities for PDF text-layer generation with `ocrmypdf` and optional SmartAI-powered text extraction.
|
||||
|
||||
## Install
|
||||
To install `@push.rocks/smartocr`, use the following command with npm:
|
||||
To install `@push.rocks/smartocr`, use pnpm:
|
||||
|
||||
```bash
|
||||
npm install @push.rocks/smartocr --save
|
||||
pnpm install @push.rocks/smartocr
|
||||
```
|
||||
|
||||
This module depends on a few external utilities like `ocrmypdf`, so make sure you have these installed and available in your system's PATH. Consult the `ocrmypdf` documentation for installation instructions suitable for your operating system.
|
||||
|
||||
## Usage
|
||||
|
||||
This module provides a TypeScript interface for OCR processing of PDF documents using `ocrmypdf`, encapsulated in the `SmartOcr` class. Here's how to leverage it in your TypeScript project.
|
||||
This module provides a TypeScript interface for OCR processing of PDF documents using `ocrmypdf`, encapsulated in the `SmartOcr` class. It can also call SmartAI OCR for image buffers, or SmartAI document analysis for PDF text extraction with a vision-capable model.
|
||||
|
||||
### Preparing Your Project
|
||||
|
||||
@@ -45,6 +46,54 @@ await fs.promises.writeFile('./path/to/output/document_ocr.pdf', ocredPdfBuffer)
|
||||
|
||||
In the example above, we import the `SmartOcr` class and use it to process a PDF by passing a `Buffer` of the PDF file to the `processPdfBuffer` method. The method returns a `Buffer` of the processed PDF which includes a text layer added by OCR.
|
||||
|
||||
### SmartAI Image OCR
|
||||
|
||||
For image inputs, use SmartAI's OCR engine. By default this uses the Mistral OCR engine from `@push.rocks/smartai/ocr`; pass `mistralOcrOptions.apiKey`, set `MISTRAL_API_KEY`, or inject a custom `smartAiOcrEngine`.
|
||||
|
||||
```typescript
|
||||
import { SmartOcr } from '@push.rocks/smartocr';
|
||||
import * as fs from 'fs';
|
||||
|
||||
const smartOcr = await SmartOcr.createAndInit({
|
||||
mistralOcrOptions: {
|
||||
apiKey: process.env.MISTRAL_API_KEY,
|
||||
confidenceScoresGranularity: 'page',
|
||||
},
|
||||
});
|
||||
|
||||
const imageBuffer = await fs.promises.readFile('./scan.png');
|
||||
const result = await smartOcr.recognizeImageBufferWithSmartAi(imageBuffer, {
|
||||
mimeType: 'image/png',
|
||||
});
|
||||
|
||||
console.log(result.text);
|
||||
console.log(result.confidence);
|
||||
```
|
||||
|
||||
### SmartAI PDF Text Extraction
|
||||
|
||||
For cases where you want extracted text instead of a searchable PDF, pass a SmartAI model to `extractTextFromPdfBufferWithSmartAi()`. This uses `@push.rocks/smartai/document`, which converts PDF pages to images and asks a vision-capable model to extract text.
|
||||
|
||||
```typescript
|
||||
import { SmartOcr } from '@push.rocks/smartocr';
|
||||
import { getModel } from '@push.rocks/smartai';
|
||||
import * as fs from 'fs';
|
||||
|
||||
const smartOcr = await SmartOcr.createAndInit();
|
||||
const model = getModel({
|
||||
provider: 'anthropic',
|
||||
model: 'claude-sonnet-4-5-20250929',
|
||||
apiKey: process.env.ANTHROPIC_TOKEN,
|
||||
});
|
||||
|
||||
const pdfBuffer = await fs.promises.readFile('./scan.pdf');
|
||||
const extractedText = await smartOcr.extractTextFromPdfBufferWithSmartAi(pdfBuffer, {
|
||||
model,
|
||||
});
|
||||
|
||||
console.log(extractedText);
|
||||
```
|
||||
|
||||
### Advanced Usage
|
||||
|
||||
The `SmartOcr` class maintains an internal `smartshell` instance to interface with the `ocrmypdf` command. This setup is abstracted away, ensuring you don't need to manage or understand the underlying shell commands to use OCR functionality in your application.
|
||||
|
||||
+45
-3
@@ -1,5 +1,6 @@
|
||||
import { expect, tap } from '@push.rocks/tapbundle';
|
||||
import * as smartocr from '../ts/index.js';
|
||||
import type { ISmartAiOcrEngine } from '@push.rocks/smartai/ocr';
|
||||
|
||||
let testOcrInstance: smartocr.SmartOcr;
|
||||
|
||||
@@ -8,11 +9,52 @@ tap.test('should create a valid instance of Smartocr', async () => {
|
||||
expect(testOcrInstance).toBeInstanceOf(smartocr.SmartOcr);
|
||||
});
|
||||
|
||||
// Checks that an image buffer is base64-encoded and handed to an injected SmartAI OCR engine,
// and that the engine's result is returned to the caller unchanged.
tap.test('should recognize image buffers through SmartAI OCR', async () => {
  const recognizedText = 'hello from smartai';
  const recognizedConfidence = 0.92;
  const imageBuffer = Buffer.from('image data');
  const calls: unknown[] = [];

  // Stub engine: records what it was called with and answers with a canned single-page result.
  const smartAiOcrEngine: ISmartAiOcrEngine = {
    recognizeImage: async (input) => {
      calls.push(input);
      const page = { index: 0, text: recognizedText, confidence: recognizedConfidence };
      const raw = {
        pages: [{ index: 0, markdown: recognizedText }],
        model: 'mock-ocr',
      };
      return {
        text: recognizedText,
        confidence: recognizedConfidence,
        pages: [page],
        raw,
      };
    },
  };

  const result = await testOcrInstance.recognizeImageBufferWithSmartAi(imageBuffer, {
    mimeType: 'image/png',
    smartAiOcrEngine,
  });

  expect(result.text).toEqual('hello from smartai');
  expect(result.confidence).toEqual(0.92);
  expect(calls[0]).toEqual({
    dataBase64: imageBuffer.toString('base64'),
    mimeType: 'image/png',
  });
});
|
||||
|
||||
// Runs the ocrmypdf pipeline on a fixture PDF that has no text layer.
// processPdfBuffer() shells out to `ocrmypdf`, so that binary must be installed for this test.
tap.test('should ocr a pdfBuffer', async () => {
  const smartfile = await import('@push.rocks/smartfile');
  const smartfileFactory = smartfile.SmartFileFactory.nodeFs();
  const pdfFile = await smartfileFactory.fromFilePath('./test/demo_without_textlayer.pdf');
  const resultBuffer = await testOcrInstance.processPdfBuffer(pdfFile.contentBuffer);
  console.log(resultBuffer);
  // The result used to be logged only, so the test could never fail on a bad return value.
  expect(resultBuffer).toBeInstanceOf(Buffer);
});
|
||||
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
export * from './smartocr.classes.smartocr.js';
|
||||
export type * from './smartocr.interfaces.js';
|
||||
|
||||
@@ -1,40 +1,102 @@
|
||||
import * as plugins from './smartocr.plugins.js';
|
||||
import * as paths from './smartocr.paths.js';
|
||||
import type {
|
||||
ISmartOcrConstructorOptions,
|
||||
ISmartOcrImageAiOptions,
|
||||
ISmartOcrPdfAiOptions,
|
||||
TSmartOcrAiResult,
|
||||
} from './smartocr.interfaces.js';
|
||||
|
||||
// System prompt used by extractTextFromPdfBufferWithSmartAi() when the caller supplies no `systemMessage`.
const defaultSmartAiPdfOcrSystemMessage = 'You are a precise OCR engine. Extract text faithfully and do not summarize.';
|
||||
// User prompt used by extractTextFromPdfBufferWithSmartAi() when the caller supplies no `userMessage`.
const defaultSmartAiPdfOcrUserMessage = 'Extract all readable text from this PDF. Preserve page order, line breaks, table structure, and obvious headings. Return only the extracted text.';
|
||||
|
||||
export class SmartOcr {
|
||||
// STATIC
|
||||
public static async createAndInit() {
|
||||
const smartocrInstance = new SmartOcr();
|
||||
/**
 * Convenience factory: constructs a SmartOcr with the given options and awaits its `init()`.
 *
 * @param optionsArg constructor options; defaults to an empty object
 * @returns the initialized instance
 */
public static async createAndInit(optionsArg: ISmartOcrConstructorOptions = {}) {
  const instance = new SmartOcr(optionsArg);
  await instance.init();
  return instance;
}
|
||||
|
||||
// INSTANCE
|
||||
|
||||
private options: ISmartOcrConstructorOptions;
|
||||
public readyDeferred = plugins.smartpromise.defer();
|
||||
public smartfileFactory = plugins.smartfile.SmartFileFactory.nodeFs();
|
||||
public smartshellInstance: plugins.smartshell.Smartshell;
|
||||
|
||||
/**
 * Adds an OCR text layer to a PDF by running `ocrmypdf --rotate-pages` on it.
 *
 * The buffer is written to a uniquely named file inside `paths.noGitDir`, processed by
 * `ocrmypdf`, and the output file is read back into memory. The output file is deleted
 * once it has been read; the input file is deleted even when processing fails.
 *
 * @param pdfBufferArg the PDF to process
 * @returns the contents of the PDF written by `ocrmypdf`
 */
public async processPdfBuffer(pdfBufferArg: Buffer): Promise<Buffer> {
  const uniqueString = plugins.smartunique.uni('doc_');
  const originalPath = plugins.path.join(paths.noGitDir, `${uniqueString}.pdf`);
  const processedPath = plugins.path.join(paths.noGitDir, `${uniqueString}_processed.pdf`);

  // The command runs through bash, so single-quote each path; otherwise a working
  // directory containing spaces or shell metacharacters would split or alter the arguments.
  const shellQuote = (pathArg: string): string => `'${pathArg.replace(/'/g, "'\\''")}'`;

  const originalSmartfile = this.smartfileFactory.fromBuffer(originalPath, pdfBufferArg, paths.noGitDir);
  await originalSmartfile.write();
  try {
    await this.smartshellInstance.exec(
      `ocrmypdf --rotate-pages ${shellQuote(originalPath)} ${shellQuote(processedPath)}`,
    );
    const processedSmartfile = await this.smartfileFactory.fromFilePath(processedPath, paths.noGitDir);
    // The content is already in memory, so the on-disk copy can be removed before returning.
    await processedSmartfile.delete();
    return processedSmartfile.contentBuffer;
  } finally {
    // Always remove the input temp file, including when ocrmypdf fails or produces no output.
    await originalSmartfile.delete();
  }
}
|
||||
|
||||
constructor() {
|
||||
/**
 * Runs OCR on an image buffer through a SmartAI OCR engine.
 *
 * The engine is chosen by `getSmartAiOcrEngine()`. The image is passed to the engine as a
 * base64 string together with its mime type, and `optionsArg.recognizeOptions` is forwarded
 * as the second argument of `recognizeImage`.
 *
 * @param imageBufferArg raw image bytes
 * @param optionsArg mime type plus optional engine, Mistral and recognize options
 * @returns whatever the engine's `recognizeImage` resolves to
 */
public async recognizeImageBufferWithSmartAi(
  imageBufferArg: Buffer,
  optionsArg: ISmartOcrImageAiOptions,
): Promise<TSmartOcrAiResult> {
  const engine = this.getSmartAiOcrEngine(optionsArg);
  const { mimeType, recognizeOptions } = optionsArg;
  const imageInput = {
    dataBase64: imageBufferArg.toString('base64'),
    mimeType,
  };
  return engine.recognizeImage(imageInput, recognizeOptions);
}
|
||||
|
||||
/**
 * Extracts text from a PDF by delegating to `analyzeDocuments` from `@push.rocks/smartai/document`.
 *
 * The PDF is sent as the only entry of `pdfDocuments`. When the caller gives no
 * `systemMessage` / `userMessage`, the module-level default OCR prompts are used.
 *
 * @param pdfBufferArg the PDF to read
 * @param optionsArg model to use plus optional prompt overrides and message history
 * @returns the string resolved by `analyzeDocuments`
 */
public async extractTextFromPdfBufferWithSmartAi(
  pdfBufferArg: Buffer,
  optionsArg: ISmartOcrPdfAiOptions,
): Promise<string> {
  const { model, messageHistory } = optionsArg;
  const systemMessage = optionsArg.systemMessage ?? defaultSmartAiPdfOcrSystemMessage;
  const userMessage = optionsArg.userMessage ?? defaultSmartAiPdfOcrUserMessage;

  return plugins.smartaiDocument.analyzeDocuments({
    model,
    systemMessage,
    userMessage,
    pdfDocuments: [pdfBufferArg],
    messageHistory,
  });
}
|
||||
|
||||
/**
 * Stores the options and creates the bash-backed smartshell instance used to run `ocrmypdf`.
 * Does no I/O of its own; call `init()` (or use `createAndInit()`) before processing PDFs.
 *
 * @param optionsArg optional SmartAI OCR engine and Mistral OCR defaults
 */
constructor(optionsArg: ISmartOcrConstructorOptions = {}) {
  const bashShell = new plugins.smartshell.Smartshell({ executor: 'bash' });
  this.options = optionsArg;
  this.smartshellInstance = bashShell;
}
|
||||
|
||||
/**
 * Prepares the instance for use: creates the working directory `paths.noGitDir`
 * (recursively) and looks up the `ocrmypdf` binary, then resolves `readyDeferred`.
 */
public async init() {
  await this.smartfileFactory.getSmartFs().directory(paths.noGitDir).recursive().create();
  // The resolved path is not needed, so it is not stored.
  // NOTE(review): presumably `which` rejects when the binary is missing; confirm against smartshell.
  await plugins.smartshell.which('ocrmypdf');
  // Nothing else in the class settles readyDeferred; without this, awaiting it would never finish.
  this.readyDeferred.resolve();
}
|
||||
}
|
||||
|
||||
/**
 * Picks the OCR engine for one recognition call.
 *
 * Precedence: the engine passed per call, then the engine given to the constructor, then a
 * Mistral engine built from the merged Mistral options (per-call values override
 * constructor values).
 *
 * The API key is resolved separately from the merge: per-call key, then constructor key,
 * then `process.env.MISTRAL_API_KEY`. A per-call `apiKey` that is explicitly `undefined`
 * (e.g. an unset env var forwarded by the caller) therefore no longer erases the
 * constructor-level key, as it would if the key came from the object spread.
 *
 * @param optionsArg per-call image OCR options
 * @returns the engine to use
 * @throws Error when neither an engine nor any Mistral API key is available
 */
private getSmartAiOcrEngine(optionsArg: ISmartOcrImageAiOptions): plugins.smartaiOcr.ISmartAiOcrEngine {
  const injectedEngine = optionsArg.smartAiOcrEngine || this.options.smartAiOcrEngine;
  if (injectedEngine) {
    return injectedEngine;
  }

  const mistralOcrOptions = {
    ...this.options.mistralOcrOptions,
    ...optionsArg.mistralOcrOptions,
  };
  const apiKey =
    optionsArg.mistralOcrOptions?.apiKey ??
    this.options.mistralOcrOptions?.apiKey ??
    process.env.MISTRAL_API_KEY;
  if (!apiKey) {
    throw new Error('SmartAI OCR requires smartAiOcrEngine, mistralOcrOptions.apiKey, or MISTRAL_API_KEY.');
  }

  return plugins.smartaiOcr.createMistralOcrEngine({
    ...mistralOcrOptions,
    apiKey,
  });
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
import type { ModelMessage, LanguageModelV3 } from '@push.rocks/smartai';
|
||||
import type {
|
||||
ISmartAiMistralOcrOptions,
|
||||
ISmartAiMistralOcrRecognizeOptions,
|
||||
ISmartAiOcrEngine,
|
||||
ISmartAiOcrResult,
|
||||
TSmartAiOcrImageMimeType,
|
||||
} from '@push.rocks/smartai/ocr';
|
||||
|
||||
/**
 * Instance-wide options for `SmartOcr`. Both fields only affect SmartAI image OCR.
 */
export interface ISmartOcrConstructorOptions {
  /** Engine used for image OCR when a call does not pass its own engine. */
  smartAiOcrEngine?: ISmartAiOcrEngine;

  /** Defaults for the Mistral OCR engine; per-call `mistralOcrOptions` are merged over these. */
  mistralOcrOptions?: ISmartAiMistralOcrOptions;
}
|
||||
|
||||
/**
 * Per-call options for `SmartOcr.recognizeImageBufferWithSmartAi()`.
 */
export interface ISmartOcrImageAiOptions {
  /** Mime type of the image buffer; forwarded to the engine alongside the base64 data. */
  mimeType: TSmartAiOcrImageMimeType;

  /** Engine to use for this call; takes precedence over the engine given to the constructor. */
  smartAiOcrEngine?: ISmartAiOcrEngine;

  /** Mistral OCR engine options for this call; merged over the constructor-level options. */
  mistralOcrOptions?: ISmartAiMistralOcrOptions;

  /** Passed through as the second argument of the engine's `recognizeImage`. */
  recognizeOptions?: ISmartAiMistralOcrRecognizeOptions;
}
|
||||
|
||||
/**
 * Options for `SmartOcr.extractTextFromPdfBufferWithSmartAi()`.
 * All fields are forwarded to `analyzeDocuments` from `@push.rocks/smartai/document`.
 */
export interface ISmartOcrPdfAiOptions {
  /** Language model that performs the extraction. */
  model: LanguageModelV3;

  /** Overrides the built-in OCR system prompt. */
  systemMessage?: string;

  /** Overrides the built-in OCR user prompt. */
  userMessage?: string;

  /** Forwarded unchanged as `messageHistory`. */
  messageHistory?: ModelMessage[];
}
|
||||
|
||||
/** Result type of SmartAI image OCR: an alias of `ISmartAiOcrResult` from `@push.rocks/smartai/ocr`, with `TRaw` defaulting to `unknown`. */
export type TSmartOcrAiResult<TRaw = unknown> = ISmartAiOcrResult<TRaw>;
|
||||
@@ -7,6 +7,8 @@ export {
|
||||
|
||||
// @pushrocks scope
|
||||
import * as smartfile from '@push.rocks/smartfile';
|
||||
import * as smartaiDocument from '@push.rocks/smartai/document';
|
||||
import * as smartaiOcr from '@push.rocks/smartai/ocr';
|
||||
import * as smartshell from '@push.rocks/smartshell';
|
||||
import * as smartunique from '@push.rocks/smartunique';
|
||||
import * as smartpath from '@push.rocks/smartpath';
|
||||
@@ -14,6 +16,8 @@ import * as smartpromise from '@push.rocks/smartpromise';
|
||||
|
||||
export {
|
||||
smartfile,
|
||||
smartaiDocument,
|
||||
smartaiOcr,
|
||||
smartshell,
|
||||
smartunique,
|
||||
smartpath,
|
||||
|
||||
Reference in New Issue
Block a user