feat(ocr): add smartai extraction support
This commit is contained in:
+6
-4
@@ -21,8 +21,9 @@
|
|||||||
"@types/node": "^20.12.7"
|
"@types/node": "^20.12.7"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@push.rocks/smartfile": "^11.0.14",
|
"@push.rocks/smartai": "^4.1.0",
|
||||||
"@push.rocks/smartpath": "^5.0.14",
|
"@push.rocks/smartfile": "^13.1.3",
|
||||||
|
"@push.rocks/smartpath": "^6.0.0",
|
||||||
"@push.rocks/smartpromise": "^4.0.2",
|
"@push.rocks/smartpromise": "^4.0.2",
|
||||||
"@push.rocks/smartshell": "^3.0.3",
|
"@push.rocks/smartshell": "^3.0.3",
|
||||||
"@push.rocks/smartunique": "^3.0.3"
|
"@push.rocks/smartunique": "^3.0.3"
|
||||||
@@ -60,5 +61,6 @@
|
|||||||
"url": "https://code.foss.global/push.rocks/smartocr/issues"
|
"url": "https://code.foss.global/push.rocks/smartocr/issues"
|
||||||
},
|
},
|
||||||
"homepage": "https://code.foss.global/push.rocks/smartocr",
|
"homepage": "https://code.foss.global/push.rocks/smartocr",
|
||||||
"type": "module"
|
"type": "module",
|
||||||
}
|
"packageManager": "pnpm@10.28.2"
|
||||||
|
}
|
||||||
|
|||||||
Generated
+9657
-3337
File diff suppressed because it is too large
Load Diff
@@ -1,18 +1,19 @@
|
|||||||
# @push.rocks/smartocr
|
# @push.rocks/smartocr
|
||||||
an ocr module using ocrmypdf
|
|
||||||
|
OCR utilities for PDF text-layer generation with `ocrmypdf` and optional SmartAI-powered text extraction.
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
To install `@push.rocks/smartocr`, use the following command with npm:
|
To install `@push.rocks/smartocr`, use pnpm:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
npm install @push.rocks/smartocr --save
|
pnpm install @push.rocks/smartocr
|
||||||
```
|
```
|
||||||
|
|
||||||
This module depends on a few external utilities like `ocrmypdf`, so make sure you have these installed and available in your system's PATH. Consult the `ocrmypdf` documentation for installation instructions suitable for your operating system.
|
This module depends on a few external utilities like `ocrmypdf`, so make sure you have these installed and available in your system's PATH. Consult the `ocrmypdf` documentation for installation instructions suitable for your operating system.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
This module provides a TypeScript interface for OCR processing of PDF documents using `ocrmypdf`, encapsulated in the `SmartOcr` class. Here's how to leverage it in your TypeScript project.
|
This module provides a TypeScript interface for OCR processing of PDF documents using `ocrmypdf`, encapsulated in the `SmartOcr` class. It can also call SmartAI OCR for image buffers, or SmartAI document analysis for PDF text extraction with a vision-capable model.
|
||||||
|
|
||||||
### Preparing Your Project
|
### Preparing Your Project
|
||||||
|
|
||||||
@@ -45,6 +46,54 @@ await fs.promises.writeFile('./path/to/output/document_ocr.pdf', ocredPdfBuffer)
|
|||||||
|
|
||||||
In the example above, we import the `SmartOcr` class and use it to process a PDF by passing a `Buffer` of the PDF file to the `processPdfBuffer` method. The method returns a `Buffer` of the processed PDF which includes a text layer added by OCR.
|
In the example above, we import the `SmartOcr` class and use it to process a PDF by passing a `Buffer` of the PDF file to the `processPdfBuffer` method. The method returns a `Buffer` of the processed PDF which includes a text layer added by OCR.
|
||||||
|
|
||||||
|
### SmartAI Image OCR
|
||||||
|
|
||||||
|
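For image inputs, call `recognizeImageBufferWithSmartAi()`. By default it uses the Mistral OCR engine from `@push.rocks/smartai/ocr`, which needs an API key: pass `mistralOcrOptions.apiKey` or set the `MISTRAL_API_KEY` environment variable. Alternatively, inject a custom `smartAiOcrEngine` (in the constructor options or per call); an injected engine is used instead of the Mistral engine, so no Mistral API key is required.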
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { SmartOcr } from '@push.rocks/smartocr';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
|
||||||
|
const smartOcr = await SmartOcr.createAndInit({
|
||||||
|
mistralOcrOptions: {
|
||||||
|
apiKey: process.env.MISTRAL_API_KEY,
|
||||||
|
confidenceScoresGranularity: 'page',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
const imageBuffer = await fs.promises.readFile('./scan.png');
|
||||||
|
const result = await smartOcr.recognizeImageBufferWithSmartAi(imageBuffer, {
|
||||||
|
mimeType: 'image/png',
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(result.text);
|
||||||
|
console.log(result.confidence);
|
||||||
|
```
|
||||||
|
|
||||||
|
### SmartAI PDF Text Extraction
|
||||||
|
|
||||||
|
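For cases where you want extracted text instead of a searchable PDF, call `extractTextFromPdfBufferWithSmartAi()` and pass a SmartAI model as the `model` option. This uses `@push.rocks/smartai/document`, which converts PDF pages to images and asks a vision-capable model to extract text. The default OCR prompts can be overridden with the optional `systemMessage` and `userMessage` options.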
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { SmartOcr } from '@push.rocks/smartocr';
|
||||||
|
import { getModel } from '@push.rocks/smartai';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
|
||||||
|
const smartOcr = await SmartOcr.createAndInit();
|
||||||
|
const model = getModel({
|
||||||
|
provider: 'anthropic',
|
||||||
|
model: 'claude-sonnet-4-5-20250929',
|
||||||
|
apiKey: process.env.ANTHROPIC_TOKEN,
|
||||||
|
});
|
||||||
|
|
||||||
|
const pdfBuffer = await fs.promises.readFile('./scan.pdf');
|
||||||
|
const extractedText = await smartOcr.extractTextFromPdfBufferWithSmartAi(pdfBuffer, {
|
||||||
|
model,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(extractedText);
|
||||||
|
```
|
||||||
|
|
||||||
### Advanced Usage
|
### Advanced Usage
|
||||||
|
|
||||||
The `SmartOcr` class maintains an internal `smartshell` instance to interface with the `ocrmypdf` command. This setup is abstracted away, ensuring you don't need to manage or understand the underlying shell commands to use OCR functionality in your application.
|
The `SmartOcr` class maintains an internal `smartshell` instance to interface with the `ocrmypdf` command. This setup is abstracted away, ensuring you don't need to manage or understand the underlying shell commands to use OCR functionality in your application.
|
||||||
|
|||||||
+45
-3
@@ -1,5 +1,6 @@
|
|||||||
import { expect, tap } from '@push.rocks/tapbundle';
|
import { expect, tap } from '@push.rocks/tapbundle';
|
||||||
import * as smartocr from '../ts/index.js';
|
import * as smartocr from '../ts/index.js';
|
||||||
|
import type { ISmartAiOcrEngine } from '@push.rocks/smartai/ocr';
|
||||||
|
|
||||||
let testOcrInstance: smartocr.SmartOcr;
|
let testOcrInstance: smartocr.SmartOcr;
|
||||||
|
|
||||||
@@ -8,11 +9,52 @@ tap.test('should create a valid instance of Smartocr', async () => {
|
|||||||
expect(testOcrInstance).toBeInstanceOf(smartocr.SmartOcr);
|
expect(testOcrInstance).toBeInstanceOf(smartocr.SmartOcr);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
tap.test('should recognize image buffers through SmartAI OCR', async () => {
|
||||||
|
const calls: unknown[] = [];
|
||||||
|
const smartAiOcrEngine: ISmartAiOcrEngine = {
|
||||||
|
recognizeImage: async (input) => {
|
||||||
|
calls.push(input);
|
||||||
|
return {
|
||||||
|
text: 'hello from smartai',
|
||||||
|
confidence: 0.92,
|
||||||
|
pages: [
|
||||||
|
{
|
||||||
|
index: 0,
|
||||||
|
text: 'hello from smartai',
|
||||||
|
confidence: 0.92,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
raw: {
|
||||||
|
pages: [
|
||||||
|
{
|
||||||
|
index: 0,
|
||||||
|
markdown: 'hello from smartai',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
model: 'mock-ocr',
|
||||||
|
},
|
||||||
|
};
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await testOcrInstance.recognizeImageBufferWithSmartAi(Buffer.from('image data'), {
|
||||||
|
mimeType: 'image/png',
|
||||||
|
smartAiOcrEngine,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.text).toEqual('hello from smartai');
|
||||||
|
expect(result.confidence).toEqual(0.92);
|
||||||
|
expect(calls[0]).toEqual({
|
||||||
|
dataBase64: Buffer.from('image data').toString('base64'),
|
||||||
|
mimeType: 'image/png',
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
tap.test('should ocr a pdfBuffer', async () => {
|
tap.test('should ocr a pdfBuffer', async () => {
|
||||||
const smartfile = await import('@push.rocks/smartfile');
|
const smartfile = await import('@push.rocks/smartfile');
|
||||||
const pdfBuffer = (await smartfile.SmartFile.fromFilePath('./test/demo_without_textlayer.pdf'))
|
const smartfileFactory = smartfile.SmartFileFactory.nodeFs();
|
||||||
.contentBuffer;
|
const pdfFile = await smartfileFactory.fromFilePath('./test/demo_without_textlayer.pdf');
|
||||||
const resultBuffer = await testOcrInstance.processPdfBuffer(pdfBuffer);
|
const resultBuffer = await testOcrInstance.processPdfBuffer(pdfFile.contentBuffer);
|
||||||
console.log(resultBuffer);
|
console.log(resultBuffer);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
export * from './smartocr.classes.smartocr.js';
|
export * from './smartocr.classes.smartocr.js';
|
||||||
|
export type * from './smartocr.interfaces.js';
|
||||||
|
|||||||
@@ -1,40 +1,102 @@
|
|||||||
import * as plugins from './smartocr.plugins.js';
|
import * as plugins from './smartocr.plugins.js';
|
||||||
import * as paths from './smartocr.paths.js';
|
import * as paths from './smartocr.paths.js';
|
||||||
|
import type {
|
||||||
|
ISmartOcrConstructorOptions,
|
||||||
|
ISmartOcrImageAiOptions,
|
||||||
|
ISmartOcrPdfAiOptions,
|
||||||
|
TSmartOcrAiResult,
|
||||||
|
} from './smartocr.interfaces.js';
|
||||||
|
|
||||||
|
const defaultSmartAiPdfOcrSystemMessage = 'You are a precise OCR engine. Extract text faithfully and do not summarize.';
|
||||||
|
const defaultSmartAiPdfOcrUserMessage = 'Extract all readable text from this PDF. Preserve page order, line breaks, table structure, and obvious headings. Return only the extracted text.';
|
||||||
|
|
||||||
export class SmartOcr {
|
export class SmartOcr {
|
||||||
// STATIC
|
// STATIC
|
||||||
public static async createAndInit() {
|
public static async createAndInit(optionsArg: ISmartOcrConstructorOptions = {}) {
|
||||||
const smartocrInstance = new SmartOcr();
|
const smartocrInstance = new SmartOcr(optionsArg);
|
||||||
await smartocrInstance.init();
|
await smartocrInstance.init();
|
||||||
return smartocrInstance;
|
return smartocrInstance;
|
||||||
}
|
}
|
||||||
|
|
||||||
// INSTANCE
|
// INSTANCE
|
||||||
|
|
||||||
|
private options: ISmartOcrConstructorOptions;
|
||||||
public readyDeferred = plugins.smartpromise.defer();
|
public readyDeferred = plugins.smartpromise.defer();
|
||||||
|
public smartfileFactory = plugins.smartfile.SmartFileFactory.nodeFs();
|
||||||
public smartshellInstance: plugins.smartshell.Smartshell;
|
public smartshellInstance: plugins.smartshell.Smartshell;
|
||||||
|
|
||||||
public async processPdfBuffer (pdfBufferArg: Buffer): Promise<Buffer> {
|
public async processPdfBuffer (pdfBufferArg: Buffer): Promise<Buffer> {
|
||||||
const uniqueString = plugins.smartunique.uni('doc_');
|
const uniqueString = plugins.smartunique.uni('doc_');
|
||||||
const originalPath = plugins.path.join(paths.noGitDir, `${uniqueString}.pdf`);
|
const originalPath = plugins.path.join(paths.noGitDir, `${uniqueString}.pdf`);
|
||||||
const processedPath = plugins.path.join(paths.noGitDir, `${uniqueString}_processed.pdf`);
|
const processedPath = plugins.path.join(paths.noGitDir, `${uniqueString}_processed.pdf`);
|
||||||
const originalSmartfile = await plugins.smartfile.SmartFile.fromBuffer(originalPath, pdfBufferArg);
|
const originalSmartfile = this.smartfileFactory.fromBuffer(originalPath, pdfBufferArg, paths.noGitDir);
|
||||||
await originalSmartfile.write();
|
await originalSmartfile.write();
|
||||||
await this.smartshellInstance.exec(`ocrmypdf --rotate-pages ${originalPath} ${processedPath}`);
|
await this.smartshellInstance.exec(`ocrmypdf --rotate-pages ${originalPath} ${processedPath}`);
|
||||||
const processedSmartfile = await plugins.smartfile.SmartFile.fromFilePath(processedPath);
|
const processedSmartfile = await this.smartfileFactory.fromFilePath(processedPath, paths.noGitDir);
|
||||||
await originalSmartfile.delete();
|
await originalSmartfile.delete();
|
||||||
await processedSmartfile.delete();
|
await processedSmartfile.delete();
|
||||||
return processedSmartfile.contentBuffer;
|
return processedSmartfile.contentBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
constructor() {
|
public async recognizeImageBufferWithSmartAi(
|
||||||
|
imageBufferArg: Buffer,
|
||||||
|
optionsArg: ISmartOcrImageAiOptions,
|
||||||
|
): Promise<TSmartOcrAiResult> {
|
||||||
|
const smartAiOcrEngine = this.getSmartAiOcrEngine(optionsArg);
|
||||||
|
return smartAiOcrEngine.recognizeImage(
|
||||||
|
{
|
||||||
|
dataBase64: imageBufferArg.toString('base64'),
|
||||||
|
mimeType: optionsArg.mimeType,
|
||||||
|
},
|
||||||
|
optionsArg.recognizeOptions,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public async extractTextFromPdfBufferWithSmartAi(
|
||||||
|
pdfBufferArg: Buffer,
|
||||||
|
optionsArg: ISmartOcrPdfAiOptions,
|
||||||
|
): Promise<string> {
|
||||||
|
return plugins.smartaiDocument.analyzeDocuments({
|
||||||
|
model: optionsArg.model,
|
||||||
|
systemMessage: optionsArg.systemMessage ?? defaultSmartAiPdfOcrSystemMessage,
|
||||||
|
userMessage: optionsArg.userMessage ?? defaultSmartAiPdfOcrUserMessage,
|
||||||
|
pdfDocuments: [pdfBufferArg],
|
||||||
|
messageHistory: optionsArg.messageHistory,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
constructor(optionsArg: ISmartOcrConstructorOptions = {}) {
|
||||||
|
this.options = optionsArg;
|
||||||
this.smartshellInstance = new plugins.smartshell.Smartshell({
|
this.smartshellInstance = new plugins.smartshell.Smartshell({
|
||||||
executor: 'bash'
|
executor: 'bash'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public async init() {
|
public async init() {
|
||||||
await plugins.smartfile.fs.ensureDir(paths.noGitDir);
|
await this.smartfileFactory.getSmartFs().directory(paths.noGitDir).recursive().create();
|
||||||
const result = await plugins.smartshell.which('ocrmypdf');
|
const result = await plugins.smartshell.which('ocrmypdf');
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
private getSmartAiOcrEngine(optionsArg: ISmartOcrImageAiOptions): plugins.smartaiOcr.ISmartAiOcrEngine {
|
||||||
|
if (optionsArg.smartAiOcrEngine) {
|
||||||
|
return optionsArg.smartAiOcrEngine;
|
||||||
|
}
|
||||||
|
if (this.options.smartAiOcrEngine) {
|
||||||
|
return this.options.smartAiOcrEngine;
|
||||||
|
}
|
||||||
|
|
||||||
|
const mistralOcrOptions = {
|
||||||
|
...this.options.mistralOcrOptions,
|
||||||
|
...optionsArg.mistralOcrOptions,
|
||||||
|
};
|
||||||
|
const apiKey = mistralOcrOptions.apiKey ?? process.env.MISTRAL_API_KEY;
|
||||||
|
if (!apiKey) {
|
||||||
|
throw new Error('SmartAI OCR requires smartAiOcrEngine, mistralOcrOptions.apiKey, or MISTRAL_API_KEY.');
|
||||||
|
}
|
||||||
|
|
||||||
|
return plugins.smartaiOcr.createMistralOcrEngine({
|
||||||
|
...mistralOcrOptions,
|
||||||
|
apiKey,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,29 @@
|
|||||||
|
import type { ModelMessage, LanguageModelV3 } from '@push.rocks/smartai';
|
||||||
|
import type {
|
||||||
|
ISmartAiMistralOcrOptions,
|
||||||
|
ISmartAiMistralOcrRecognizeOptions,
|
||||||
|
ISmartAiOcrEngine,
|
||||||
|
ISmartAiOcrResult,
|
||||||
|
TSmartAiOcrImageMimeType,
|
||||||
|
} from '@push.rocks/smartai/ocr';
|
||||||
|
|
||||||
|
export interface ISmartOcrConstructorOptions {
|
||||||
|
smartAiOcrEngine?: ISmartAiOcrEngine;
|
||||||
|
mistralOcrOptions?: ISmartAiMistralOcrOptions;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ISmartOcrImageAiOptions {
|
||||||
|
mimeType: TSmartAiOcrImageMimeType;
|
||||||
|
smartAiOcrEngine?: ISmartAiOcrEngine;
|
||||||
|
mistralOcrOptions?: ISmartAiMistralOcrOptions;
|
||||||
|
recognizeOptions?: ISmartAiMistralOcrRecognizeOptions;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ISmartOcrPdfAiOptions {
|
||||||
|
model: LanguageModelV3;
|
||||||
|
systemMessage?: string;
|
||||||
|
userMessage?: string;
|
||||||
|
messageHistory?: ModelMessage[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export type TSmartOcrAiResult<TRaw = unknown> = ISmartAiOcrResult<TRaw>;
|
||||||
@@ -7,6 +7,8 @@ export {
|
|||||||
|
|
||||||
// @pushrocks scope
|
// @pushrocks scope
|
||||||
import * as smartfile from '@push.rocks/smartfile';
|
import * as smartfile from '@push.rocks/smartfile';
|
||||||
|
import * as smartaiDocument from '@push.rocks/smartai/document';
|
||||||
|
import * as smartaiOcr from '@push.rocks/smartai/ocr';
|
||||||
import * as smartshell from '@push.rocks/smartshell';
|
import * as smartshell from '@push.rocks/smartshell';
|
||||||
import * as smartunique from '@push.rocks/smartunique';
|
import * as smartunique from '@push.rocks/smartunique';
|
||||||
import * as smartpath from '@push.rocks/smartpath';
|
import * as smartpath from '@push.rocks/smartpath';
|
||||||
@@ -14,6 +16,8 @@ import * as smartpromise from '@push.rocks/smartpromise';
|
|||||||
|
|
||||||
export {
|
export {
|
||||||
smartfile,
|
smartfile,
|
||||||
|
smartaiDocument,
|
||||||
|
smartaiOcr,
|
||||||
smartshell,
|
smartshell,
|
||||||
smartunique,
|
smartunique,
|
||||||
smartpath,
|
smartpath,
|
||||||
|
|||||||
Reference in New Issue
Block a user