|
|
|
@@ -1,40 +1,102 @@
|
|
|
|
|
import * as plugins from './smartocr.plugins.js';
|
|
|
|
|
import * as paths from './smartocr.paths.js';
|
|
|
|
|
import type {
|
|
|
|
|
ISmartOcrConstructorOptions,
|
|
|
|
|
ISmartOcrImageAiOptions,
|
|
|
|
|
ISmartOcrPdfAiOptions,
|
|
|
|
|
TSmartOcrAiResult,
|
|
|
|
|
} from './smartocr.interfaces.js';
|
|
|
|
|
|
|
|
|
|
/**
 * System message sent with PDF text extraction requests when the caller
 * does not supply `systemMessage` in the options.
 */
const defaultSmartAiPdfOcrSystemMessage = 'You are a precise OCR engine. Extract text faithfully and do not summarize.';

/**
 * User message sent with PDF text extraction requests when the caller
 * does not supply `userMessage` in the options.
 */
const defaultSmartAiPdfOcrUserMessage = 'Extract all readable text from this PDF. Preserve page order, line breaks, table structure, and obvious headings. Return only the extracted text.';
|
|
|
|
|
|
|
|
|
|
/**
 * OCR helper that offers two processing paths:
 *  - a local path that shells out to the `ocrmypdf` command line tool, and
 *  - an AI path that delegates to a SmartAI OCR engine / document analysis.
 */
export class SmartOcr {
  // STATIC

  /**
   * Constructs a SmartOcr instance and awaits its `init()` before returning it.
   *
   * @param optionsArg constructor options; defaults to an empty object
   * @returns the initialized instance
   */
  public static async createAndInit(
    optionsArg: ISmartOcrConstructorOptions = {},
  ): Promise<SmartOcr> {
    const smartocrInstance = new SmartOcr(optionsArg);
    await smartocrInstance.init();
    return smartocrInstance;
  }

  /**
   * Wraps a value in single quotes for use as one bash argument.
   * Embedded single quotes are closed, escaped and reopened (`'\''`).
   */
  private static quoteShellArg(valueArg: string): string {
    return `'${valueArg.replace(/'/g, `'\\''`)}'`;
  }

  // INSTANCE

  /** Options handed to the constructor; consulted when resolving the OCR engine. */
  private options: ISmartOcrConstructorOptions;

  // NOTE(review): nothing in this class resolves readyDeferred — confirm whether
  // init() is expected to resolve it before anyone awaits its promise.
  public readyDeferred = plugins.smartpromise.defer();

  /** Factory used to create and load the temporary files of `processPdfBuffer`. */
  public smartfileFactory = plugins.smartfile.SmartFileFactory.nodeFs();

  /** Shell used to run `ocrmypdf`; created with the bash executor in the constructor. */
  public smartshellInstance: plugins.smartshell.Smartshell;

  /**
   * Runs `ocrmypdf --rotate-pages` on a PDF and returns the processed PDF.
   *
   * The input is written to a uniquely named file inside `paths.noGitDir`,
   * the tool writes its output next to it, and both files are removed again.
   *
   * @param pdfBufferArg contents of the PDF to process
   * @returns contents of the PDF produced by `ocrmypdf`
   */
  public async processPdfBuffer(pdfBufferArg: Buffer): Promise<Buffer> {
    const uniqueString = plugins.smartunique.uni('doc_');
    const originalPath = plugins.path.join(paths.noGitDir, `${uniqueString}.pdf`);
    const processedPath = plugins.path.join(paths.noGitDir, `${uniqueString}_processed.pdf`);

    const originalSmartfile = this.smartfileFactory.fromBuffer(
      originalPath,
      pdfBufferArg,
      paths.noGitDir,
    );
    await originalSmartfile.write();

    try {
      // Quote both paths so a directory containing spaces or shell
      // metacharacters cannot split or alter the command.
      const quotedOriginalPath = SmartOcr.quoteShellArg(originalPath);
      const quotedProcessedPath = SmartOcr.quoteShellArg(processedPath);
      await this.smartshellInstance.exec(
        `ocrmypdf --rotate-pages ${quotedOriginalPath} ${quotedProcessedPath}`,
      );

      const processedSmartfile = await this.smartfileFactory.fromFilePath(
        processedPath,
        paths.noGitDir,
      );
      // The content is already held in memory, so the file on disk can go.
      await processedSmartfile.delete();
      return processedSmartfile.contentBuffer;
    } finally {
      // Remove the temporary input even when ocrmypdf or the read-back fails.
      await originalSmartfile.delete();
    }
  }

  /**
   * Recognizes an image through a SmartAI OCR engine.
   *
   * The engine is chosen by `getSmartAiOcrEngine`; the image is passed to it
   * base64 encoded together with `optionsArg.mimeType`.
   *
   * @param imageBufferArg raw image bytes
   * @param optionsArg mime type, recognize options and optional engine settings
   * @returns whatever the engine's `recognizeImage` call returns
   */
  public async recognizeImageBufferWithSmartAi(
    imageBufferArg: Buffer,
    optionsArg: ISmartOcrImageAiOptions,
  ): Promise<TSmartOcrAiResult> {
    const smartAiOcrEngine = this.getSmartAiOcrEngine(optionsArg);
    return smartAiOcrEngine.recognizeImage(
      {
        dataBase64: imageBufferArg.toString('base64'),
        mimeType: optionsArg.mimeType,
      },
      optionsArg.recognizeOptions,
    );
  }

  /**
   * Extracts text from a PDF by passing it to `smartaiDocument.analyzeDocuments`.
   *
   * The module-level default system and user messages are used whenever the
   * corresponding option is `null` or `undefined`.
   *
   * @param pdfBufferArg contents of the PDF
   * @param optionsArg model, optional messages and optional message history
   * @returns the result of the document analysis call
   */
  public async extractTextFromPdfBufferWithSmartAi(
    pdfBufferArg: Buffer,
    optionsArg: ISmartOcrPdfAiOptions,
  ): Promise<string> {
    return plugins.smartaiDocument.analyzeDocuments({
      model: optionsArg.model,
      systemMessage: optionsArg.systemMessage ?? defaultSmartAiPdfOcrSystemMessage,
      userMessage: optionsArg.userMessage ?? defaultSmartAiPdfOcrUserMessage,
      pdfDocuments: [pdfBufferArg],
      messageHistory: optionsArg.messageHistory,
    });
  }

  /**
   * Stores the options and creates the bash-backed shell instance.
   *
   * @param optionsArg constructor options; defaults to an empty object
   */
  constructor(optionsArg: ISmartOcrConstructorOptions = {}) {
    this.options = optionsArg;
    this.smartshellInstance = new plugins.smartshell.Smartshell({
      executor: 'bash',
    });
  }

  /**
   * Creates `paths.noGitDir` (recursively) and looks up the `ocrmypdf` binary.
   */
  public async init() {
    await this.smartfileFactory.getSmartFs().directory(paths.noGitDir).recursive().create();
    // The lookup result is not used here.
    // NOTE(review): presumably `which` rejects when the binary is missing — confirm.
    await plugins.smartshell.which('ocrmypdf');
  }

  /**
   * Resolves the OCR engine for an image recognition call.
   *
   * Order of precedence:
   *  1. `optionsArg.smartAiOcrEngine`
   *  2. the `smartAiOcrEngine` given to the constructor
   *  3. a Mistral engine built from the constructor's `mistralOcrOptions`
   *     overlaid with `optionsArg.mistralOcrOptions`; the API key falls back
   *     to the `MISTRAL_API_KEY` environment variable.
   *
   * @param optionsArg per-call image options
   * @returns the engine to use for this call
   * @throws Error when no engine is supplied and no API key can be found
   */
  private getSmartAiOcrEngine(
    optionsArg: ISmartOcrImageAiOptions,
  ): plugins.smartaiOcr.ISmartAiOcrEngine {
    if (optionsArg.smartAiOcrEngine) {
      return optionsArg.smartAiOcrEngine;
    }
    if (this.options.smartAiOcrEngine) {
      return this.options.smartAiOcrEngine;
    }

    // Per-call values are spread last so they override the instance values.
    const mistralOcrOptions = {
      ...this.options.mistralOcrOptions,
      ...optionsArg.mistralOcrOptions,
    };
    const apiKey = mistralOcrOptions.apiKey ?? process.env.MISTRAL_API_KEY;
    if (!apiKey) {
      throw new Error('SmartAI OCR requires smartAiOcrEngine, mistralOcrOptions.apiKey, or MISTRAL_API_KEY.');
    }

    return plugins.smartaiOcr.createMistralOcrEngine({
      ...mistralOcrOptions,
      apiKey,
    });
  }
}
|
|
|
|
|