diff --git a/package-lock.json b/package-lock.json index 9bb4924..b1a5d75 100644 --- a/package-lock.json +++ b/package-lock.json @@ -16,7 +16,8 @@ "@pushrocks/smartunique": "^3.0.3", "@types/express": "^4.17.13", "express": "^4.17.2", - "pdf-merger-js": "^3.2.1" + "pdf-merger-js": "^3.2.1", + "pdf2json": "^2.0.0" }, "devDependencies": { "@gitzone/tsbuild": "^2.1.28", @@ -10978,6 +10979,33 @@ "pdfjs": "^2.4.5" } }, + "node_modules/pdf2json": { + "version": "2.0.0", + "resolved": "https://verdaccio.lossless.one/pdf2json/-/pdf2json-2.0.0.tgz", + "integrity": "sha512-+FZy7GSvLOLc+zksg0SoMvXqIqcku5lBlEPuYJJkhMWB2x6yfthzEhhSbZc20UheClMPagH/+NXnMRbvQMQR1w==", + "bundleDependencies": [ + "@xmldom/xmldom" + ], + "license": "Apache-2.0", + "dependencies": { + "@xmldom/xmldom": "^0.7.5" + }, + "bin": { + "pdf2json": "bin/pdf2json" + }, + "engines": { + "node": ">=14.18.0", + "npm": ">=6.14.15" + } + }, + "node_modules/pdf2json/node_modules/@xmldom/xmldom": { + "version": "0.7.5", + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/pdfjs": { "version": "2.4.6", "resolved": "https://verdaccio.lossless.one/pdfjs/-/pdfjs-2.4.6.tgz", @@ -23633,6 +23661,20 @@ "pdfjs": "^2.4.5" } }, + "pdf2json": { + "version": "2.0.0", + "resolved": "https://verdaccio.lossless.one/pdf2json/-/pdf2json-2.0.0.tgz", + "integrity": "sha512-+FZy7GSvLOLc+zksg0SoMvXqIqcku5lBlEPuYJJkhMWB2x6yfthzEhhSbZc20UheClMPagH/+NXnMRbvQMQR1w==", + "requires": { + "@xmldom/xmldom": "^0.7.5" + }, + "dependencies": { + "@xmldom/xmldom": { + "version": "0.7.5", + "bundled": true + } + } + }, "pdfjs": { "version": "2.4.6", "resolved": "https://verdaccio.lossless.one/pdfjs/-/pdfjs-2.4.6.tgz", diff --git a/package.json b/package.json index c58d952..5bd0edf 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,8 @@ "@pushrocks/smartunique": "^3.0.3", "@types/express": "^4.17.13", "express": "^4.17.2", - "pdf-merger-js": "^3.2.1" + "pdf-merger-js": "^3.2.1", + "pdf2json": "^2.0.0" }, "files": [ "ts/**/*", diff --git a/test/test.ts b/test/test.ts index 34579f7..51ce055 100644 --- a/test/test.ts +++ b/test/test.ts @@ -16,6 +16,10 @@ tap.test('should create a pdf from html string', async () => { await testSmartPdf.getPdfResultForHtmlString('hi'); }); +tap.test('should create a pdf from html string', async () => { + await testSmartPdf.getPdfResultForHtmlString('hi'); +}); + tap.test('should create a pdf from website as A4', async () => { await testSmartPdf.getPdfResultForWebsite('https://www.wikipedia.org'); }); diff --git a/ts/interfaces/interface.pdfresult.ts b/ts/interfaces/interface.pdfresult.ts index 688cfbd..2470181 100644 --- a/ts/interfaces/interface.pdfresult.ts +++ b/ts/interfaces/interface.pdfresult.ts @@ -1,5 +1,8 @@ export interface IPdfResult { name: string; id: string; + metadata: { + textExtraction: string; + }; buffer: Buffer; } diff --git a/ts/smartpdf.classes.smartpdf.ts b/ts/smartpdf.classes.smartpdf.ts index 2c5df26..05c3839 100644 --- a/ts/smartpdf.classes.smartpdf.ts +++ b/ts/smartpdf.classes.smartpdf.ts @@ -70,8 +70,8 @@ export class SmartPdf { const page = await this.headlessBrowser.newPage(); await page.setViewport({ width: 794, - height: 1122 - }) + height: 1122, + }); const response = await page.goto(`http://localhost:3210/${pdfCandidate.pdfId}`, { waitUntil: 'networkidle2', }); @@ -96,6 +96,9 @@ export class SmartPdf { return { id: pdfCandidate.pdfId, name: `${pdfCandidate.pdfId}.js`, + metadata: { + textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer), + }, buffer: pdfBuffer, }; } @@ -123,11 +126,14 @@ export class SmartPdf { return { id: pdfId, name: `${pdfId}.js`, + metadata: { + textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer), + }, buffer: pdfBuffer, }; } - async getFullWebsiteAsSinglePdf(websiteUrl: string) { + async getFullWebsiteAsSinglePdf(websiteUrl: string): Promise { const page = await this.headlessBrowser.newPage(); page.emulateMediaType('screen'); const response = await page.goto(websiteUrl, { waitUntil: 'networkidle2' }); @@ -150,6 +156,9 @@ export class SmartPdf { return { id: pdfId, name: `${pdfId}.js`, + metadata: { + textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer), + }, buffer: pdfBuffer, }; } @@ -161,4 +170,23 @@ export class SmartPdf { } return merger.saveAsBuffer(); } + + public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise { + const deferred = plugins.smartpromise.defer(); + const pdfParser: any = new plugins.pdf2json(); + pdfParser.on('pdfParser_dataReady', (pdfData: any) => { + let finalText = '' + for (const page of pdfData.Pages) { + for(const text of page.Texts) { + for (const letter of text.R) { + finalText = finalText + letter.T; + } + + }; + } + deferred.resolve(finalText); + }); + pdfParser.parseBuffer(pdfBufferArg); + return deferred.promise; + } } diff --git a/ts/smartpdf.plugins.ts b/ts/smartpdf.plugins.ts index fa2e990..9034630 100644 --- a/ts/smartpdf.plugins.ts +++ b/ts/smartpdf.plugins.ts @@ -15,6 +15,8 @@ export { smartfile, smartpromise, smartpuppeteer, smartunique, smartnetwork }; // thirdparty import pdfMerger from 'pdf-merger-js'; +// @ts-ignore +import pdf2json from 'pdf2json'; import express from 'express'; -export { pdfMerger, express }; +export { pdfMerger, pdf2json, express };