fix(core): update

This commit is contained in:
Philipp Kunz 2022-01-05 17:20:28 +01:00
parent ff890fb2af
commit 567c6eafea
6 changed files with 86 additions and 6 deletions

44
package-lock.json generated
View File

@ -16,7 +16,8 @@
"@pushrocks/smartunique": "^3.0.3", "@pushrocks/smartunique": "^3.0.3",
"@types/express": "^4.17.13", "@types/express": "^4.17.13",
"express": "^4.17.2", "express": "^4.17.2",
"pdf-merger-js": "^3.2.1" "pdf-merger-js": "^3.2.1",
"pdf2json": "^2.0.0"
}, },
"devDependencies": { "devDependencies": {
"@gitzone/tsbuild": "^2.1.28", "@gitzone/tsbuild": "^2.1.28",
@ -10978,6 +10979,33 @@
"pdfjs": "^2.4.5" "pdfjs": "^2.4.5"
} }
}, },
"node_modules/pdf2json": {
"version": "2.0.0",
"resolved": "https://verdaccio.lossless.one/pdf2json/-/pdf2json-2.0.0.tgz",
"integrity": "sha512-+FZy7GSvLOLc+zksg0SoMvXqIqcku5lBlEPuYJJkhMWB2x6yfthzEhhSbZc20UheClMPagH/+NXnMRbvQMQR1w==",
"bundleDependencies": [
"@xmldom/xmldom"
],
"license": "Apache-2.0",
"dependencies": {
"@xmldom/xmldom": "^0.7.5"
},
"bin": {
"pdf2json": "bin/pdf2json"
},
"engines": {
"node": ">=14.18.0",
"npm": ">=6.14.15"
}
},
"node_modules/pdf2json/node_modules/@xmldom/xmldom": {
"version": "0.7.5",
"inBundle": true,
"license": "MIT",
"engines": {
"node": ">=10.0.0"
}
},
"node_modules/pdfjs": { "node_modules/pdfjs": {
"version": "2.4.6", "version": "2.4.6",
"resolved": "https://verdaccio.lossless.one/pdfjs/-/pdfjs-2.4.6.tgz", "resolved": "https://verdaccio.lossless.one/pdfjs/-/pdfjs-2.4.6.tgz",
@ -23633,6 +23661,20 @@
"pdfjs": "^2.4.5" "pdfjs": "^2.4.5"
} }
}, },
"pdf2json": {
"version": "2.0.0",
"resolved": "https://verdaccio.lossless.one/pdf2json/-/pdf2json-2.0.0.tgz",
"integrity": "sha512-+FZy7GSvLOLc+zksg0SoMvXqIqcku5lBlEPuYJJkhMWB2x6yfthzEhhSbZc20UheClMPagH/+NXnMRbvQMQR1w==",
"requires": {
"@xmldom/xmldom": "^0.7.5"
},
"dependencies": {
"@xmldom/xmldom": {
"version": "0.7.5",
"bundled": true
}
}
},
"pdfjs": { "pdfjs": {
"version": "2.4.6", "version": "2.4.6",
"resolved": "https://verdaccio.lossless.one/pdfjs/-/pdfjs-2.4.6.tgz", "resolved": "https://verdaccio.lossless.one/pdfjs/-/pdfjs-2.4.6.tgz",

View File

@ -28,7 +28,8 @@
"@pushrocks/smartunique": "^3.0.3", "@pushrocks/smartunique": "^3.0.3",
"@types/express": "^4.17.13", "@types/express": "^4.17.13",
"express": "^4.17.2", "express": "^4.17.2",
"pdf-merger-js": "^3.2.1" "pdf-merger-js": "^3.2.1",
"pdf2json": "^2.0.0"
}, },
"files": [ "files": [
"ts/**/*", "ts/**/*",

View File

@ -16,6 +16,10 @@ tap.test('should create a pdf from html string', async () => {
await testSmartPdf.getPdfResultForHtmlString('hi'); await testSmartPdf.getPdfResultForHtmlString('hi');
}); });
// Exercises the text extraction that is attached to every PDF result as
// `metadata.textExtraction`. Uses a distinct test name so it is not reported
// as a duplicate of the plain "create a pdf from html string" test.
tap.test('should extract text for a pdf created from html string', async () => {
  const pdfResult = await testSmartPdf.getPdfResultForHtmlString('hi');
  // Only the type is asserted: the exact extracted text depends on how the
  // headless browser lays out the page.
  if (typeof pdfResult.metadata.textExtraction !== 'string') {
    throw new Error('expected pdfResult.metadata.textExtraction to be a string');
  }
});
tap.test('should create a pdf from website as A4', async () => { tap.test('should create a pdf from website as A4', async () => {
await testSmartPdf.getPdfResultForWebsite('https://www.wikipedia.org'); await testSmartPdf.getPdfResultForWebsite('https://www.wikipedia.org');
}); });

View File

@ -1,5 +1,8 @@
export interface IPdfResult { export interface IPdfResult {
name: string; name: string;
id: string; id: string;
metadata: {
textExtraction: string;
};
buffer: Buffer; buffer: Buffer;
} }

View File

@ -70,8 +70,8 @@ export class SmartPdf {
const page = await this.headlessBrowser.newPage(); const page = await this.headlessBrowser.newPage();
await page.setViewport({ await page.setViewport({
width: 794, width: 794,
height: 1122 height: 1122,
}) });
const response = await page.goto(`http://localhost:3210/${pdfCandidate.pdfId}`, { const response = await page.goto(`http://localhost:3210/${pdfCandidate.pdfId}`, {
waitUntil: 'networkidle2', waitUntil: 'networkidle2',
}); });
@ -96,6 +96,9 @@ export class SmartPdf {
return { return {
id: pdfCandidate.pdfId, id: pdfCandidate.pdfId,
name: `${pdfCandidate.pdfId}.js`, name: `${pdfCandidate.pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
},
buffer: pdfBuffer, buffer: pdfBuffer,
}; };
} }
@ -123,11 +126,14 @@ export class SmartPdf {
return { return {
id: pdfId, id: pdfId,
name: `${pdfId}.js`, name: `${pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
},
buffer: pdfBuffer, buffer: pdfBuffer,
}; };
} }
async getFullWebsiteAsSinglePdf(websiteUrl: string) { async getFullWebsiteAsSinglePdf(websiteUrl: string): Promise<interfaces.IPdfResult> {
const page = await this.headlessBrowser.newPage(); const page = await this.headlessBrowser.newPage();
page.emulateMediaType('screen'); page.emulateMediaType('screen');
const response = await page.goto(websiteUrl, { waitUntil: 'networkidle2' }); const response = await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
@ -150,6 +156,9 @@ export class SmartPdf {
return { return {
id: pdfId, id: pdfId,
name: `${pdfId}.js`, name: `${pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
},
buffer: pdfBuffer, buffer: pdfBuffer,
}; };
} }
@ -161,4 +170,23 @@ export class SmartPdf {
} }
return merger.saveAsBuffer(); return merger.saveAsBuffer();
} }
/**
 * Extracts the text content of a PDF by parsing it with pdf2json and
 * concatenating every text run of every page, in document order, without
 * inserting separators.
 *
 * @param pdfBufferArg the raw PDF file contents
 * @returns the concatenated, URI-decoded text of all pages
 * @throws rejects when pdf2json reports a parse error or the parsed data
 *         does not have the expected Pages -> Texts -> R structure
 */
public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
  const deferred = plugins.smartpromise.defer<string>();
  const pdfParser: any = new plugins.pdf2json();

  // pdf2json emits text runs URI-encoded; fall back to the raw value when a
  // run is not valid percent-encoding so one odd run cannot fail the whole
  // extraction.
  const decodeTextRun = (encodedText: string): string => {
    try {
      return decodeURIComponent(encodedText);
    } catch (err) {
      return encodedText;
    }
  };

  // Without this handler a malformed PDF would leave the promise pending forever.
  pdfParser.on('pdfParser_dataError', (errData: any) => {
    deferred.reject(errData && errData.parserError ? errData.parserError : errData);
  });

  pdfParser.on('pdfParser_dataReady', (pdfData: any) => {
    // An exception thrown inside an event handler would not reach the caller,
    // so convert it into a rejection.
    try {
      let finalText = '';
      for (const page of pdfData.Pages) {
        for (const text of page.Texts) {
          for (const letter of text.R) {
            finalText = finalText + decodeTextRun(letter.T);
          }
        }
      }
      deferred.resolve(finalText);
    } catch (err) {
      deferred.reject(err);
    }
  });

  pdfParser.parseBuffer(pdfBufferArg);
  return deferred.promise;
}
} }

View File

@ -15,6 +15,8 @@ export { smartfile, smartpromise, smartpuppeteer, smartunique, smartnetwork };
// thirdparty // thirdparty
import pdfMerger from 'pdf-merger-js'; import pdfMerger from 'pdf-merger-js';
// @ts-ignore
import pdf2json from 'pdf2json';
import express from 'express'; import express from 'express';
export { pdfMerger, express }; export { pdfMerger, pdf2json, express };