smartpdf/ts/smartpdf.classes.smartpdf.ts

203 lines
6.1 KiB
TypeScript
Raw Normal View History

2022-03-24 13:32:49 +00:00
import * as plugins from './smartpdf.plugins.js';
import * as paths from './smartpdf.paths.js';
2018-10-06 13:25:45 +00:00
import { Server } from 'http';
2022-03-24 13:32:49 +00:00
import { PdfCandidate } from './smartpdf.classes.pdfcandidate.js';
2018-10-06 13:25:45 +00:00
2021-10-14 08:59:45 +00:00
declare const document: any;
2019-05-28 22:27:43 +00:00
2022-03-24 13:32:49 +00:00
import * as interfaces from './interfaces/index.js';
2019-05-29 17:49:23 +00:00
2018-10-06 13:25:45 +00:00
/**
 * Renders HTML strings and live websites to PDF through a headless browser.
 *
 * HTML strings are served to the browser by a small local express server, so
 * `start()` must be called before rendering and `stop()` afterwards.
 */
export class SmartPdf {
  /** local http server that hands registered html strings to the browser */
  htmlServerInstance: Server;
  /** port the local html server listens on; defaults to 3210 when unset at start() */
  serverPort: number;
  /** browser used for rendering; either supplied to start() or launched by it */
  headlessBrowser: plugins.smartpuppeteer.puppeteer.Browser;
  /** true when the browser was supplied by the caller; stop() then leaves it open */
  externalBrowserBool: boolean = false;
  /** resolved once the local html server is listening */
  private _readyDeferred: plugins.smartpromise.Deferred<void>;
  /** html candidates currently being rendered, keyed by their pdfId */
  private _candidates: { [key: string]: PdfCandidate } = {};

  constructor() {
    this._readyDeferred = new plugins.smartpromise.Deferred();
  }

  /**
   * Prepares the browser and starts the local html server.
   *
   * @param headlessBrowserArg optional browser to reuse; when omitted a new one is launched
   * @returns resolves once the server is listening; rejects if the server cannot listen
   */
  async start(headlessBrowserArg?: plugins.smartpuppeteer.puppeteer.Browser) {
    // lets set the external browser in case one is provided
    this.headlessBrowser = headlessBrowserArg;

    // setup puppeteer
    if (this.headlessBrowser) {
      this.externalBrowserBool = true;
    } else {
      this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
        forceNoSandbox: true,
      });
    }

    // setup server
    const app = plugins.express();
    app.get('/:pdfId', (req, res) => {
      const candidate = this._candidates[req.params.pdfId];
      if (!candidate) {
        // unknown or already finished id: answer cleanly instead of throwing in the handler
        res.status(404).send('Unknown pdf id');
        return;
      }
      res.setHeader('PDF-ID', candidate.pdfId);
      res.send(candidate.htmlString);
    });
    this.htmlServerInstance = plugins.http.createServer(app);
    this.serverPort = this.serverPort ?? 3210;

    // Wait for the real listen() outcome rather than probing the port up front:
    // a listen failure now rejects start() instead of surfacing as an unhandled
    // 'error' event on the server.
    const listening = plugins.smartpromise.defer<void>();
    const onListenError = (err: Error) => listening.reject(err);
    this.htmlServerInstance.once('error', onListenError);
    this.htmlServerInstance.once('listening', () => {
      this.htmlServerInstance.removeListener('error', onListenError);
      this._readyDeferred.resolve();
      listening.resolve();
    });
    this.htmlServerInstance.listen(this.serverPort, 'localhost');
    await listening.promise;
  }

  /**
   * Stops the local html server and closes the browser, unless the browser
   * was supplied by the caller.
   */
  async stop() {
    const serverClosed = plugins.smartpromise.defer<void>();
    this.htmlServerInstance.close(() => {
      serverClosed.resolve();
    });

    if (!this.externalBrowserBool) {
      await this.headlessBrowser.close();
    }
    await serverClosed.promise;
  }

  /**
   * returns a pdf for a given html string;
   *
   * The html is registered under a fresh id, loaded from the local server and
   * printed at 794x1122. The response must echo the id in its `PDF-ID` header;
   * otherwise nothing is rendered and the method resolves with `undefined`.
   *
   * @param htmlStringArg the html document to render
   */
  async getA4PdfResultForHtmlString(htmlStringArg: string): Promise<interfaces.IPdfResult> {
    await this._readyDeferred.promise;
    const pdfCandidate = new PdfCandidate(htmlStringArg);
    this._candidates[pdfCandidate.pdfId] = pdfCandidate;

    let pdfBuffer: Buffer;
    try {
      pdfBuffer = await this._withPage(async (page) => {
        await page.setViewport({
          width: 794,
          height: 1122,
        });
        const response = await page.goto(
          `http://localhost:${this.serverPort}/${pdfCandidate.pdfId}`,
          {
            waitUntil: 'networkidle2',
          }
        );
        const headers = response.headers();
        if (headers['pdf-id'] !== pdfCandidate.pdfId) {
          console.log('Error! Headers do not match. For security reasons no pdf is being emitted!');
          return undefined;
        }
        console.log(`id security check passed for ${pdfCandidate.pdfId}`);

        return page.pdf({
          width: 794,
          height: 1122,
          printBackground: true,
          displayHeaderFooter: false,
        });
      });
    } finally {
      // always unregister, also on mismatch or error, so candidates do not pile up
      delete this._candidates[pdfCandidate.pdfId];
    }

    if (!pdfBuffer) {
      return undefined;
    }
    pdfCandidate.doneDeferred.resolve();
    await pdfCandidate.doneDeferred.promise;
    return this._buildPdfResult(pdfCandidate.pdfId, pdfBuffer);
  }

  /**
   * Loads a website with screen media emulation and prints it to pdf.
   *
   * @param websiteUrl the url to load
   */
  async getPdfResultForWebsite(websiteUrl: string): Promise<interfaces.IPdfResult> {
    const pdfId = plugins.smartunique.shortId();
    const pdfBuffer = await this._withPage(async (page) => {
      // NOTE(review): 1980 may be a typo for 1920 (used by getFullWebsiteAsSinglePdf) - confirm
      await page.setViewport({
        width: 1980,
        height: 1200,
      });
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      // NOTE(review): document.height / document.width are non-standard and presumably
      // undefined in current browsers, in which case the pdf size falls back to whatever
      // page.pdf() defaults to - confirm whether scrollHeight/clientWidth were intended.
      const { documentHeight, documentWidth } = await page.evaluate(() => {
        return {
          documentHeight: document.height,
          documentWidth: document.width,
        };
      });
      return page.pdf({
        height: documentHeight,
        width: documentWidth,
        printBackground: true,
        displayHeaderFooter: false,
      });
    });
    return this._buildPdfResult(pdfId, pdfBuffer);
  }

  /**
   * Loads a website and prints it as one page that is as tall as the
   * document body's scroll height and 1920 wide.
   *
   * @param websiteUrl the url to load
   */
  async getFullWebsiteAsSinglePdf(websiteUrl: string): Promise<interfaces.IPdfResult> {
    const pdfId = plugins.smartunique.shortId();
    const pdfBuffer = await this._withPage(async (page) => {
      await page.setViewport({
        width: 1920,
        height: 1200,
      });
      // awaited so the media type is applied before navigation starts
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      const documentHeight: number = await page.evaluate(() => {
        return document.body.scrollHeight;
      });
      // grow the viewport to the full document height before printing
      await page.setViewport({
        width: 1920,
        height: documentHeight,
      });
      return page.pdf({
        height: documentHeight,
        width: 1920,
        printBackground: true,
        displayHeaderFooter: false,
        scale: 1,
        pageRanges: '1',
      });
    });
    return this._buildPdfResult(pdfId, pdfBuffer);
  }

  /**
   * Concatenates several pdf buffers into one pdf, in the given order.
   *
   * @param pdfBuffers the pdfs to merge
   * @returns the merged pdf
   */
  public async mergePdfBuffers(pdfBuffers: Buffer[]): Promise<Buffer> {
    const merger = new plugins.pdfMerger();
    for (const buffer of pdfBuffers) {
      // awaiting is a no-op if add() is synchronous and required if it returns a promise
      await merger.add(buffer);
    }
    return merger.saveAsBuffer();
  }

  /**
   * Extracts the text runs of a pdf and concatenates them into one string.
   *
   * @param pdfBufferArg the pdf to read
   * @returns the concatenated text of all pages
   * @throws rejects when the parser reports a data error
   */
  public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
    const deferred = plugins.smartpromise.defer<string>();
    const pdfParser: any = new plugins.pdf2json();
    pdfParser.on('pdfParser_dataReady', (pdfData: any) => {
      const textParts: string[] = [];
      for (const page of pdfData.Pages) {
        for (const text of page.Texts) {
          for (const letter of text.R) {
            // NOTE(review): T is appended as delivered by the parser; it may be
            // URI-encoded - confirm whether callers expect decoded text.
            textParts.push(letter.T);
          }
        }
      }
      deferred.resolve(textParts.join(''));
    });
    // without this handler a broken pdf would leave the returned promise pending forever
    pdfParser.on('pdfParser_dataError', (errData: any) => {
      const detail = errData && errData.parserError ? errData.parserError : errData;
      deferred.reject(detail instanceof Error ? detail : new Error(`pdf parsing failed: ${detail}`));
    });
    pdfParser.parseBuffer(pdfBufferArg);
    return deferred.promise;
  }

  /**
   * Opens a fresh browser page, runs the given work on it and always closes
   * the page again, whether the work succeeds, returns early or throws.
   */
  private async _withPage<T>(
    workFunc: (page: plugins.smartpuppeteer.puppeteer.Page) => Promise<T>
  ): Promise<T> {
    const page = await this.headlessBrowser.newPage();
    try {
      return await workFunc(page);
    } finally {
      await page.close();
    }
  }

  /**
   * Assembles the result object for a rendered pdf, including its extracted text.
   */
  private async _buildPdfResult(
    pdfIdArg: string,
    pdfBufferArg: Buffer
  ): Promise<interfaces.IPdfResult> {
    return {
      id: pdfIdArg,
      // NOTE(review): the `.js` suffix is kept as found; `.pdf` may have been intended - confirm
      name: `${pdfIdArg}.js`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(pdfBufferArg),
      },
      buffer: pdfBufferArg,
    };
  }
}