2022-03-24 13:32:49 +00:00
|
|
|
import * as plugins from './smartpdf.plugins.js';
|
|
|
|
import * as paths from './smartpdf.paths.js';
|
2018-10-06 13:25:45 +00:00
|
|
|
import { Server } from 'http';
|
2022-03-24 13:32:49 +00:00
|
|
|
import { PdfCandidate } from './smartpdf.classes.pdfcandidate.js';
|
2023-07-26 12:17:11 +00:00
|
|
|
import { type IPdf } from '@tsclass/tsclass/dist_ts/business/pdf.js';
|
2018-10-06 13:25:45 +00:00
|
|
|
|
2021-10-14 08:59:45 +00:00
|
|
|
// Ambient declaration of the browser-side `document` global. It is only
// referenced inside callbacks handed to `page.evaluate(...)` further down in this file.
declare const document: any;
|
2019-05-28 22:27:43 +00:00
|
|
|
|
2018-10-06 13:25:45 +00:00
|
|
|
/**
 * Renders HTML strings and websites to PDF through a headless browser and
 * offers helpers to merge PDFs, read them from disk, extract their text and
 * rasterize them to PNG.
 *
 * Lifecycle: `start()` must have completed before any of the rendering methods
 * is used; `stop()` shuts the local html server (and an owned browser) down.
 */
export class SmartPdf {
  // STATIC
  /**
   * Creates a new, not yet started, SmartPdf instance.
   */
  public static async create() {
    const smartpdfInstance = new SmartPdf();
    return smartpdfInstance;
  }

  // INSTANCE
  /** http server that serves the registered html candidates to the headless browser */
  htmlServerInstance: Server;
  /** port the html server listens on; assigned by `start()` */
  serverPort: number;
  /** browser used for rendering; either handed to `start()` or created there */
  headlessBrowser: plugins.smartpuppeteer.puppeteer.Browser;
  /** true when the browser was supplied by the caller and must not be closed by `stop()` */
  externalBrowserBool: boolean = false;
  /** resolved once the html server is listening */
  private _readyDeferred: plugins.smartpromise.Deferred<void>;
  /** html candidates waiting to be rendered, keyed by their pdfId */
  private _candidates: { [key: string]: PdfCandidate } = {};

  constructor() {
    this._readyDeferred = new plugins.smartpromise.Deferred();
  }

  /**
   * Prepares the headless browser and starts the local html server.
   *
   * @param headlessBrowserArg optional browser to reuse; when omitted a browser
   *   is created here and later closed by `stop()`.
   * @returns a promise that resolves once the server is listening and rejects
   *   when the server fails to start listening (e.g. the port is taken).
   */
  async start(headlessBrowserArg?: plugins.smartpuppeteer.puppeteer.Browser) {
    const done = plugins.smartpromise.defer<void>();

    // lets set the external browser in case one is provided
    this.headlessBrowser = headlessBrowserArg;
    // derive the flag from the argument so that a restart without an external
    // browser does not keep a stale `true` from a previous run
    this.externalBrowserBool = Boolean(headlessBrowserArg);

    // setup puppeteer
    if (!this.headlessBrowser) {
      this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
        forceNoSandbox: true,
      });
    }

    // setup server
    const app = plugins.express();
    app.get('/:pdfId', (req, res) => {
      const requestedId = req.params.pdfId;
      // own-property check so that ids like "constructor" do not resolve to
      // members inherited from Object.prototype
      const wantedCandidate = Object.prototype.hasOwnProperty.call(this._candidates, requestedId)
        ? this._candidates[requestedId]
        : undefined;
      if (!wantedCandidate) {
        console.log(`${req.url} not attached to a candidate`);
        // answer the request instead of leaving the connection hanging
        res.status(404).send('not attached to a candidate');
        return;
      }
      res.setHeader('pdf-id', wantedCandidate.pdfId);
      res.send(wantedCandidate.htmlString);
    });
    this.htmlServerInstance = plugins.http.createServer(app);
    this.serverPort = 3210;

    // a failing listen() is reported through the 'error' event; without a
    // listener the start promise would never settle
    const onStartupError = (error: Error) => {
      done.reject(error);
    };
    this.htmlServerInstance.once('error', onStartupError);
    this.htmlServerInstance.once('listening', () => {
      this.htmlServerInstance.off('error', onStartupError);
      this._readyDeferred.resolve();
      done.resolve();
    });
    this.htmlServerInstance.listen(this.serverPort, 'localhost');
    await done.promise;
  }

  // stop
  /**
   * Closes the html server and, unless the browser was supplied externally,
   * the headless browser. Safe to call when `start()` was never run.
   */
  async stop() {
    const done = plugins.smartpromise.defer<void>();
    if (this.htmlServerInstance) {
      this.htmlServerInstance.close(() => {
        done.resolve();
      });
    } else {
      done.resolve();
    }

    if (!this.externalBrowserBool && this.headlessBrowser) {
      await this.headlessBrowser.close();
    }

    await done.promise;
  }

  /**
   * Opens a fresh browser page, runs the given task with it and always closes
   * the page afterwards, also when the task throws.
   */
  private async usePage<T>(
    pageTask: (page: plugins.smartpuppeteer.puppeteer.Page) => Promise<T>
  ): Promise<T> {
    const page = await this.headlessBrowser.newPage();
    try {
      return await pageTask(page);
    } finally {
      await page.close();
    }
  }

  /**
   * returns a pdf for a given html string;
   *
   * The html is registered as a candidate, served by the local html server,
   * loaded in a 794x1122 viewport and printed with the same dimensions.
   *
   * @param htmlStringArg the html document to render
   * @returns the pdf object; resolves with `undefined` (after logging) when the
   *   `pdf-id` header of the served page does not match the candidate.
   */
  async getA4PdfResultForHtmlString(htmlStringArg: string): Promise<plugins.tsclass.business.IPdf> {
    await this._readyDeferred.promise;
    const pdfCandidate = new PdfCandidate(htmlStringArg);
    this._candidates[pdfCandidate.pdfId] = pdfCandidate;

    let pdfBuffer;
    try {
      pdfBuffer = await this.usePage(async (page) => {
        await page.setViewport({
          width: 794,
          height: 1122,
        });
        const response = await page.goto(
          `http://localhost:${this.serverPort}/${pdfCandidate.pdfId}`,
          {
            waitUntil: 'networkidle2',
          }
        );
        // a missing response is treated like a failed id check
        const servedPdfId = response ? response.headers()['pdf-id'] : undefined;
        if (servedPdfId !== pdfCandidate.pdfId) {
          console.log('Error! Headers do not match. For security reasons no pdf is being emitted!');
          return null;
        }
        console.log(`id security check passed for ${pdfCandidate.pdfId}`);

        return page.pdf({
          width: 794,
          height: 1122,
          printBackground: true,
          displayHeaderFooter: false,
        });
      });
    } finally {
      // the candidate must not stay registered, whatever the outcome
      delete this._candidates[pdfCandidate.pdfId];
    }

    if (!pdfBuffer) {
      return;
    }

    pdfCandidate.doneDeferred.resolve();
    await pdfCandidate.doneDeferred.promise;
    return {
      id: pdfCandidate.pdfId,
      name: `${pdfCandidate.pdfId}.pdf`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
      },
      buffer: pdfBuffer,
    };
  }

  /**
   * Loads the given url with screen media emulation and prints it to a pdf.
   *
   * @param websiteUrl the url to render
   * @returns the pdf object including the extracted text
   */
  async getPdfResultForWebsite(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
    const pdfId = plugins.smartunique.shortId();
    const pdfBuffer = await this.usePage(async (page) => {
      // NOTE(review): 1980 differs from the 1920 used in getFullWebsiteAsSinglePdf - confirm intended
      await page.setViewport({
        width: 1980,
        height: 1200,
      });
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      // NOTE(review): document.height / document.width are non-standard properties;
      // where the browser does not provide them both values are undefined and
      // page.pdf() is called without explicit dimensions - confirm this is intended
      const { documentHeight, documentWidth } = await page.evaluate(() => {
        return {
          documentHeight: document.height,
          documentWidth: document.width,
        };
      });
      return page.pdf({
        height: documentHeight,
        width: documentWidth,
        printBackground: true,
        displayHeaderFooter: false,
      });
    });

    return {
      id: pdfId,
      name: `${pdfId}.pdf`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
      },
      buffer: pdfBuffer,
    };
  }

  /**
   * Loads the given url with screen media emulation, resizes the viewport to
   * the scroll height of the document body and prints the first page of a
   * pdf whose page size is 1920 x that height.
   *
   * @param websiteUrl the url to render
   * @returns the pdf object including the extracted text
   */
  async getFullWebsiteAsSinglePdf(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
    const pdfId = plugins.smartunique.shortId();
    const pdfBuffer = await this.usePage(async (page) => {
      await page.setViewport({
        width: 1920,
        height: 1200,
      });
      // awaited so that the emulation is in place before navigating
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      const documentHeight = await page.evaluate(() => document.body.scrollHeight);
      await page.setViewport({
        width: 1920,
        height: documentHeight,
      });
      return page.pdf({
        height: documentHeight,
        width: 1920,
        printBackground: true,
        displayHeaderFooter: false,
        scale: 1,
        pageRanges: '1',
      });
    });

    return {
      id: pdfId,
      name: `${pdfId}.pdf`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
      },
      buffer: pdfBuffer,
    };
  }

  /**
   * Concatenates the pages of all given pdfs, in input order, into one pdf.
   *
   * @param inputPdfBuffers the raw bytes of the pdfs to merge
   * @returns the raw bytes of the merged pdf
   */
  public async mergePdfs(inputPdfBuffers: Uint8Array[]): Promise<Uint8Array> {
    const mergedPdf = await plugins.pdfLib.PDFDocument.create();
    for (const pdfBytes of inputPdfBuffers) {
      const sourcePdf = await plugins.pdfLib.PDFDocument.load(pdfBytes);
      const copiedPages = await mergedPdf.copyPages(sourcePdf, sourcePdf.getPageIndices());
      for (const copiedPage of copiedPages) {
        mergedPdf.addPage(copiedPage);
      }
    }
    return mergedPdf.save();
  }

  /**
   * Reads a file from disk into a pdf object. `id` and `metadata` are null,
   * `name` is the base name of the file.
   *
   * @param pathArg path of the file; made absolute before reading
   */
  public async readFileToPdfObject(pathArg: string): Promise<plugins.tsclass.business.IPdf> {
    const absolutePath = plugins.smartpath.transform.makeAbsolute(pathArg);
    const parsedPath = plugins.path.parse(absolutePath);
    const buffer = await plugins.smartfile.fs.toBuffer(absolutePath);
    return {
      name: parsedPath.base,
      buffer,
      id: null,
      metadata: null,
    };
  }

  /**
   * Parses the pdf with pdf2json and concatenates the `T` value of every text
   * run of every page, in document order.
   *
   * @param pdfBufferArg the raw pdf
   * @returns the concatenated text; rejects when the parser reports an error
   */
  public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
    const deferred = plugins.smartpromise.defer<string>();
    const pdfParser: any = new plugins.pdf2json();
    pdfParser.on('pdfParser_dataReady', (pdfData: any) => {
      const textParts: string[] = [];
      for (const page of pdfData.Pages) {
        for (const text of page.Texts) {
          for (const letter of text.R) {
            textParts.push(letter.T);
          }
        }
      }
      deferred.resolve(textParts.join(''));
    });
    // without this handler a parse failure would leave the promise pending forever
    pdfParser.on('pdfParser_dataError', (errorData: any) => {
      deferred.reject(errorData?.parserError ?? errorData);
    });
    pdfParser.parseBuffer(pdfBufferArg);
    return deferred.promise;
  }

  /**
   * Renders every page of the given pdf to a PNG image.
   *
   * @param pdfBytes the raw pdf
   * @param options output image `width` (default 1024), `height` (default 768)
   *   and `quality` (default 100), passed on to the converter
   * @returns one PNG buffer per page, in page order
   */
  public async convertPDFToPngBytes(
    pdfBytes: Uint8Array,
    options: {
      width?: number;
      height?: number;
      quality?: number;
    } = {}
  ): Promise<Buffer[]> {
    const { width = 1024, height = 768, quality = 100 } = options;

    // Load the PDF document; only needed here to learn the page count
    const pdfDoc = await plugins.pdfLib.PDFDocument.load(pdfBytes);

    const converter = plugins.pdf2pic.fromBuffer(Buffer.from(pdfBytes), {
      density: 100, // Image density (DPI)
      format: 'png', // Image format
      width, // Output image width
      height, // Output image height
      quality, // Output image quality
    });

    // Collect promises that resolve to PNG buffers
    const imagePromises: Promise<Buffer>[] = [];
    const numPages = pdfDoc.getPageCount();

    // the converter is called with 1-based page numbers
    for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
      imagePromises.push(
        converter(pageNumber, {
          responseType: 'buffer',
        }).then((output) => output.buffer)
      );
    }

    // Resolve all promises and return the array of buffers
    return Promise.all(imagePromises);
  }
}
|