352 lines
12 KiB
TypeScript
352 lines
12 KiB
TypeScript
import * as plugins from './smartpdf.plugins.js';
|
|
import * as paths from './smartpdf.paths.js';
|
|
import { Server } from 'http';
|
|
import { PdfCandidate } from './smartpdf.classes.pdfcandidate.js';
|
|
import { type IPdf } from '@tsclass/tsclass/dist_ts/business/pdf.js';
|
|
import { execFile } from 'child_process';
|
|
|
|
// Ambient declaration so the page.evaluate() callbacks further down in this file can
// reference `document`; those callbacks are handed to the headless browser page.
declare const document: any;
|
|
|
|
export class SmartPdf {
|
|
// STATIC
|
|
/**
 * Asynchronous factory for SmartPdf.
 *
 * Only constructs the object; the browser and the HTML server are set up by start().
 *
 * @returns a freshly constructed SmartPdf instance
 */
public static async create() {
  return new SmartPdf();
}
|
|
|
|
// INSTANCE
|
|
/** HTTP server that serves the registered HTML candidates to the headless browser. */
htmlServerInstance: Server;

/** Port of the local HTML server. */
serverPort: number;

/** Browser used for rendering; either handed to start() or launched by it. */
headlessBrowser: plugins.smartpuppeteer.puppeteer.Browser;

/** True when the browser was supplied by the caller, in which case stop() leaves it open. */
externalBrowserBool = false;

/** Resolved once the HTML server is listening. */
private _readyDeferred: plugins.smartpromise.Deferred<void>;

/** HTML candidates waiting to be rendered, keyed by their pdfId. */
private _candidates: Record<string, PdfCandidate> = {};

/**
 * Prepares the readiness deferred; nothing is started here.
 */
constructor() {
  this._readyDeferred = new plugins.smartpromise.Deferred<void>();
}
|
|
|
|
/**
 * Starts the instance: picks (or launches) the headless browser and brings up the
 * local HTTP server that serves registered HTML candidates under `/:pdfId`.
 *
 * @param headlessBrowserArg optional browser to reuse; when given, stop() will not close it
 * @returns resolves once the HTML server is listening
 * @throws rejects with the server's error if it cannot start listening (e.g. port in use)
 */
async start(headlessBrowserArg?: plugins.smartpuppeteer.puppeteer.Browser) {
  const htmlServerPort = 3210;

  // lets set the external browser in case one is provided
  this.headlessBrowser = headlessBrowserArg;
  if (this.headlessBrowser) {
    this.externalBrowserBool = true;
  } else {
    this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
      forceNoSandbox: false,
    });
  }

  // setup server
  const app = plugins.express();
  app.get('/:pdfId', (req, res) => {
    const wantedCandidate = this._candidates[req.params.pdfId];
    if (!wantedCandidate) {
      console.log(`${req.url} not attached to a candidate`);
      // Answer explicitly: an unanswered request keeps the requesting page waiting until it times out.
      res.status(404).send('unknown pdf id');
      return;
    }
    // The id is echoed in a header so the renderer can verify it received the right document.
    res.setHeader('pdf-id', wantedCandidate.pdfId);
    res.send(wantedCandidate.htmlString);
  });
  this.htmlServerInstance = plugins.http.createServer(app);
  this.serverPort = htmlServerPort;

  // Wait for either 'listening' or 'error'; without the error branch a bind failure
  // (such as EADDRINUSE) would leave start() pending forever.
  await new Promise<void>((resolve, reject) => {
    this.htmlServerInstance.once('error', reject);
    this.htmlServerInstance.once('listening', () => {
      this.htmlServerInstance.removeListener('error', reject);
      resolve();
    });
    this.htmlServerInstance.listen(htmlServerPort, 'localhost');
  });

  this._readyDeferred.resolve();
}
|
|
|
|
// stop
|
|
/**
 * Shuts the instance down: closes the HTML server and, unless the browser was
 * supplied by the caller, the headless browser as well.
 *
 * @returns resolves once the server's close callback has fired
 */
async stop() {
  // The executor runs synchronously, so the server close is initiated before the browser is touched.
  const serverClosed = new Promise<void>((resolve) => {
    this.htmlServerInstance.close(() => resolve());
  });

  const ownsBrowser = !this.externalBrowserBool;
  if (ownsBrowser) {
    await this.headlessBrowser.close();
  }

  await serverClosed;
}
|
|
|
|
/**
 * Returns a PDF for a given HTML string.
 *
 * The HTML is registered as a candidate, served by the local HTML server, opened in a
 * new browser page and printed with a 794x1122 page size (A4 at 96 dpi).
 * The page and the candidate registration are always released, also when rendering
 * fails or the id check does not pass.
 *
 * @param htmlStringArg the HTML document to render
 * @returns the rendered PDF including its extracted text; resolves to undefined
 *          (after logging) when the served response does not carry the expected pdf-id header
 */
async getA4PdfResultForHtmlString(htmlStringArg: string): Promise<plugins.tsclass.business.IPdf> {
  await this._readyDeferred.promise;
  const pdfCandidate = new PdfCandidate(htmlStringArg);
  const page = await this.headlessBrowser.newPage();
  // Register only after the page exists so a failing newPage() cannot leave a stale candidate behind.
  this._candidates[pdfCandidate.pdfId] = pdfCandidate;

  let nodePdfBuffer: Buffer;
  try {
    await page.setViewport({
      width: 794,
      height: 1122,
    });
    // Falls back to 3210 when serverPort has not been assigned.
    const port = this.serverPort ?? 3210;
    const response = await page.goto(`http://localhost:${port}/${pdfCandidate.pdfId}`, {
      waitUntil: 'networkidle2',
    });
    const headers = response.headers();
    if (headers['pdf-id'] !== pdfCandidate.pdfId) {
      console.log('Error! Headers do not match. For security reasons no pdf is being emitted!');
      return;
    }
    console.log(`id security check passed for ${pdfCandidate.pdfId}`);

    const pdfBuffer = await page.pdf({
      width: 794,
      height: 1122,
      printBackground: true,
      displayHeaderFooter: false,
    });
    // Convert Uint8Array to Node Buffer
    nodePdfBuffer = Buffer.from(pdfBuffer);
  } finally {
    // Runs on success, on the early return above and on errors alike.
    delete this._candidates[pdfCandidate.pdfId];
    await page.close();
  }

  pdfCandidate.doneDeferred.resolve();
  const textExtraction = await this.extractTextFromPdfBuffer(nodePdfBuffer);
  return {
    id: pdfCandidate.pdfId,
    // NOTE(review): the buffer holds a PDF but the name ends in ".js" — confirm whether ".pdf" was intended.
    name: `${pdfCandidate.pdfId}.js`,
    metadata: {
      textExtraction,
    },
    buffer: nodePdfBuffer,
  };
}
|
|
|
|
/**
 * Opens the given URL in a new browser page (1980x1200 viewport, screen media)
 * and prints it to a PDF. The page is closed even when navigation or printing fails.
 *
 * @param websiteUrl address of the website to print
 * @returns the PDF together with its extracted text, under a freshly generated short id
 */
async getPdfResultForWebsite(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
  const page = await this.headlessBrowser.newPage();

  let nodePdfBuffer: Buffer;
  try {
    await page.setViewport({
      width: 1980,
      height: 1200,
    });
    await page.emulateMediaType('screen');
    await page.goto(websiteUrl, { waitUntil: 'networkidle2' });

    // NOTE(review): document.height / document.width are not standard DOM properties; if they
    // evaluate to undefined, page.pdf() receives no explicit size. Confirm whether
    // scrollHeight / scrollWidth was intended — left unchanged to keep the current output.
    const { documentHeight, documentWidth } = await page.evaluate(() => {
      return {
        documentHeight: document.height,
        documentWidth: document.width,
      };
    });
    const pdfBuffer = await page.pdf({
      height: documentHeight,
      width: documentWidth,
      printBackground: true,
      displayHeaderFooter: false,
    });
    // Convert Uint8Array to Node Buffer
    nodePdfBuffer = Buffer.from(pdfBuffer);
  } finally {
    await page.close();
  }

  const pdfId = plugins.smartunique.shortId();
  const textExtraction = await this.extractTextFromPdfBuffer(nodePdfBuffer);
  return {
    id: pdfId,
    // NOTE(review): the buffer holds a PDF but the name ends in ".js" — confirm whether ".pdf" was intended.
    name: `${pdfId}.js`,
    metadata: {
      textExtraction,
    },
    buffer: nodePdfBuffer,
  };
}
|
|
|
|
/**
 * Prints a website into a PDF whose single page is 1920 wide and as tall as the
 * whole document. The page is closed even when navigation or printing fails.
 *
 * @param websiteUrl address of the website to print
 * @returns the PDF together with its extracted text, under a freshly generated short id
 */
async getFullWebsiteAsSinglePdf(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
  const pageWidth = 1920;
  const page = await this.headlessBrowser.newPage();

  let nodePdfBuffer: Buffer;
  try {
    await page.setViewport({
      width: pageWidth,
      height: 1200,
    });
    await page.emulateMediaType('screen');
    await page.goto(websiteUrl, { waitUntil: 'networkidle2' });

    // Use both document.body and document.documentElement to ensure we have a valid height;
    // a measurement of 0 falls back to the initial viewport height.
    const documentHeight: number = await page.evaluate(() => {
      return (
        Math.max(document.body.scrollHeight, document.documentElement.scrollHeight) || 1200
      );
    });

    // Update viewport height to the full document height.
    await page.setViewport({
      width: pageWidth,
      height: documentHeight,
    });
    const pdfBuffer = await page.pdf({
      height: documentHeight,
      width: pageWidth,
      printBackground: true,
      displayHeaderFooter: false,
      scale: 1,
      // only the first page is emitted
      pageRanges: '1',
    });
    // Convert Uint8Array to Node Buffer
    nodePdfBuffer = Buffer.from(pdfBuffer);
  } finally {
    await page.close();
  }

  const pdfId = plugins.smartunique.shortId();
  const textExtraction = await this.extractTextFromPdfBuffer(nodePdfBuffer);
  return {
    id: pdfId,
    // NOTE(review): the buffer holds a PDF but the name ends in ".js" — confirm whether ".pdf" was intended.
    name: `${pdfId}.js`,
    metadata: {
      textExtraction,
    },
    buffer: nodePdfBuffer,
  };
}
|
|
|
|
/**
 * Concatenates several PDFs into one document, keeping the order of the inputs
 * and the page order inside each input.
 *
 * @param inputPdfBuffers raw bytes of the PDFs to merge
 * @returns the bytes of the merged PDF
 */
public async mergePdfs(inputPdfBuffers: Uint8Array[]): Promise<Uint8Array> {
  const targetDocument = await plugins.pdfLib.PDFDocument.create();

  for (const sourceBytes of inputPdfBuffers) {
    const sourceDocument = await plugins.pdfLib.PDFDocument.load(sourceBytes);
    const copiedPages = await targetDocument.copyPages(
      sourceDocument,
      sourceDocument.getPageIndices()
    );
    for (const copiedPage of copiedPages) {
      targetDocument.addPage(copiedPage);
    }
  }

  return targetDocument.save();
}
|
|
|
|
/**
 * Loads a file from disk into an IPdf-shaped object.
 *
 * @param pathArg path to the file; it is made absolute before reading
 * @returns an object carrying the file's base name and contents; `id` and `metadata` are null
 */
public async readFileToPdfObject(pathArg: string): Promise<plugins.tsclass.business.IPdf> {
  const resolvedPath = plugins.smartpath.transform.makeAbsolute(pathArg);
  const { base: fileName } = plugins.path.parse(resolvedPath);
  const fileContents = await plugins.smartfile.fs.toBuffer(resolvedPath);

  return {
    name: fileName,
    buffer: fileContents,
    id: null,
    metadata: null,
  };
}
|
|
|
|
/**
 * Extracts the text of a PDF by concatenating the `T` value of every text run
 * on every page, in the order the parser reports them.
 *
 * @param pdfBufferArg the PDF to read
 * @returns the concatenated text exactly as delivered by the parser
 * @throws rejects when the parser reports an error or delivers data in an unexpected shape
 */
public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const pdfParser: any = new plugins.pdf2json();

    // Without this handler a parse failure would leave the returned promise pending forever.
    pdfParser.on('pdfParser_dataError', (errData: any) => {
      const cause = errData && errData.parserError ? errData.parserError : errData;
      reject(
        cause instanceof Error ? cause : new Error(`PDF text extraction failed: ${String(cause)}`)
      );
    });

    pdfParser.on('pdfParser_dataReady', (pdfData: any) => {
      try {
        let finalText = '';
        for (const page of pdfData.Pages) {
          for (const text of page.Texts) {
            for (const run of text.R) {
              finalText += run.T;
            }
          }
        }
        resolve(finalText);
      } catch (err) {
        // An exception inside an event handler would otherwise escape as an uncaught error.
        reject(err);
      }
    });

    pdfParser.parseBuffer(pdfBufferArg);
  });
}
|
|
|
|
/**
 * Verifies that the external tools GraphicsMagick (`gm`) and Ghostscript (`gs`)
 * can be executed. Both probes are started together.
 *
 * @throws rejects with the first failing probe's error
 */
private async checkDependencies(): Promise<void> {
  const probes: Array<[string, string[]]> = [
    ['gm', ['version']],
    ['gs', ['--version']],
  ];
  await Promise.all(probes.map(([command, args]) => this.checkCommandExists(command, args)));
}
|
|
|
|
/**
 * Probes for a command by running it once with the given arguments.
 *
 * @param command executable to run
 * @param args arguments passed to the executable
 * @returns resolves when the command ran without error
 * @throws rejects with a "Dependency check failed" error when execution fails
 */
private checkCommandExists(command: string, args: string[]): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    execFile(command, args, (execError) => {
      if (!execError) {
        resolve();
        return;
      }
      reject(
        new Error(
          `Dependency check failed: ${command} is not installed or not in the PATH. ${execError.message}`
        )
      );
    });
  });
}
|
|
|
|
/**
 * Converts a PDF to PNG bytes for each page using Puppeteer and PDF.js.
 * This method creates a temporary HTML page that loads PDF.js from a CDN,
 * renders each PDF page to a canvas, and then screenshots each canvas element.
 *
 * The browser page is closed even when rendering fails, and an error raised inside
 * the page is reported immediately instead of surfacing as a wait timeout.
 *
 * @param pdfBytes the PDF to rasterize
 * @param options accepted for interface compatibility; width, height and quality are
 *                not applied because the canvas size follows the PDF page dimensions
 * @returns one PNG per PDF page, in page order
 * @throws when PDF.js fails inside the page or rendering does not finish within 30 seconds
 */
public async convertPDFToPngBytes(
  pdfBytes: Uint8Array,
  options: { width?: number; height?: number; quality?: number } = {}
): Promise<Uint8Array[]> {
  // Prepare PDF data as a base64 string.
  const base64Pdf: string = Buffer.from(pdfBytes).toString('base64');

  // HTML template that loads PDF.js and renders the PDF.
  // The script records failures in window.renderError so the host side can stop waiting.
  const htmlTemplate: string = `
    <!DOCTYPE html>
    <html>
      <head>
        <meta charset="utf-8">
        <title>PDF to PNG Converter</title>
        <style>
          body { margin: 0; }
          canvas { display: block; margin: 10px auto; }
        </style>
        <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.min.js"></script>
      </head>
      <body>
        <script>
          (async function() {
            try {
              pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.worker.min.js';
              const pdfData = "__PDF_DATA__";
              const raw = atob(pdfData);
              const pdfArray = new Uint8Array([...raw].map(c => c.charCodeAt(0)));
              const loadingTask = pdfjsLib.getDocument({data: pdfArray});
              const pdf = await loadingTask.promise;
              const numPages = pdf.numPages;
              for (let pageNum = 1; pageNum <= numPages; pageNum++) {
                const page = await pdf.getPage(pageNum);
                const viewport = page.getViewport({ scale: 1.0 });
                const canvas = document.createElement('canvas');
                const context = canvas.getContext('2d');
                canvas.width = viewport.width;
                canvas.height = viewport.height;
                await page.render({ canvasContext: context, viewport: viewport }).promise;
                document.body.appendChild(canvas);
              }
              window.renderComplete = true;
            } catch (err) {
              window.renderError = String((err && err.message) || err);
            }
          })();
        </script>
      </body>
    </html>
  `;

  // Replace the placeholder with the actual base64 PDF data.
  const htmlContent: string = htmlTemplate.replace('__PDF_DATA__', base64Pdf);

  // Create a new page using the headless browser.
  const page = await this.headlessBrowser.newPage();
  try {
    // Set the page content.
    await page.setContent(htmlContent, { waitUntil: 'networkidle0' });

    // Wait until the PDF.js rendering is complete or has reported a failure.
    await page.waitForFunction(
      () =>
        (window as any).renderComplete === true ||
        typeof (window as any).renderError === 'string',
      { timeout: 30000 }
    );
    const renderError: string | undefined = await page.evaluate(
      () => (window as any).renderError
    );
    if (renderError) {
      throw new Error(`PDF.js rendering failed: ${renderError}`);
    }

    // Query all canvas elements (each representing a rendered PDF page).
    const canvasElements = await page.$$('canvas');
    const pngBuffers: Uint8Array[] = [];
    for (const canvasElement of canvasElements) {
      // Screenshot the canvas element. The screenshot will be a PNG buffer.
      const screenshotBuffer = (await canvasElement.screenshot({ encoding: 'binary' })) as Buffer;
      pngBuffers.push(new Uint8Array(screenshotBuffer));
    }
    return pngBuffers;
  } finally {
    await page.close();
  }
}
|
|
} |