smartpdf/ts/smartpdf.classes.smartpdf.ts

352 lines
12 KiB
TypeScript

import * as plugins from './smartpdf.plugins.js';
import * as paths from './smartpdf.paths.js';
import { Server } from 'http';
import { PdfCandidate } from './smartpdf.classes.pdfcandidate.js';
import { type IPdf } from '@tsclass/tsclass/dist_ts/business/pdf.js';
import { execFile } from 'child_process';
declare const document: any;
/**
 * Renders HTML strings and websites to PDF through a headless browser and offers
 * helpers for merging PDFs, extracting their text and rasterizing pages to PNG.
 *
 * HTML strings are exposed through a small local express server so that the
 * browser can navigate to them like to any other page. Call `start()` before
 * rendering and `stop()` when done.
 */
export class SmartPdf {
  // STATIC
  /**
   * Creates a new instance. The instance is not started yet.
   */
  public static async create() {
    const smartpdfInstance = new SmartPdf();
    return smartpdfInstance;
  }

  // INSTANCE
  /** The local http server that serves registered HTML candidates. Set by `start()`. */
  htmlServerInstance: Server;
  /**
   * Port the local server listens on. May be assigned before `start()`;
   * defaults to 3210 when left unset.
   */
  serverPort: number;
  headlessBrowser: plugins.smartpuppeteer.puppeteer.Browser;
  /** True when the browser was handed in by the caller and therefore is not closed by `stop()`. */
  externalBrowserBool: boolean = false;
  private _readyDeferred: plugins.smartpromise.Deferred<void>;
  /** HTML candidates currently being rendered, keyed by their pdfId. */
  private _candidates: { [key: string]: PdfCandidate } = {};

  constructor() {
    this._readyDeferred = new plugins.smartpromise.Deferred();
  }

  /**
   * Starts the headless browser (unless one is supplied) and the local HTML server.
   *
   * @param headlessBrowserArg optional externally managed browser; when given it is
   *   used as is and will not be closed by `stop()`.
   * @throws when the local server cannot start listening (e.g. the port is taken).
   */
  async start(headlessBrowserArg?: plugins.smartpuppeteer.puppeteer.Browser) {
    // lets set the external browser in case one is provided
    this.headlessBrowser = headlessBrowserArg;
    // setup puppeteer
    if (this.headlessBrowser) {
      this.externalBrowserBool = true;
    } else {
      this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
        forceNoSandbox: false,
      });
    }

    // setup server
    const app = plugins.express();
    app.get('/:pdfId', (req, res) => {
      const wantedCandidate = this._candidates[req.params.pdfId];
      if (!wantedCandidate) {
        console.log(`${req.url} not attached to a candidate`);
        // Always answer, otherwise the request would hang until the client times out.
        res.status(404).send('not found');
        return;
      }
      res.setHeader('pdf-id', wantedCandidate.pdfId);
      res.send(wantedCandidate.htmlString);
    });

    const server = plugins.http.createServer(app);
    this.htmlServerInstance = server;
    this.serverPort = this.serverPort ?? 3210;
    const port = this.serverPort;

    try {
      await new Promise<void>((resolve, reject) => {
        // Without an 'error' listener a failing listen() would leave start() pending forever.
        server.once('error', reject);
        server.once('listening', () => {
          server.removeListener('error', reject);
          resolve();
        });
        server.listen(port, 'localhost');
      });
    } catch (error) {
      // Do not leave a browser running that we launched ourselves.
      if (!this.externalBrowserBool) {
        await this.headlessBrowser.close().catch(() => undefined);
      }
      throw error;
    }
    this._readyDeferred.resolve();
  }

  /**
   * Stops the local server and closes the browser, unless the browser was
   * supplied externally. Safe to call when `start()` was never called.
   */
  async stop() {
    const serverClosed = new Promise<void>((resolve) => {
      if (!this.htmlServerInstance) {
        resolve();
        return;
      }
      this.htmlServerInstance.close(() => {
        resolve();
      });
    });
    if (!this.externalBrowserBool && this.headlessBrowser) {
      await this.headlessBrowser.close();
    }
    await serverClosed;
  }

  /**
   * Opens a fresh browser page, runs the given task with it and closes the
   * page afterwards, no matter whether the task succeeded or threw.
   */
  private async withPage<T>(
    taskArg: (page: plugins.smartpuppeteer.puppeteer.Page) => Promise<T>
  ): Promise<T> {
    const page = await this.headlessBrowser.newPage();
    try {
      return await taskArg(page);
    } finally {
      await page.close();
    }
  }

  /**
   * Returns a PDF for a given HTML string.
   *
   * The HTML is registered as a candidate, served by the local server and
   * printed with a 794x1122 page size. Waits until `start()` has completed.
   *
   * @returns the PDF, or `undefined` when the `pdf-id` response header does not
   *   match the candidate (in which case nothing is emitted).
   */
  async getA4PdfResultForHtmlString(htmlStringArg: string): Promise<plugins.tsclass.business.IPdf> {
    await this._readyDeferred.promise;
    const pdfCandidate = new PdfCandidate(htmlStringArg);
    this._candidates[pdfCandidate.pdfId] = pdfCandidate;

    let nodePdfBuffer: Buffer | undefined;
    try {
      nodePdfBuffer = await this.withPage(async (page): Promise<Buffer | undefined> => {
        await page.setViewport({
          width: 794,
          height: 1122,
        });
        const response = await page.goto(
          `http://localhost:${this.serverPort}/${pdfCandidate.pdfId}`,
          {
            waitUntil: 'networkidle2',
          }
        );
        const headers = response.headers();
        if (headers['pdf-id'] !== pdfCandidate.pdfId) {
          console.log('Error! Headers do not match. For security reasons no pdf is being emitted!');
          return undefined;
        }
        console.log(`id security check passed for ${pdfCandidate.pdfId}`);
        const pdfBuffer = await page.pdf({
          width: 794,
          height: 1122,
          printBackground: true,
          displayHeaderFooter: false,
        });
        // Convert Uint8Array to Node Buffer
        return Buffer.from(pdfBuffer);
      });
    } finally {
      // Unregister the candidate on every path so failed renders do not pile up.
      delete this._candidates[pdfCandidate.pdfId];
    }

    if (!nodePdfBuffer) {
      return;
    }
    pdfCandidate.doneDeferred.resolve();
    return {
      id: pdfCandidate.pdfId,
      // NOTE(review): a '.js' suffix on a PDF name looks unintended; kept as is
      // because callers may rely on the current value — confirm before changing.
      name: `${pdfCandidate.pdfId}.js`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
      },
      buffer: nodePdfBuffer,
    };
  }

  /**
   * Renders the website at the given URL to a PDF whose page size follows the
   * measured document size (falling back to the 1980x1200 viewport).
   */
  async getPdfResultForWebsite(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
    const pdfId = plugins.smartunique.shortId();
    const nodePdfBuffer = await this.withPage(async (page) => {
      await page.setViewport({
        width: 1980,
        height: 1200,
      });
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      // document.height / document.width are non-standard and undefined in current
      // browsers, so measure the document via body and documentElement instead.
      const { documentHeight, documentWidth } = await page.evaluate(() => {
        return {
          documentHeight:
            Math.max(document.body.scrollHeight, document.documentElement.scrollHeight) || 1200,
          documentWidth:
            Math.max(document.body.clientWidth, document.documentElement.clientWidth) || 1980,
        };
      });
      const pdfBuffer = await page.pdf({
        height: documentHeight,
        width: documentWidth,
        printBackground: true,
        displayHeaderFooter: false,
      });
      // Convert Uint8Array to Node Buffer
      return Buffer.from(pdfBuffer);
    });
    return {
      id: pdfId,
      // NOTE(review): '.js' suffix kept for backward compatibility — confirm intent.
      name: `${pdfId}.js`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
      },
      buffer: nodePdfBuffer,
    };
  }

  /**
   * Renders the website at the given URL into a single PDF page that is
   * 1920 wide and as tall as the measured document.
   */
  async getFullWebsiteAsSinglePdf(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
    const pdfId = plugins.smartunique.shortId();
    const nodePdfBuffer = await this.withPage(async (page) => {
      await page.setViewport({
        width: 1920,
        height: 1200,
      });
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      // Use both document.body and document.documentElement to ensure we have a valid height.
      const documentHeight = await page.evaluate(() => {
        return (
          Math.max(document.body.scrollHeight, document.documentElement.scrollHeight) || 1200
        );
      });
      // Update viewport height to the full document height.
      await page.setViewport({
        width: 1920,
        height: documentHeight,
      });
      const pdfBuffer = await page.pdf({
        height: documentHeight,
        width: 1920,
        printBackground: true,
        displayHeaderFooter: false,
        scale: 1,
        pageRanges: '1',
      });
      // Convert Uint8Array to Node Buffer
      return Buffer.from(pdfBuffer);
    });
    return {
      id: pdfId,
      // NOTE(review): '.js' suffix kept for backward compatibility — confirm intent.
      name: `${pdfId}.js`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
      },
      buffer: nodePdfBuffer,
    };
  }

  /**
   * Concatenates the pages of all given PDFs, in input order, into one PDF.
   *
   * @returns the bytes of the merged document.
   */
  public async mergePdfs(inputPdfBuffers: Uint8Array[]): Promise<Uint8Array> {
    const mergedPdf = await plugins.pdfLib.PDFDocument.create();
    for (const pdfBytes of inputPdfBuffers) {
      const pdfDoc = await plugins.pdfLib.PDFDocument.load(pdfBytes);
      const pages = await mergedPdf.copyPages(pdfDoc, pdfDoc.getPageIndices());
      for (const page of pages) {
        mergedPdf.addPage(page);
      }
    }
    const mergedPdfBytes = await mergedPdf.save();
    return mergedPdfBytes;
  }

  /**
   * Reads a file from disk into a PDF object. The name is the file's base name;
   * `id` and `metadata` are set to `null`.
   */
  public async readFileToPdfObject(pathArg: string): Promise<plugins.tsclass.business.IPdf> {
    const absolutePath = plugins.smartpath.transform.makeAbsolute(pathArg);
    const parsedPath = plugins.path.parse(absolutePath);
    const buffer = await plugins.smartfile.fs.toBuffer(absolutePath);
    return {
      name: parsedPath.base,
      buffer,
      id: null,
      metadata: null,
    };
  }

  /**
   * Extracts the text of a PDF by concatenating the `T` value of every text run
   * on every page, in the order the parser reports them.
   *
   * @throws (rejects) when the parser reports an error or delivers data that
   *   cannot be walked, instead of leaving the promise pending.
   */
  public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      const pdfParser: any = new plugins.pdf2json();
      pdfParser.on('pdfParser_dataError', (errData: any) => {
        const cause = errData && errData.parserError ? errData.parserError : errData;
        reject(
          cause instanceof Error
            ? cause
            : new Error(`PDF text extraction failed: ${String(cause)}`)
        );
      });
      pdfParser.on('pdfParser_dataReady', (pdfData: any) => {
        try {
          let finalText = '';
          for (const page of pdfData.Pages) {
            for (const text of page.Texts) {
              for (const letter of text.R) {
                finalText = finalText + letter.T;
              }
            }
          }
          resolve(finalText);
        } catch (error) {
          // An exception inside an event handler would otherwise escape the promise.
          reject(error);
        }
      });
      pdfParser.parseBuffer(pdfBufferArg);
    });
  }

  /**
   * Checks for the presence of required dependencies: GraphicsMagick and Ghostscript.
   * Rejects when either `gm version` or `gs --version` cannot be executed.
   */
  private async checkDependencies(): Promise<void> {
    await Promise.all([
      this.checkCommandExists('gm', ['version']),
      this.checkCommandExists('gs', ['--version']),
    ]);
  }

  /**
   * Checks if a given command exists by trying to execute it with the given arguments.
   * Rejects with a descriptive error when execution fails.
   */
  private checkCommandExists(command: string, args: string[]): Promise<void> {
    return new Promise((resolve, reject) => {
      execFile(command, args, (error) => {
        if (error) {
          reject(
            new Error(
              `Dependency check failed: ${command} is not installed or not in the PATH. ${error.message}`
            )
          );
        } else {
          resolve();
        }
      });
    });
  }

  /**
   * Converts a PDF to PNG bytes for each page using Puppeteer and PDF.js.
   * This method creates a temporary HTML page that loads PDF.js from a CDN,
   * renders each PDF page to a canvas, and then screenshots each canvas element.
   *
   * @param pdfBytes the PDF document to rasterize.
   * @param options currently not applied: the rendered canvas size is determined
   *   by the PDF page dimensions.
   * @returns one PNG byte array per canvas, in document order.
   */
  public async convertPDFToPngBytes(
    pdfBytes: Uint8Array,
    options: { width?: number; height?: number; quality?: number } = {}
  ): Promise<Uint8Array[]> {
    // Prepare PDF data as a base64 string.
    const base64Pdf: string = Buffer.from(pdfBytes).toString('base64');
    // HTML template that loads PDF.js and renders the PDF.
    const htmlTemplate: string = `
      <!DOCTYPE html>
      <html>
        <head>
          <meta charset="utf-8">
          <title>PDF to PNG Converter</title>
          <style>
            body { margin: 0; }
            canvas { display: block; margin: 10px auto; }
          </style>
          <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.min.js"></script>
        </head>
        <body>
          <script>
            (async function() {
              pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.worker.min.js';
              const pdfData = "__PDF_DATA__";
              const raw = atob(pdfData);
              const pdfArray = new Uint8Array([...raw].map(c => c.charCodeAt(0)));
              const loadingTask = pdfjsLib.getDocument({data: pdfArray});
              const pdf = await loadingTask.promise;
              const numPages = pdf.numPages;
              for (let pageNum = 1; pageNum <= numPages; pageNum++) {
                const page = await pdf.getPage(pageNum);
                const viewport = page.getViewport({ scale: 1.0 });
                const canvas = document.createElement('canvas');
                const context = canvas.getContext('2d');
                canvas.width = viewport.width;
                canvas.height = viewport.height;
                await page.render({ canvasContext: context, viewport: viewport }).promise;
                document.body.appendChild(canvas);
              }
              window.renderComplete = true;
            })();
          </script>
        </body>
      </html>
    `;
    // Replace the placeholder with the actual base64 PDF data.
    const htmlContent: string = htmlTemplate.replace('__PDF_DATA__', base64Pdf);

    return this.withPage(async (page) => {
      // Set the page content.
      await page.setContent(htmlContent, { waitUntil: 'networkidle0' });
      // Wait until the PDF.js rendering is complete.
      await page.waitForFunction(() => (window as any).renderComplete === true, {
        timeout: 30000,
      });
      // Query all canvas elements (each representing a rendered PDF page).
      const canvasElements = await page.$$('canvas');
      const pngBuffers: Uint8Array[] = [];
      for (const canvasElement of canvasElements) {
        // Screenshot the canvas element. The screenshot will be a PNG buffer.
        const screenshotBuffer = (await canvasElement.screenshot({
          encoding: 'binary',
        })) as Buffer;
        pngBuffers.push(new Uint8Array(screenshotBuffer));
      }
      return pngBuffers;
    });
  }
}