Files
smartpdf/ts/smartpdf.classes.smartpdf.ts

617 lines
22 KiB
TypeScript
Raw Normal View History

2022-03-24 14:32:49 +01:00
import * as plugins from './smartpdf.plugins.js';
import * as paths from './smartpdf.paths.js';
import { PdfCandidate } from './smartpdf.classes.pdfcandidate.js';
2023-07-26 14:17:11 +02:00
import { type IPdf } from '@tsclass/tsclass/dist_ts/business/pdf.js';
2021-10-14 10:59:45 +02:00
declare const document: any;
2019-05-29 00:27:43 +02:00
/**
 * Options accepted by the SmartPdf constructor and SmartPdf.create().
 * They control which local port the internal HTML server binds to.
 */
export interface ISmartPdfOptions {
/** Fixed port for the internal server. When set, start() throws if the port is already in use. */
port?: number;
/** Start of the range passed to findFreePort() when `port` is not set (the constructor defaults it to 20000). */
portRangeStart?: number;
/** End of the range passed to findFreePort() when `port` is not set (the constructor defaults it to 30000). */
portRangeEnd?: number;
}
2018-10-06 13:25:45 +00:00
/**
 * Renders HTML strings and websites to PDF through a headless browser and
 * offers helpers for merging PDFs, extracting text and rasterizing PDF pages.
 *
 * HTML strings are served to the browser by a small local HTTP server that is
 * started in `start()` and torn down in `stop()`.
 */
export class SmartPdf {
  // STATIC SCALE CONSTANTS
  /** ~144 DPI - good for screen display. */
  public static readonly SCALE_SCREEN = 2.0;
  /** ~216 DPI - high quality (default of the image converters). */
  public static readonly SCALE_HIGH = 3.0;
  /** ~432 DPI - print quality. */
  public static readonly SCALE_PRINT = 6.0;

  /**
   * Calculate scale factor for desired DPI.
   * PDF.js default is 72 DPI, so scale = desiredDPI / 72.
   */
  public static getScaleForDPI(dpi: number): number {
    return dpi / 72;
  }

  // STATIC
  /**
   * Creates a new, not yet started, SmartPdf instance.
   */
  public static async create(optionsArg?: ISmartPdfOptions): Promise<SmartPdf> {
    return new SmartPdf(optionsArg);
  }

  /**
   * Normalizes an optional numeric render option.
   *
   * Falsy values (undefined, null, 0, NaN, '') mean "not specified", which
   * mirrors the truthiness checks the converters have always used. Everything
   * else must coerce to a positive finite number, because the value is
   * interpolated into a script that runs inside the browser page.
   *
   * @throws Error when the value is set but is not a positive finite number.
   */
  private static normalizePositiveNumber(valueArg: unknown, nameArg: string): number | undefined {
    if (!valueArg) {
      return undefined;
    }
    const numericValue = Number(valueArg);
    if (!Number.isFinite(numericValue) || numericValue <= 0) {
      throw new Error(`Invalid ${nameArg}: expected a positive finite number`);
    }
    return numericValue;
  }

  /**
   * Builds the HTML document that loads PDF.js from cdnjs and renders every
   * page of the given PDF onto its own canvas. The in-page script sets
   * `window.renderComplete = true` once all pages are rendered.
   *
   * @param base64PdfArg the PDF bytes encoded as base64
   * @param optionsArg scale and optional maximum canvas dimensions in pixels
   * @param titleArg value of the document's <title>
   * @throws Error when scale, maxWidth or maxHeight is set but not a positive finite number
   */
  private static buildPdfRenderHtml(
    base64PdfArg: string,
    optionsArg: { scale?: number; maxWidth?: number; maxHeight?: number } = {},
    titleArg: string = 'PDF to PNG Converter'
  ): string {
    const scale = SmartPdf.normalizePositiveNumber(optionsArg.scale, 'scale') ?? SmartPdf.SCALE_HIGH;
    const maxWidth = SmartPdf.normalizePositiveNumber(optionsArg.maxWidth, 'maxWidth');
    const maxHeight = SmartPdf.normalizePositiveNumber(optionsArg.maxHeight, 'maxHeight');

    // The constraint snippets are only emitted when the matching limit is set.
    const widthConstraint =
      maxWidth === undefined
        ? ''
        : `
            if (viewport.width > ${maxWidth}) {
              finalScale = ${maxWidth} / (viewport.width / ${scale});
            }`;
    const heightConstraint =
      maxHeight === undefined
        ? ''
        : `
            if (viewport.height > ${maxHeight}) {
              const heightScale = ${maxHeight} / (viewport.height / ${scale});
              finalScale = Math.min(finalScale, heightScale);
            }`;

    return `
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>${titleArg}</title>
    <style>
      body { margin: 0; }
      canvas { display: block; margin: 10px auto; }
    </style>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.min.js"></script>
  </head>
  <body>
    <script>
      (async function() {
        pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.worker.min.js';
        const pdfData = "${base64PdfArg}";
        const raw = atob(pdfData);
        const pdfArray = new Uint8Array([...raw].map(c => c.charCodeAt(0)));
        const loadingTask = pdfjsLib.getDocument({data: pdfArray});
        const pdf = await loadingTask.promise;
        const numPages = pdf.numPages;
        for (let pageNum = 1; pageNum <= numPages; pageNum++) {
          const page = await pdf.getPage(pageNum);
          // Apply scale factor to viewport
          const viewport = page.getViewport({ scale: ${scale} });
          // Apply max width/height constraints if specified
          let finalScale = ${scale};${widthConstraint}${heightConstraint}
          // Get final viewport with adjusted scale
          const finalViewport = page.getViewport({ scale: finalScale });
          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = finalViewport.width;
          canvas.height = finalViewport.height;
          canvas.setAttribute('data-page', pageNum);
          await page.render({ canvasContext: context, viewport: finalViewport }).promise;
          document.body.appendChild(canvas);
        }
        window.renderComplete = true;
      })();
    </script>
  </body>
</html>
`;
  }

  // INSTANCE
  private smartserveInstance: plugins.smartserve.SmartServe;
  serverPort: number;
  headlessBrowser: plugins.smartpuppeteer.puppeteer.Browser;
  /** True while `headlessBrowser` was handed in by the caller and must not be closed by this instance. */
  externalBrowserBool: boolean = false;
  private _readyDeferred: plugins.smartpromise.Deferred<void>;
  // A Map (instead of a plain object) so that request paths such as
  // "/constructor" or "/__proto__" can never resolve to prototype members.
  private _candidates: Map<string, PdfCandidate> = new Map();
  private _options: ISmartPdfOptions;
  private _isRunning: boolean = false;

  constructor(optionsArg?: ISmartPdfOptions) {
    this._readyDeferred = new plugins.smartpromise.Deferred();
    this._options = {
      portRangeStart: 20000,
      portRangeEnd: 30000,
      ...optionsArg,
    };
  }

  /**
   * Starts the headless browser (unless one is supplied) and the local server
   * that serves HTML candidates to it.
   *
   * @param headlessBrowserArg optional externally managed browser; it is used as is and never closed by this instance
   * @throws Error when already running, when the requested port is in use, or when no free port is found in the configured range
   */
  async start(headlessBrowserArg?: plugins.smartpuppeteer.puppeteer.Browser) {
    if (this._isRunning) {
      throw new Error('SmartPdf is already running. Call stop() before starting again.');
    }
    // Reset readiness deferred for this start cycle
    this._readyDeferred = new plugins.smartpromise.Deferred();

    // Ownership is recomputed on every start. Otherwise an instance that once
    // ran with an external browser would keep treating every later, internally
    // launched browser as external and never close it.
    this.externalBrowserBool = Boolean(headlessBrowserArg);
    if (headlessBrowserArg) {
      this.headlessBrowser = headlessBrowserArg;
    } else {
      this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
        forceNoSandbox: false,
      });
    }

    try {
      // Find an available port BEFORE creating the server
      this.serverPort = await this.resolveServerPort();

      this.smartserveInstance = new plugins.smartserve.SmartServe({
        port: this.serverPort,
        hostname: 'localhost',
      });
      this.smartserveInstance.setHandler(async (request) => {
        const url = new URL(request.url);
        const pdfId = url.pathname.slice(1); // Remove leading /
        const candidate = this._candidates.get(pdfId);
        if (!candidate) {
          console.log(`${url.pathname} not attached to a candidate`);
          return new Response('Not found', { status: 404 });
        }
        return new Response(candidate.htmlString, {
          headers: {
            'Content-Type': 'text/html; charset=utf-8',
            // Echoed back so the PDF renderer can verify it got the right document.
            'pdf-id': candidate.pdfId,
          },
        });
      });
      await this.smartserveInstance.start();
    } catch (err) {
      // Startup failed: do not leave a browser we launched ourselves running.
      this.smartserveInstance = null;
      await this.closeOwnedBrowser();
      throw err;
    }

    console.log(`SmartPdf server listening on port ${this.serverPort}`);
    this._isRunning = true;
    this._readyDeferred.resolve();
  }

  /**
   * Determines the port for the local server: the explicitly requested port if
   * it is unused, otherwise a free port from the configured range.
   */
  private async resolveServerPort(): Promise<number> {
    const smartnetworkInstance = new plugins.smartnetwork.SmartNetwork();
    const requestedPort = this._options.port;
    if (requestedPort) {
      const isPortAvailable = await smartnetworkInstance.isLocalPortUnused(requestedPort);
      if (!isPortAvailable) {
        throw new Error(`Requested port ${this._options.port} is already in use`);
      }
      return requestedPort;
    }
    const freePort = await smartnetworkInstance.findFreePort(
      this._options.portRangeStart,
      this._options.portRangeEnd
    );
    if (!freePort) {
      throw new Error(`No free ports available in range ${this._options.portRangeStart}-${this._options.portRangeEnd}`);
    }
    return freePort;
  }

  /**
   * Closes and forgets the browser if this instance launched it. Close errors
   * are swallowed so the error that triggered the cleanup is the one reported.
   */
  private async closeOwnedBrowser(): Promise<void> {
    if (!this.externalBrowserBool && this.headlessBrowser) {
      await this.headlessBrowser.close().catch(() => {});
      this.headlessBrowser = null;
    }
  }

  /**
   * Stops the browser (only if this instance launched it) and the local
   * server, and drops all pending candidates. Does nothing when not running.
   */
  async stop() {
    if (!this._isRunning) {
      return;
    }
    this._isRunning = false;
    try {
      // Close browser first to cleanly terminate keepalive connections
      // before the server shuts down (prevents ECONNRESET errors)
      if (!this.externalBrowserBool && this.headlessBrowser) {
        await this.headlessBrowser.close();
      }
    } finally {
      // Even when closing the browser fails, the server must still be stopped.
      this.headlessBrowser = null;
      try {
        if (this.smartserveInstance) {
          await this.smartserveInstance.stop();
        }
      } finally {
        this.smartserveInstance = null;
        // Clear any remaining candidates
        this._candidates.clear();
      }
    }
  }

  /**
   * Assembles the IPdf result object for a rendered PDF, including its
   * extracted text.
   */
  private async buildPdfResult(pdfIdArg: string, pdfBufferArg: Buffer): Promise<plugins.tsclass.business.IPdf> {
    return {
      id: pdfIdArg,
      // NOTE(review): the ".js" suffix looks unintended for a PDF; it is kept
      // because callers may rely on the current names - confirm and change to ".pdf".
      name: `${pdfIdArg}.js`,
      metadata: {
        textExtraction: await this.extractTextFromPdfBuffer(pdfBufferArg),
      },
      buffer: pdfBufferArg,
    };
  }

  /**
   * Returns a PDF for a given HTML string.
   *
   * Waits until start() has completed, serves the HTML through the local
   * server and prints it with a 794x1122 page size. When the served response
   * does not carry the expected `pdf-id` header, nothing is emitted and the
   * promise resolves with undefined.
   */
  async getA4PdfResultForHtmlString(htmlStringArg: string): Promise<plugins.tsclass.business.IPdf> {
    await this._readyDeferred.promise;
    const pdfCandidate = new PdfCandidate(htmlStringArg);
    this._candidates.set(pdfCandidate.pdfId, pdfCandidate);
    let page: plugins.smartpuppeteer.puppeteer.Page;
    try {
      page = await this.headlessBrowser.newPage();
      await page.setViewport({
        width: 794,
        height: 1122,
      });
      const response = await page.goto(`http://localhost:${this.serverPort}/${pdfCandidate.pdfId}`, {
        waitUntil: 'networkidle2',
      });
      const headers = response.headers();
      if (headers['pdf-id'] !== pdfCandidate.pdfId) {
        console.log('Error! Headers do not match. For security reasons no pdf is being emitted!');
        // The finally block below releases the page and the candidate.
        return;
      }
      console.log(`id security check passed for ${pdfCandidate.pdfId}`);

      const pdfBuffer = await page.pdf({
        width: 794,
        height: 1122,
        printBackground: true,
        displayHeaderFooter: false,
      });
      // Convert Uint8Array to Node Buffer
      const nodePdfBuffer = Buffer.from(pdfBuffer);
      await page.close();
      page = undefined;
      this._candidates.delete(pdfCandidate.pdfId);
      pdfCandidate.doneDeferred.resolve();
      await pdfCandidate.doneDeferred.promise;
      return await this.buildPdfResult(pdfCandidate.pdfId, nodePdfBuffer);
    } finally {
      // Runs on success, early return and error alike, so neither the
      // candidate nor the browser page can leak.
      this._candidates.delete(pdfCandidate.pdfId);
      if (page) {
        await page.close().catch(() => {});
      }
    }
  }

  /**
   * Loads a website in a 1980x1200 viewport and prints it to PDF.
   */
  async getPdfResultForWebsite(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
    const page = await this.headlessBrowser.newPage();
    let pdfId: string;
    let nodePdfBuffer: Buffer;
    try {
      await page.setViewport({
        width: 1980,
        height: 1200,
      });
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      pdfId = plugins.smartunique.shortId();
      // NOTE(review): document.height / document.width are non-standard and
      // presumably undefined in current Chromium, in which case page.pdf()
      // falls back to its default page size - confirm the intended sizing.
      const { documentHeight, documentWidth } = await page.evaluate(() => {
        return {
          documentHeight: document.height,
          documentWidth: document.width,
        };
      });
      const pdfBuffer = await page.pdf({
        height: documentHeight,
        width: documentWidth,
        printBackground: true,
        displayHeaderFooter: false,
      });
      // Convert Uint8Array to Node Buffer
      nodePdfBuffer = Buffer.from(pdfBuffer);
    } finally {
      await page.close().catch(() => {});
    }
    return this.buildPdfResult(pdfId, nodePdfBuffer);
  }

  /**
   * Loads a website in a 1920px wide viewport and prints the full document
   * height as the first (single) page of a PDF.
   */
  async getFullWebsiteAsSinglePdf(websiteUrl: string): Promise<plugins.tsclass.business.IPdf> {
    const page = await this.headlessBrowser.newPage();
    let pdfId: string;
    let nodePdfBuffer: Buffer;
    try {
      await page.setViewport({
        width: 1920,
        height: 1200,
      });
      await page.emulateMediaType('screen');
      await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
      pdfId = plugins.smartunique.shortId();
      // Use both document.body and document.documentElement to ensure we have a valid height.
      const documentHeight: number = await page.evaluate(() => {
        return (
          Math.max(document.body.scrollHeight, document.documentElement.scrollHeight) || 1200
        );
      });
      // Update viewport height to the full document height.
      await page.setViewport({
        width: 1920,
        height: documentHeight,
      });
      const pdfBuffer = await page.pdf({
        height: documentHeight,
        width: 1920,
        printBackground: true,
        displayHeaderFooter: false,
        scale: 1,
        pageRanges: '1',
      });
      // Convert Uint8Array to Node Buffer
      nodePdfBuffer = Buffer.from(pdfBuffer);
    } finally {
      await page.close().catch(() => {});
    }
    return this.buildPdfResult(pdfId, nodePdfBuffer);
  }

  /**
   * Concatenates the pages of all given PDFs, in input order, into one PDF.
   */
  public async mergePdfs(inputPdfBuffers: Uint8Array[]): Promise<Uint8Array> {
    const mergedPdf = await plugins.pdfLib.PDFDocument.create();
    for (const pdfBytes of inputPdfBuffers) {
      const sourcePdf = await plugins.pdfLib.PDFDocument.load(pdfBytes);
      const copiedPages = await mergedPdf.copyPages(sourcePdf, sourcePdf.getPageIndices());
      for (const copiedPage of copiedPages) {
        mergedPdf.addPage(copiedPage);
      }
    }
    return mergedPdf.save();
  }

  /**
   * Reads a file from disk into an IPdf object. `id` and `metadata` are null;
   * `name` is the file's base name.
   */
  public async readFileToPdfObject(pathArg: string): Promise<plugins.tsclass.business.IPdf> {
    const absolutePath = plugins.smartpath.transform.makeAbsolute(pathArg);
    const parsedPath = plugins.path.parse(absolutePath);
    const smartfsInstance = new plugins.smartfs.SmartFs(new plugins.smartfs.SmartFsProviderNode());
    const fileContent = await smartfsInstance.file(absolutePath).read();
    const buffer = Buffer.from(fileContent);
    return {
      name: parsedPath.base,
      buffer,
      id: null,
      metadata: null,
    };
  }

  /**
   * Extracts the text runs of all pages of a PDF and concatenates them.
   *
   * @returns the concatenated `T` values of every text run, in document order
   * @throws rejects when the parser reports `pdfParser_dataError` or the parsed data cannot be walked
   */
  public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
    const deferred = plugins.smartpromise.defer<string>();
    const pdfParser: any = new plugins.pdf2json();
    // Without an error handler a broken PDF would leave the promise pending forever.
    pdfParser.on('pdfParser_dataError', (errData: any) => {
      const cause = errData?.parserError ?? errData;
      deferred.reject(
        cause instanceof Error ? cause : new Error(`Failed to parse PDF buffer: ${String(cause)}`)
      );
    });
    pdfParser.on('pdfParser_dataReady', (pdfData: any) => {
      try {
        let finalText = '';
        for (const page of pdfData.Pages) {
          for (const text of page.Texts) {
            for (const letter of text.R) {
              finalText = finalText + letter.T;
            }
          }
        }
        deferred.resolve(finalText);
      } catch (err) {
        deferred.reject(err);
      }
    });
    pdfParser.parseBuffer(pdfBufferArg);
    return deferred.promise;
  }

  /**
   * Renders every page of a PDF onto a canvas inside a fresh browser page and
   * screenshots each canvas with the given screenshot options. The browser
   * page is always closed, whether rendering succeeds or fails.
   */
  private async renderPdfPagesToImages(
    pdfBytes: Uint8Array,
    renderOptions: { scale?: number; maxWidth?: number; maxHeight?: number },
    titleArg: string,
    screenshotOptions: plugins.smartpuppeteer.puppeteer.ScreenshotOptions
  ): Promise<Uint8Array[]> {
    // Build (and thereby validate) the document before a browser page is opened.
    const htmlContent = SmartPdf.buildPdfRenderHtml(
      Buffer.from(pdfBytes).toString('base64'),
      renderOptions,
      titleArg
    );
    const page = await this.headlessBrowser.newPage();
    try {
      await page.setContent(htmlContent, { waitUntil: 'networkidle0' });
      // Wait until the PDF.js rendering is complete.
      await page.waitForFunction(() => (window as any).renderComplete === true, { timeout: 30000 });
      // Each canvas element represents one rendered PDF page.
      const canvasElements = await page.$$('canvas');
      const imageBuffers: Uint8Array[] = [];
      for (const canvasElement of canvasElements) {
        const screenshotBuffer = (await canvasElement.screenshot(screenshotOptions)) as Buffer;
        imageBuffers.push(new Uint8Array(screenshotBuffer));
      }
      return imageBuffers;
    } finally {
      await page.close().catch(() => {});
    }
  }

  /**
   * Converts a PDF to PNG bytes for each page using Puppeteer and PDF.js.
   * This method creates a temporary HTML page that loads PDF.js from a CDN,
   * renders each PDF page to a canvas, and then screenshots each canvas element.
   *
   * @throws Error when scale, maxWidth or maxHeight is set but not a positive finite number
   */
  public async convertPDFToPngBytes(
    pdfBytes: Uint8Array,
    options: {
      scale?: number; // Scale factor for output size (default: 3.0 for 216 DPI)
      maxWidth?: number; // Maximum width in pixels (optional)
      maxHeight?: number; // Maximum height in pixels (optional)
    } = {}
  ): Promise<Uint8Array[]> {
    return this.renderPdfPagesToImages(pdfBytes, options, 'PDF to PNG Converter', {
      encoding: 'binary',
    });
  }

  /**
   * Converts a PDF to WebP bytes for each page.
   * Works like convertPDFToPngBytes, but screenshots each canvas as WebP.
   *
   * @throws Error when scale, maxWidth or maxHeight is set but not a positive finite number
   */
  public async convertPDFToWebpBytes(
    pdfBytes: Uint8Array,
    options: {
      scale?: number; // Scale factor for preview size (default: 3.0 for 216 DPI)
      quality?: number; // WebP quality 0-100 (default: 85)
      maxWidth?: number; // Maximum width in pixels (optional)
      maxHeight?: number; // Maximum height in pixels (optional)
    } = {}
  ): Promise<Uint8Array[]> {
    const quality = options.quality || 85;
    return this.renderPdfPagesToImages(pdfBytes, options, 'PDF to WebP Preview Converter', {
      type: 'webp',
      quality: quality,
      encoding: 'binary',
    });
  }

  /**
   * Converts a PDF to progressive JPEG bytes for each page.
   * Pages are first rendered to PNG via convertPDFToPngBytes and then
   * re-encoded as progressive JPEG with SmartJimp in sharp mode.
   */
  public async convertPDFToJpegBytes(
    pdfBytes: Uint8Array,
    options: {
      scale?: number; // Scale factor for output size (default: 3.0 for 216 DPI)
      quality?: number; // JPEG quality 0-100 (default: 85)
      maxWidth?: number; // Maximum width in pixels (optional)
      maxHeight?: number; // Maximum height in pixels (optional)
    } = {}
  ): Promise<Uint8Array[]> {
    // First, convert PDF to PNG using our existing method
    const pngBuffers = await this.convertPDFToPngBytes(pdfBytes, {
      scale: options.scale,
      maxWidth: options.maxWidth,
      maxHeight: options.maxHeight,
    });
    // Initialize SmartJimp in sharp mode for progressive JPEG support
    const smartJimpInstance = new plugins.smartjimp.SmartJimp({ mode: 'sharp' });
    const quality = options.quality || 85;
    const jpegBuffers: Uint8Array[] = [];
    for (const pngBuffer of pngBuffers) {
      const jpegBuffer = await smartJimpInstance.computeAssetVariation(Buffer.from(pngBuffer), {
        format: 'jpeg',
        progressive: true,
        quality,
      });
      jpegBuffers.push(new Uint8Array(jpegBuffer));
    }
    return jpegBuffers;
  }
}