fix(SmartPdf): Fix buffer handling for PDF conversion and text extraction

This commit is contained in:
Philipp Kunz 2025-02-25 18:22:06 +00:00
parent 29d3cbb0b6
commit 9908897aa2
3 changed files with 32 additions and 11 deletions

View File

@@ -1,5 +1,12 @@
# Changelog
## 2025-02-25 - 3.2.2 - fix(SmartPdf)
Fix buffer handling for PDF conversion and text extraction
- Convert the generated PDF's Uint8Array to a Node.js Buffer before extracting text and returning the result.
- Correct the PDF page viewport handling by measuring both document.body and document.documentElement, with fallback dimensions when neither reports a size.
- Change the extractTextFromPdfBuffer parameter type from Uint8Array to Buffer.
## 2025-02-25 - 3.2.1 - fix(SmartPdf)
Fix type for extractTextFromPdfBuffer function

View File

@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: '@push.rocks/smartpdf',
version: '3.2.1',
version: '3.2.2',
description: 'A library for creating PDFs dynamically from HTML or websites with additional features like merging PDFs.'
}

View File

@@ -35,7 +35,7 @@ export class SmartPdf {
this.externalBrowserBool = true;
} else {
this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
forceNoSandbox: true,
forceNoSandbox: false,
});
}
@@ -104,6 +104,8 @@ export class SmartPdf {
printBackground: true,
displayHeaderFooter: false,
});
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close();
delete this._candidates[pdfCandidate.pdfId];
pdfCandidate.doneDeferred.resolve();
@@ -112,9 +114,9 @@ export class SmartPdf {
id: pdfCandidate.pdfId,
name: `${pdfCandidate.pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
},
buffer: pdfBuffer,
buffer: nodePdfBuffer,
};
}
@@ -139,14 +141,16 @@ export class SmartPdf {
printBackground: true,
displayHeaderFooter: false,
});
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close();
return {
id: pdfId,
name: `${pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
},
buffer: pdfBuffer,
buffer: nodePdfBuffer,
};
}
@@ -159,12 +163,20 @@ export class SmartPdf {
await page.emulateMediaType('screen');
const response = await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
const pdfId = plugins.smartunique.shortId();
// Use both document.body and document.documentElement to ensure we have a valid height and width.
const { documentHeight, documentWidth } = await page.evaluate(() => {
return {
documentHeight: document.body.scrollHeight,
documentWidth: document.body.clientWidth,
documentHeight: Math.max(
document.body.scrollHeight,
document.documentElement.scrollHeight
) || 1200,
documentWidth: Math.max(
document.body.clientWidth,
document.documentElement.clientWidth
) || 1920,
};
});
// Update viewport height to the full document height.
await page.setViewport({
width: 1920,
height: documentHeight,
@@ -177,14 +189,16 @@ export class SmartPdf {
scale: 1,
pageRanges: '1',
});
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close();
return {
id: pdfId,
name: `${pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
},
buffer: pdfBuffer,
buffer: nodePdfBuffer,
};
}
@@ -212,7 +226,7 @@ export class SmartPdf {
};
}
public async extractTextFromPdfBuffer(pdfBufferArg: Uint8Array): Promise<string> {
public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
const deferred = plugins.smartpromise.defer<string>();
const pdfParser: any = new plugins.pdf2json();
pdfParser.on('pdfParser_dataReady', (pdfData: any) => {