fix(SmartPdf): Fix buffer handling for PDF conversion and text extraction

This commit is contained in:
Philipp Kunz 2025-02-25 18:22:06 +00:00
parent 29d3cbb0b6
commit 9908897aa2
3 changed files with 32 additions and 11 deletions

View File

@ -1,5 +1,12 @@
# Changelog
## 2025-02-25 - 3.2.2 - fix(SmartPdf)
Fix buffer handling for PDF conversion and text extraction
- Ensure Uint8Array is converted to Node Buffer for PDF conversion.
- Correct the PDF page viewport handling by using document dimensions.
- Fix extractTextFromPdfBuffer argument type from Uint8Array to Buffer.
- Change `forceNoSandbox` from `true` to `false` when obtaining the env-aware browser instance.
## 2025-02-25 - 3.2.1 - fix(SmartPdf)
Fix type for extractTextFromPdfBuffer function

View File

@ -3,6 +3,6 @@
*/ */
export const commitinfo = { export const commitinfo = {
name: '@push.rocks/smartpdf', name: '@push.rocks/smartpdf',
version: '3.2.1', version: '3.2.2',
description: 'A library for creating PDFs dynamically from HTML or websites with additional features like merging PDFs.' description: 'A library for creating PDFs dynamically from HTML or websites with additional features like merging PDFs.'
} }

View File

@ -35,7 +35,7 @@ export class SmartPdf {
this.externalBrowserBool = true; this.externalBrowserBool = true;
} else { } else {
this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({ this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
forceNoSandbox: true, forceNoSandbox: false,
}); });
} }
@ -104,6 +104,8 @@ export class SmartPdf {
printBackground: true, printBackground: true,
displayHeaderFooter: false, displayHeaderFooter: false,
}); });
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close(); await page.close();
delete this._candidates[pdfCandidate.pdfId]; delete this._candidates[pdfCandidate.pdfId];
pdfCandidate.doneDeferred.resolve(); pdfCandidate.doneDeferred.resolve();
@ -112,9 +114,9 @@ export class SmartPdf {
id: pdfCandidate.pdfId, id: pdfCandidate.pdfId,
name: `${pdfCandidate.pdfId}.js`, name: `${pdfCandidate.pdfId}.js`,
metadata: { metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer), textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
}, },
buffer: pdfBuffer, buffer: nodePdfBuffer,
}; };
} }
@ -139,14 +141,16 @@ export class SmartPdf {
printBackground: true, printBackground: true,
displayHeaderFooter: false, displayHeaderFooter: false,
}); });
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close(); await page.close();
return { return {
id: pdfId, id: pdfId,
name: `${pdfId}.js`, name: `${pdfId}.js`,
metadata: { metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer), textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
}, },
buffer: pdfBuffer, buffer: nodePdfBuffer,
}; };
} }
@ -159,12 +163,20 @@ export class SmartPdf {
await page.emulateMediaType('screen'); await page.emulateMediaType('screen');
const response = await page.goto(websiteUrl, { waitUntil: 'networkidle2' }); const response = await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
const pdfId = plugins.smartunique.shortId(); const pdfId = plugins.smartunique.shortId();
// Use both document.body and document.documentElement to ensure we have a valid height and width.
const { documentHeight, documentWidth } = await page.evaluate(() => { const { documentHeight, documentWidth } = await page.evaluate(() => {
return { return {
documentHeight: document.body.scrollHeight, documentHeight: Math.max(
documentWidth: document.body.clientWidth, document.body.scrollHeight,
document.documentElement.scrollHeight
) || 1200,
documentWidth: Math.max(
document.body.clientWidth,
document.documentElement.clientWidth
) || 1920,
}; };
}); });
// Update viewport height to the full document height.
await page.setViewport({ await page.setViewport({
width: 1920, width: 1920,
height: documentHeight, height: documentHeight,
@ -177,14 +189,16 @@ export class SmartPdf {
scale: 1, scale: 1,
pageRanges: '1', pageRanges: '1',
}); });
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close(); await page.close();
return { return {
id: pdfId, id: pdfId,
name: `${pdfId}.js`, name: `${pdfId}.js`,
metadata: { metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer), textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
}, },
buffer: pdfBuffer, buffer: nodePdfBuffer,
}; };
} }
@ -212,7 +226,7 @@ export class SmartPdf {
}; };
} }
public async extractTextFromPdfBuffer(pdfBufferArg: Uint8Array): Promise<string> { public async extractTextFromPdfBuffer(pdfBufferArg: Buffer): Promise<string> {
const deferred = plugins.smartpromise.defer<string>(); const deferred = plugins.smartpromise.defer<string>();
const pdfParser: any = new plugins.pdf2json(); const pdfParser: any = new plugins.pdf2json();
pdfParser.on('pdfParser_dataReady', (pdfData: any) => { pdfParser.on('pdfParser_dataReady', (pdfData: any) => {