Compare commits

...

19 Commits

8 changed files with 9427 additions and 3316 deletions

View File

@ -119,6 +119,6 @@ jobs:
run: |
npmci node install stable
npmci npm install
pnpm install -g @gitzone/tsdoc
pnpm install -g @git.zone/tsdoc
npmci command tsdoc
continue-on-error: true

131
changelog.md Normal file
View File

@ -0,0 +1,131 @@
# Changelog
## 2025-02-25 - 3.2.2 - fix(SmartPdf)
Fix buffer handling for PDF conversion and text extraction
- Ensure Uint8Array is converted to Node Buffer for PDF conversion.
- Correct the PDF page viewport handling by using document dimensions.
- Fix extractTextFromPdfBuffer argument type from Uint8Array to Buffer.
## 2025-02-25 - 3.2.1 - fix(SmartPdf)
Fix type for extractTextFromPdfBuffer function
- Corrected the parameter type from Buffer to Uint8Array for extractTextFromPdfBuffer function.
## 2025-02-25 - 3.2.0 - feat(smartpdf)
Improve dependency versions and optimize PDF to PNG conversion.
- Update several dependencies to newer versions for better stability and performance.
- Refactor tests to enhance readability and add directory creation validations.
- Optimize PDF to PNG conversion by switching to a more efficient Puppeteer and PDF.js-based method.
- Add checks for presence of required dependencies (GraphicsMagick and Ghostscript).
- Fix media emulation issue by properly awaiting the emulateMediaType function.
## 2024-11-30 - 3.1.8 - fix(core)
Fix candidate handling in PDF generation
- Added error handling for missing PDF candidates in server requests.
- Updated devDependencies and dependencies to latest versions for better stability and new features.
- Patched header retrieval logic during PDF generation for security check.
## 2024-09-27 - 3.1.7 - fix(dependencies)
Update dependencies to latest versions
- Updated @git.zone/tsbuild to version ^2.1.84
- Updated @git.zone/tsdoc to version ^1.3.12
- Updated @git.zone/tsrun to version ^1.2.49
- Updated @push.rocks/tapbundle to version ^5.3.0
- Updated @types/node to version ^22.7.4
- Updated @push.rocks/smartfile to version ^11.0.21
- Updated @push.rocks/smartpromise to version ^4.0.4
- Updated @tsclass/tsclass to version ^4.1.2
- Updated express to version ^4.21.0
- Updated pdf2pic to version ^3.1.3
## 2024-05-29 - 3.1.6 - Core
Updated description
- Minor changes to documentation and internal text.
## 2024-04-25 to 2024-04-30 - 3.1.0 to 3.1.5 - Core
Fix updates in core functionality
- Fixes and updates in core function in versions 3.1.0 to 3.1.5.
## 2024-04-25 - 3.0.17 - Feature
Now supports PDF to JPG conversion
- Added support for converting PDF files to JPG format.
## 2024-03-19 to 2024-04-14 - 3.0.17 - Maintenance
Various updates to project configuration files
- Updated `tsconfig`.
- Updated `npmextra.json`.
## 2023-07-11 to 2024-03-19 - 3.0.15 to 3.0.16 - Organization
Switch to new organization scheme and core updates
- Switched to new organization scheme.
- Applied core updates and bug fixes.
## 2022-11-07 to 2023-07-10 - 3.0.13 to 3.0.14 - Core
Fixes and updates to core functionality
- Various minor bug fixes and updates to core components.
## 2022-09-13 to 2022-11-07 - 3.0.10 to 3.0.12 - Core
Ongoing core updates and maintenance
- Regular fixes and operational improvements in core functionalities.
## 2022-06-12 to 2022-09-13 - 3.0.7 to 3.0.9 - Core
Continued focus on high-priority bug fixes and core functionalities
- Regular fixes for critical bugs and enhancements.
## 2022-03-24 to 2022-06-29 - 3.0.3 to 3.0.6 - Core
Further optimization and maintenance releases
- Further improvements and refinements of issues in core functionalities.
## 2022-01-05 to 2022-03-25 - 3.0.0 to 3.0.2 - Major Version Release
Major release for version 3.0.x, including core fixes
- Increased version from 2.x to 3.0. New significant changes and fixes.
## 2022-01-05 to 2022-03-24 - 2.0.13 to 2.0.19 - Core
Routine core updates and bug fixes
- Regular bug fixes in core components.
## 2019-11-19 to 2022-01-06 - 2.0.0 to 2.0.11 - Core
Multiple core updates and a few performance improvements
- Some performance enhancements and multiple bug fixes.
## 2019-11-16 to 2019-11-19 - 1.0.27 to 1.0.29 - API
Breaking change in API
- Naming PDF results to better represent their content.
## 2019-05-29 to 2019-11-15 - 1.0.13 to 1.0.26 - Core
Core functional updates and some major restructuring
- Introduced multiple updates to the core, addressing bugs and improving stability.
## 2019-04-10 to 2019-05-28 - 1.0.4 to 1.0.12 - Core
Fixes and updates in the core
- Implementation of multiple essential fixes for core components.
## 2018-10-06 - 1.0.1 to 1.0.3 - Core and Typings
Initial implementation and core fixes
- Initial implementation of the project.
- Fixed compilation problems in typings.
## 2016-01-29 - unknown - Initial
Initial commit
- Initial commit for the project setup.

View File

@ -1,6 +1,6 @@
{
"name": "@push.rocks/smartpdf",
"version": "3.1.2",
"version": "3.2.2",
"private": false,
"description": "A library for creating PDFs dynamically from HTML or websites with additional features like merging PDFs.",
"main": "dist_ts/index.js",
@ -14,28 +14,27 @@
"buildDocs": "tsdoc"
},
"devDependencies": {
"@gitzone/tsbuild": "^2.1.66",
"@gitzone/tsdoc": "^1.1.12",
"@gitzone/tsrun": "^1.2.44",
"@gitzone/tstest": "^1.0.77",
"@push.rocks/tapbundle": "^5.0.23",
"@types/node": "^20.12.7"
"@git.zone/tsbuild": "^2.2.1",
"@git.zone/tsdoc": "^1.4.3",
"@git.zone/tsrun": "^1.3.3",
"@git.zone/tstest": "^1.0.96",
"@push.rocks/tapbundle": "^5.5.6",
"@types/node": "^22.13.5"
},
"dependencies": {
"@push.rocks/smartbuffer": "^3.0.4",
"@push.rocks/smartdelay": "^3.0.5",
"@push.rocks/smartfile": "^11.0.14",
"@push.rocks/smartfile": "^11.2.0",
"@push.rocks/smartnetwork": "^3.0.0",
"@push.rocks/smartpath": "^5.0.18",
"@push.rocks/smartpromise": "^4.0.3",
"@push.rocks/smartpuppeteer": "^2.0.2",
"@push.rocks/smartpromise": "^4.2.3",
"@push.rocks/smartpuppeteer": "^2.0.5",
"@push.rocks/smartunique": "^3.0.9",
"@tsclass/tsclass": "^4.0.54",
"@types/express": "^4.17.21",
"express": "^4.19.2",
"@tsclass/tsclass": "^4.4.0",
"@types/express": "^5.0.0",
"express": "^4.21.2",
"pdf-lib": "^1.17.1",
"pdf2json": "^3.0.5",
"pdf2pic": "^3.1.1"
"pdf2json": "3.1.5"
},
"files": [
"ts/**/*",
@ -65,5 +64,10 @@
"PDF merging",
"text extraction",
"PDF management"
]
],
"homepage": "https://code.foss.global/push.rocks/smartpdf",
"repository": {
"type": "git",
"url": "https://code.foss.global/push.rocks/smartpdf.git"
}
}

12298
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@ -1,66 +1,85 @@
import { expect, tap } from '@push.rocks/tapbundle';
import * as smartpdf from '../ts/index.js';
import * as fs from 'fs';
import * as path from 'path';
let testSmartPdf: smartpdf.SmartPdf;
tap.test('should create a valid instance of smartpdf', async () => {
/**
* Ensures that a directory exists.
* @param dirPath - The directory path to ensure.
*/
function ensureDir(dirPath: string): void {
if (!fs.existsSync(dirPath)) {
fs.mkdirSync(dirPath, { recursive: true });
}
}
tap.test('should create a valid instance of SmartPdf', async () => {
testSmartPdf = new smartpdf.SmartPdf();
expect(testSmartPdf).toBeInstanceOf(smartpdf.SmartPdf);
});
tap.test('should start the instance', async () => {
tap.test('should start the SmartPdf instance', async () => {
await testSmartPdf.start();
});
tap.test('should create a pdf from html string', async () => {
await testSmartPdf.getA4PdfResultForHtmlString('hi');
tap.test('should create PDFs from HTML string', async () => {
const pdf1 = await testSmartPdf.getA4PdfResultForHtmlString('hi');
const pdf2 = await testSmartPdf.getA4PdfResultForHtmlString('hello');
expect(pdf1.buffer).toBeInstanceOf(Buffer);
expect(pdf2.buffer).toBeInstanceOf(Buffer);
});
tap.test('should create a pdf from html string', async () => {
await testSmartPdf.getA4PdfResultForHtmlString('hi');
tap.test('should create PDFs from websites', async () => {
const pdfA4 = await testSmartPdf.getPdfResultForWebsite('https://www.wikipedia.org');
const pdfSingle = await testSmartPdf.getFullWebsiteAsSinglePdf('https://www.wikipedia.org');
expect(pdfA4.buffer).toBeInstanceOf(Buffer);
expect(pdfSingle.buffer).toBeInstanceOf(Buffer);
});
tap.test('should create a pdf from website as A4', async () => {
await testSmartPdf.getPdfResultForWebsite('https://www.wikipedia.org');
});
tap.test('should create a pdf from website as single page PDF', async () => {
await testSmartPdf.getFullWebsiteAsSinglePdf('https://www.wikipedia.org');
});
tap.test('should create a valid PDFResult', async () => {
const writePDfToDisk = async (urlArg: string, fileName: string) => {
tap.test('should create valid PDF results and write them to disk', async () => {
const writePdfToDisk = async (urlArg: string, fileName: string) => {
const pdfResult = await testSmartPdf.getFullWebsiteAsSinglePdf(urlArg);
expect(pdfResult.buffer).toBeInstanceOf(Buffer);
const fs = await import('fs');
if (!fs.existsSync('.nogit/')) {
fs.mkdirSync('.nogit/');
}
fs.writeFileSync(`.nogit/${fileName}`, pdfResult.buffer as Buffer);
ensureDir('.nogit');
fs.writeFileSync(path.join('.nogit', fileName), pdfResult.buffer as Buffer);
};
await writePDfToDisk('https://lossless.com/', '1.pdf');
await writePDfToDisk('https://layer.io', '2.pdf');
await writePdfToDisk('https://lossless.com/', '1.pdf');
await writePdfToDisk('https://layer.io', '2.pdf');
});
tap.test('should merge pdfs', async () => {
const fs = await import('fs');
tap.test('should merge PDFs into a combined PDF', async () => {
const pdf1 = await testSmartPdf.readFileToPdfObject('.nogit/1.pdf');
const pdf2 = await testSmartPdf.readFileToPdfObject('.nogit/2.pdf');
fs.writeFileSync(
`.nogit/combined.pdf`,
await testSmartPdf.mergePdfs([pdf1.buffer, pdf2.buffer])
);
const mergedBuffer = await testSmartPdf.mergePdfs([pdf1.buffer, pdf2.buffer]);
ensureDir('.nogit');
fs.writeFileSync(path.join('.nogit', 'combined.pdf'), mergedBuffer);
});
tap.test('should create images from an pdf', async () => {
tap.test('should create PNG images from combined PDF using Puppeteer conversion', async () => {
const pdfObject = await testSmartPdf.readFileToPdfObject('.nogit/combined.pdf');
const images = await testSmartPdf.convertPDFToJPGBytes(pdfObject.buffer);
console.log(images);
const images = await testSmartPdf.convertPDFToPngBytes(pdfObject.buffer);
expect(images.length).toBeGreaterThan(0);
console.log('Puppeteer-based conversion image sizes:', images.map(img => img.length));
});
tap.test('should be able to close properly', async () => {
tap.test('should store PNG results from both conversion functions in .nogit/testresults', async () => {
const testResultsDir = path.join('.nogit', 'testresults');
ensureDir(testResultsDir);
const pdfObject = await testSmartPdf.readFileToPdfObject('.nogit/combined.pdf');
// Convert using Puppeteer-based function and store images
const imagesPuppeteer = await testSmartPdf.convertPDFToPngBytes(pdfObject.buffer);
imagesPuppeteer.forEach((img, index) => {
const filePath = path.join(testResultsDir, `puppeteer_method_page_${index + 1}.png`);
fs.writeFileSync(filePath, Buffer.from(img));
});
});
tap.test('should close the SmartPdf instance properly', async () => {
await testSmartPdf.stop();
});
tap.start();
tap.start();

View File

@ -1,8 +1,8 @@
/**
* autocreated commitinfo by @pushrocks/commitinfo
* autocreated commitinfo by @push.rocks/commitinfo
*/
export const commitinfo = {
name: '@push.rocks/smartpdf',
version: '3.1.2',
version: '3.2.2',
description: 'A library for creating PDFs dynamically from HTML or websites with additional features like merging PDFs.'
}

View File

@ -3,6 +3,7 @@ import * as paths from './smartpdf.paths.js';
import { Server } from 'http';
import { PdfCandidate } from './smartpdf.classes.pdfcandidate.js';
import { type IPdf } from '@tsclass/tsclass/dist_ts/business/pdf.js';
import { execFile } from 'child_process';
declare const document: any;
@ -34,15 +35,20 @@ export class SmartPdf {
this.externalBrowserBool = true;
} else {
this.headlessBrowser = await plugins.smartpuppeteer.getEnvAwareBrowserInstance({
forceNoSandbox: true,
forceNoSandbox: false,
});
}
// setup server
const app = plugins.express();
app.get('/:pdfId', (req, res) => {
res.setHeader('PDF-ID', this._candidates[req.params.pdfId].pdfId);
res.send(this._candidates[req.params.pdfId].htmlString);
const wantedCandidate = this._candidates[req.params.pdfId];
if (!wantedCandidate) {
console.log(`${req.url} not attached to a candidate`);
return;
}
res.setHeader('pdf-id', wantedCandidate.pdfId);
res.send(wantedCandidate.htmlString);
});
this.htmlServerInstance = plugins.http.createServer(app);
const smartnetworkInstance = new plugins.smartnetwork.SmartNetwork();
@ -70,7 +76,7 @@ export class SmartPdf {
}
/**
* returns a pdf for a given html string;
* Returns a PDF for a given HTML string.
*/
async getA4PdfResultForHtmlString(htmlStringArg: string): Promise<plugins.tsclass.business.IPdf> {
await this._readyDeferred.promise;
@ -84,7 +90,6 @@ export class SmartPdf {
const response = await page.goto(`http://localhost:3210/${pdfCandidate.pdfId}`, {
waitUntil: 'networkidle2',
});
// await plugins.smartdelay.delayFor(1000);
const headers = response.headers();
if (headers['pdf-id'] !== pdfCandidate.pdfId) {
console.log('Error! Headers do not match. For security reasons no pdf is being emitted!');
@ -99,6 +104,8 @@ export class SmartPdf {
printBackground: true,
displayHeaderFooter: false,
});
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close();
delete this._candidates[pdfCandidate.pdfId];
pdfCandidate.doneDeferred.resolve();
@ -107,9 +114,9 @@ export class SmartPdf {
id: pdfCandidate.pdfId,
name: `${pdfCandidate.pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
},
buffer: pdfBuffer,
buffer: nodePdfBuffer,
};
}
@ -134,14 +141,16 @@ export class SmartPdf {
printBackground: true,
displayHeaderFooter: false,
});
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close();
return {
id: pdfId,
name: `${pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
},
buffer: pdfBuffer,
buffer: nodePdfBuffer,
};
}
@ -151,15 +160,23 @@ export class SmartPdf {
width: 1920,
height: 1200,
});
page.emulateMediaType('screen');
await page.emulateMediaType('screen');
const response = await page.goto(websiteUrl, { waitUntil: 'networkidle2' });
const pdfId = plugins.smartunique.shortId();
// Use both document.body and document.documentElement to ensure we have a valid height and width.
const { documentHeight, documentWidth } = await page.evaluate(() => {
return {
documentHeight: document.body.scrollHeight,
documentWidth: document.body.clientWidth,
documentHeight: Math.max(
document.body.scrollHeight,
document.documentElement.scrollHeight
) || 1200,
documentWidth: Math.max(
document.body.clientWidth,
document.documentElement.clientWidth
) || 1920,
};
});
// Update viewport height to the full document height.
await page.setViewport({
width: 1920,
height: documentHeight,
@ -172,14 +189,16 @@ export class SmartPdf {
scale: 1,
pageRanges: '1',
});
// Convert Uint8Array to Node Buffer
const nodePdfBuffer = Buffer.from(pdfBuffer);
await page.close();
return {
id: pdfId,
name: `${pdfId}.js`,
metadata: {
textExtraction: await this.extractTextFromPdfBuffer(pdfBuffer),
textExtraction: await this.extractTextFromPdfBuffer(nodePdfBuffer),
},
buffer: pdfBuffer,
buffer: nodePdfBuffer,
};
}
@ -196,9 +215,9 @@ export class SmartPdf {
}
public async readFileToPdfObject(pathArg: string): Promise<plugins.tsclass.business.IPdf> {
const path = plugins.smartpath.transform.makeAbsolute(pathArg);
const parsedPath = plugins.path.parse(path);
const buffer = await plugins.smartfile.fs.toBuffer(path);
const absolutePath = plugins.smartpath.transform.makeAbsolute(pathArg);
const parsedPath = plugins.path.parse(absolutePath);
const buffer = await plugins.smartfile.fs.toBuffer(absolutePath);
return {
name: parsedPath.base,
buffer,
@ -225,38 +244,109 @@ export class SmartPdf {
return deferred.promise;
}
public async convertPDFToJPGBytes(
pdfBytes: Uint8Array,
options: {
width?: number;
height?: number;
quality?: number;
} = {}
): Promise<Uint8Array[]> {
const { width = 1024, height = 768, quality = 100 } = options;
/**
* Checks for the presence of required dependencies: GraphicsMagick and Ghostscript.
*/
private async checkDependencies(): Promise<void> {
await Promise.all([
this.checkCommandExists('gm', ['version']),
this.checkCommandExists('gs', ['--version']),
]);
}
// Load the PDF document
const pdfDoc = await plugins.pdfLib.PDFDocument.load(pdfBytes);
const converter = plugins.pdf2pic.fromBuffer(Buffer.from(pdfBytes), {
density: 100, // Image density (DPI)
format: 'jpg', // Image format
width, // Output image width
height, // Output image height
quality, // Output image quality
/**
* Checks if a given command exists by trying to execute it.
*/
private checkCommandExists(command: string, args: string[]): Promise<void> {
return new Promise((resolve, reject) => {
execFile(command, args, (error, stdout, stderr) => {
if (error) {
reject(new Error(`Dependency check failed: ${command} is not installed or not in the PATH. ${error.message}`));
} else {
resolve();
}
});
});
}
// Get array promises that resolve to JPG buffers
const imagePromises: Promise<Buffer>[] = [];
const numPages = pdfDoc.getPageCount();
/**
* Converts a PDF to PNG bytes for each page using Puppeteer and PDF.js.
* This method creates a temporary HTML page that loads PDF.js from a CDN,
* renders each PDF page to a canvas, and then screenshots each canvas element.
*/
public async convertPDFToPngBytes(
pdfBytes: Uint8Array,
options: { width?: number; height?: number; quality?: number } = {}
): Promise<Uint8Array[]> {
// Note: options.width, options.height, and options.quality are not applied here,
// as the rendered canvas size is determined by the PDF page dimensions.
for (let i = 0; i < numPages; i++) {
imagePromises.push(converter(i + 1, {
responseType: 'buffer',
}).then((output) => output.buffer));
// Create a new page using the headless browser.
const page = await this.headlessBrowser.newPage();
// Prepare PDF data as a base64 string.
const base64Pdf: string = Buffer.from(pdfBytes).toString('base64');
// HTML template that loads PDF.js and renders the PDF.
const htmlTemplate: string = `
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>PDF to PNG Converter</title>
<style>
body { margin: 0; }
canvas { display: block; margin: 10px auto; }
</style>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.min.js"></script>
</head>
<body>
<script>
(async function() {
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.16.105/pdf.worker.min.js';
const pdfData = "__PDF_DATA__";
const raw = atob(pdfData);
const pdfArray = new Uint8Array([...raw].map(c => c.charCodeAt(0)));
const loadingTask = pdfjsLib.getDocument({data: pdfArray});
const pdf = await loadingTask.promise;
const numPages = pdf.numPages;
for (let pageNum = 1; pageNum <= numPages; pageNum++) {
const page = await pdf.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0 });
const canvas = document.createElement('canvas');
const context = canvas.getContext('2d');
canvas.width = viewport.width;
canvas.height = viewport.height;
await page.render({ canvasContext: context, viewport: viewport }).promise;
document.body.appendChild(canvas);
}
window.renderComplete = true;
})();
</script>
</body>
</html>
`;
// Replace the placeholder with the actual base64 PDF data.
const htmlContent: string = htmlTemplate.replace("__PDF_DATA__", base64Pdf);
// Set the page content.
await page.setContent(htmlContent, { waitUntil: 'networkidle0' });
// Wait until the PDF.js rendering is complete.
await page.waitForFunction(() => (window as any).renderComplete === true, { timeout: 30000 });
// Query all canvas elements (each representing a rendered PDF page).
const canvasElements = await page.$$('canvas');
const pngBuffers: Uint8Array[] = [];
for (const canvasElement of canvasElements) {
// Screenshot the canvas element. The screenshot will be a PNG buffer.
const screenshotBuffer = (await canvasElement.screenshot({ encoding: 'binary' })) as Buffer;
pngBuffers.push(new Uint8Array(screenshotBuffer));
}
// Resolve all promises and return the array of buffers
return Promise.all(imagePromises);
await page.close();
return pngBuffers;
}
}
}

View File

@ -33,7 +33,6 @@ export { tsclass };
// thirdparty
import express from 'express';
import pdf2json from 'pdf2json';
import pdf2pic from 'pdf2pic';
import pdfLib from 'pdf-lib';
export { express, pdf2json, pdf2pic, pdfLib, };
export { express, pdf2json, pdfLib, };