feat(HandelsRegister): Add file download functionality to HandelsRegister

This commit is contained in:
Philipp Kunz 2025-01-04 02:27:53 +01:00
parent 7c07bc59e4
commit c177193438
7 changed files with 229 additions and 160 deletions

View File

@ -1,5 +1,13 @@
# Changelog # Changelog
## 2025-01-04 - 1.4.0 - feat(HandelsRegister)
Add file download functionality to HandelsRegister
- Implemented file download feature in the HandelsRegister class.
- Configured pages in Puppeteer to allow downloads and set download paths.
- Parsed German registration information with more robust error handling.
- Added specific methods for downloading and handling 'SI' and 'AD' files.
## 2025-01-03 - 1.3.1 - fix(HandelsRegister) ## 2025-01-03 - 1.3.1 - fix(HandelsRegister)
Refined HandelsRegister functionality for better error handling and response capture. Refined HandelsRegister functionality for better error handling and response capture.

View File

@ -22,6 +22,7 @@
"@types/node": "^22.10.4" "@types/node": "^22.10.4"
}, },
"dependencies": { "dependencies": {
"@push.rocks/lik": "^6.1.0",
"@push.rocks/qenv": "^6.1.0", "@push.rocks/qenv": "^6.1.0",
"@push.rocks/smartarchive": "^4.0.39", "@push.rocks/smartarchive": "^4.0.39",
"@push.rocks/smartbrowser": "^2.0.8", "@push.rocks/smartbrowser": "^2.0.8",
@ -32,6 +33,7 @@
"@push.rocks/smartpromise": "^4.0.4", "@push.rocks/smartpromise": "^4.0.4",
"@push.rocks/smartrequest": "^2.0.23", "@push.rocks/smartrequest": "^2.0.23",
"@push.rocks/smartstream": "^3.2.5", "@push.rocks/smartstream": "^3.2.5",
"@push.rocks/smartunique": "^3.0.9",
"@tsclass/tsclass": "^4.2.0" "@tsclass/tsclass": "^4.2.0"
}, },
"repository": { "repository": {

6
pnpm-lock.yaml generated
View File

@ -8,6 +8,9 @@ importers:
.: .:
dependencies: dependencies:
'@push.rocks/lik':
specifier: ^6.1.0
version: 6.1.0
'@push.rocks/qenv': '@push.rocks/qenv':
specifier: ^6.1.0 specifier: ^6.1.0
version: 6.1.0 version: 6.1.0
@ -38,6 +41,9 @@ importers:
'@push.rocks/smartstream': '@push.rocks/smartstream':
specifier: ^3.2.5 specifier: ^3.2.5
version: 3.2.5 version: 3.2.5
'@push.rocks/smartunique':
specifier: ^3.0.9
version: 3.0.9
'@tsclass/tsclass': '@tsclass/tsclass':
specifier: ^4.2.0 specifier: ^4.2.0
version: 4.2.0 version: 4.2.0

View File

@ -3,6 +3,6 @@
*/ */
export const commitinfo = { export const commitinfo = {
name: '@fin.cx/opendata', name: '@fin.cx/opendata',
version: '1.3.1', version: '1.4.0',
description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.' description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.'
} }

View File

@ -1,13 +1,17 @@
import type { BusinessRecord } from './classes.businessrecord.js'; import type { BusinessRecord } from './classes.businessrecord.js';
import type { OpenData } from './classes.main.opendata.js'; import type { OpenData } from './classes.main.opendata.js';
import * as plugins from './plugins.js'; import * as plugins from './plugins.js';
import * as paths from './paths.js';
/** /**
* the HandelsRegister exposed as a class * the HandelsRegister exposed as a class
*/ */
export class HandelsRegister { export class HandelsRegister {
private openDataRef: OpenData; private openDataRef: OpenData;
private asyncExecutionStack = new plugins.lik.AsyncExecutionStack();
private uniqueDowloadFolder = plugins.path.join(paths.downloadDir, plugins.smartunique.uniSimple());
// Puppeteer wrapper instance
public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser(); public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser();
constructor(openDataRef: OpenData) { constructor(openDataRef: OpenData) {
@ -15,16 +19,34 @@ export class HandelsRegister {
} }
/**
 * Creates this instance's download folder and then starts the browser.
 */
public async start() {
  const { uniqueDowloadFolder, smartbrowserInstance } = this;
  // Make sure the per-instance download folder exists
  await plugins.smartfile.fs.ensureDir(uniqueDowloadFolder);
  // Start the browser
  await smartbrowserInstance.start();
}
/**
 * Stops the browser and deletes this instance's download folder.
 *
 * The browser is stopped first so that nothing is still writing into the
 * folder while it is being deleted. The folder removal runs in `finally`,
 * so a failure in one cleanup step no longer prevents the other from running.
 */
public async stop() {
  try {
    // Stop the browser
    await this.smartbrowserInstance.stop();
  } finally {
    // Always clean up the download folder, even if stopping the browser failed
    await plugins.smartfile.fs.remove(this.uniqueDowloadFolder);
  }
}
// page stuff /**
* Creates a new page and configures it to allow file downloads
* to a predefined path.
*/
public getNewPage = async () => { public getNewPage = async () => {
const page = await this.smartbrowserInstance.headlessBrowser.newPage(); const page = await this.smartbrowserInstance.headlessBrowser.newPage();
// 1) Create a DevTools session for this page
const cdpSession = await page.target().createCDPSession();
// 2) Allow file downloads and set the download path
await cdpSession.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath: this.uniqueDowloadFolder, // <-- Change this to your desired absolute path
});
// Optionally set viewport and go to page
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
await page.goto('https://www.handelsregister.de/'); await page.goto('https://www.handelsregister.de/');
return page; return page;
@ -49,8 +71,13 @@ export class HandelsRegister {
}; };
private waitForResults = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page) => { private waitForResults = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page) => {
await pageArg.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', { await pageArg
.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
timeout: 30000, timeout: 30000,
})
.catch(async (err) => {
await pageArg.screenshot({ path: paths.downloadDir + '/error.png' });
throw err;
}); });
const businessRecords: BusinessRecord['data'][] = await pageArg.evaluate(() => { const businessRecords: BusinessRecord['data'][] = await pageArg.evaluate(() => {
@ -110,27 +137,87 @@ export class HandelsRegister {
} }
}; };
/**
 * Clicks one of the download links of the first search result and returns
 * the first file found in the download folder afterwards.
 *
 * The links are looked up in the last cell of the first row of the result
 * table: 'AD' clicks the first <a> in that cell, 'SI' clicks the last one.
 *
 * @param pageArg page that currently shows the search result table
 * @param typeArg which link to click ('SI' or 'AD')
 * @returns the first entry read from the download folder after the wait;
 *   this is undefined when the folder is still empty at that point
 * @throws Error when the table body, first row, last cell or the requested
 *   link cannot be found
 */
private async downloadFile(
  pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page,
  typeArg: 'SI' | 'AD'
) {
  // Start from an empty folder so a file left over from an earlier download
  // cannot be picked up as the result of this one.
  await plugins.smartfile.fs.ensureEmptyDir(this.uniqueDowloadFolder);

  // Trigger the file download by clicking on the relevant link
  await pageArg.evaluate((typeArg2) => {
    // Locate the table body
    const tableBody = document.querySelector(
      '#ergebnissForm\\:selectedSuchErgebnisFormTable_data'
    );
    if (!tableBody) {
      throw new Error('Table body not found');
    }

    // Locate the first row
    const firstRow = tableBody.querySelector('tr:nth-child(1)');
    if (!firstRow) {
      throw new Error('First row not found');
    }

    // Locate the last cell in the first row
    const lastCell = firstRow.querySelector('td:last-child');
    if (!lastCell) {
      throw new Error('Last cell not found in the first row');
    }

    // Pick the link that belongs to the requested file type
    let link: Element | null;
    switch (typeArg2) {
      case 'AD':
        link = lastCell.querySelector('a:first-of-type');
        break;
      case 'SI':
        link = lastCell.querySelector('a:last-of-type');
        break;
      default:
        throw new Error('Invalid file type');
    }
    if (!link) {
      throw new Error(`${typeArg2} link not found in the last cell`);
    }

    // Simulate a click on the selected <a> element
    (link as HTMLElement).click();
  }, typeArg);

  // Fixed grace period for the download to land in the folder. A plain timer
  // is used instead of page.waitForTimeout, which newer Puppeteer versions
  // no longer provide.
  await new Promise<void>((resolve) => setTimeout(resolve, 10000));

  try {
    const files = await plugins.smartfile.fs.fileTreeToObject(this.uniqueDowloadFolder, '**/*');
    return files[0];
  } finally {
    // Leave the folder empty for the next download, even if reading failed
    await plugins.smartfile.fs.ensureEmptyDir(this.uniqueDowloadFolder);
  }
}
/**
 * Splits a registration string such as
 * "District court Berlin (Charlottenburg) HRB 123456"
 * into court, register type and register number.
 *
 * @param input the registration string to parse
 * @returns the parsed parts, or undefined when the string does not match
 */
private async parseGermanRegistration(
  input: string
): Promise<BusinessRecord['data']['germanParsedRegistration']> {
  const registrationPattern =
    /District court (\p{L}[\p{L}\s-]*?(?:\s*\([\p{L}\s-]+\))?)\s+(HRA|HRB|GnR|VR|PR|GsR)\s+(\d+)/u;

  const parts = registrationPattern.exec(input);
  if (!parts) {
    return undefined;
  }

  // Capture groups: 1 = court name, 2 = register type, 3 = register number
  const [, court, registerType, registerNumber] = parts;
  return {
    court,
    type: registerType as 'HRA' | 'HRB', // Adjust if needed
    number: registerNumber,
  };
}
/** /**
* Search for a company by name * Search for a company by name and return basic info
*/ */
public async searchCompany(companyNameArg: string) { public async searchCompany(companyNameArg: string) {
return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => {
const page = await this.getNewPage(); const page = await this.getNewPage();
await this.navigateToPage(page, 'Normal search'); await this.navigateToPage(page, 'Normal search');
@ -138,7 +225,7 @@ export class HandelsRegister {
// Wait for the textarea to appear // Wait for the textarea to appear
await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 }); await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });
// Enter text into the textarea using page.evaluate // Enter text into the textarea
const inputText = companyNameArg; const inputText = companyNameArg;
await page.evaluate((text) => { await page.evaluate((text) => {
const textarea = document.querySelector<HTMLTextAreaElement>('#form\\:schlagwoerter'); const textarea = document.querySelector<HTMLTextAreaElement>('#form\\:schlagwoerter');
@ -178,117 +265,75 @@ export class HandelsRegister {
const businessRecords = await this.waitForResults(page); const businessRecords = await this.waitForResults(page);
// Parse out the registration info
for (const record of businessRecords) { for (const record of businessRecords) {
record.germanParsedRegistration = await this.parseGermanRegistration(record.registrationId); if (record.registrationId) {
record.germanParsedRegistration = await this.parseGermanRegistration(
record.registrationId
);
}
} }
await page.close(); await page.close();
// Finally, return the parsed business records to the caller
return businessRecords; return businessRecords;
}, 60000);
} }
/**
 * Looks up one specific company by its known register type, register number
 * and register court, then downloads the 'SI' and 'AD' files of the first
 * search result.
 *
 * The whole lookup runs inside an exclusive execution slot of this instance,
 * requested with a value of 60000.
 *
 * @param companyArg the parsed registration (type, number, court) to search for
 * @returns the business records read from the result table and the two
 *   downloaded files, in the order 'SI', 'AD'
 * @throws Error when no registration is passed in, or when any step of the
 *   search or download fails
 */
public async getSpecificCompany(companyArg: BusinessRecord['data']['germanParsedRegistration']) {
  // Fail early with a clear message instead of a TypeError after a page was opened
  if (!companyArg) {
    throw new Error('getSpecificCompany requires a parsed registration');
  }

  return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => {
    const page = await this.getNewPage();
    try {
      await this.navigateToPage(page, 'Normal search');
      await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

      // 1) Type of Register (e.g. HRB, HRA, etc.)
      await page.waitForSelector('#form\\:registerArt_label');
      await page.click('#form\\:registerArt_label');
      await page.waitForSelector('#form\\:registerArt_items');
      await page.evaluate((type) => {
        const options = Array.from(document.querySelectorAll('#form\\:registerArt_items li'));
        const targetOption = options.find((option) => option.textContent?.trim() === type);
        (targetOption as HTMLElement)?.click();
      }, companyArg.type);

      // 2) Register number
      await page.waitForSelector('#form\\:registerNummer');
      await page.type('#form\\:registerNummer', companyArg.number);

      // 3) Register court
      await page.waitForSelector('#form\\:registergericht_label');
      await page.click('#form\\:registergericht_label');
      await page.waitForSelector('#form\\:registergericht_items');
      await page.evaluate((court) => {
        const options = Array.from(document.querySelectorAll('#form\\:registergericht_items li'));
        const targetOption = options.find((option) => option.textContent?.trim() === court);
        (targetOption as HTMLElement)?.click();
      }, companyArg.court);

      // Click 'Find'
      await this.clickFindButton(page);

      // Optionally grab the results, just for logging
      const businessRecords = await this.waitForResults(page);
      console.log(businessRecords);

      const files: plugins.smartfile.SmartFile[] = [];

      // download files
      files.push(await this.downloadFile(page, 'SI'));
      files.push(await this.downloadFile(page, 'AD'));

      return {
        businessRecords,
        files,
      };
    } finally {
      // Close the page on success and on failure so no page is left open
      await page.close();
    }
  }, 60000);
}
} }

View File

@ -8,4 +8,8 @@ export const packageDir = plugins.path.join(
export const nogitDir = plugins.path.join(packageDir, './.nogit/'); export const nogitDir = plugins.path.join(packageDir, './.nogit/');
plugins.smartfile.fs.ensureDirSync(nogitDir); plugins.smartfile.fs.ensureDirSync(nogitDir);
export const downloadDir = plugins.path.join(nogitDir, 'downloads');
plugins.smartfile.fs.ensureDirSync(downloadDir);
export const germanBusinessDataDir = plugins.path.join(nogitDir, 'germanbusinessdata'); export const germanBusinessDataDir = plugins.path.join(nogitDir, 'germanbusinessdata');

View File

@ -6,6 +6,7 @@ export {
} }
// @push.rocks scope // @push.rocks scope
import * as lik from '@push.rocks/lik';
import * as qenv from '@push.rocks/qenv'; import * as qenv from '@push.rocks/qenv';
import * as smartarchive from '@push.rocks/smartarchive'; import * as smartarchive from '@push.rocks/smartarchive';
import * as smartbrowser from '@push.rocks/smartbrowser'; import * as smartbrowser from '@push.rocks/smartbrowser';
@ -16,8 +17,10 @@ import * as smartpath from '@push.rocks/smartpath';
import * as smartpromise from '@push.rocks/smartpromise'; import * as smartpromise from '@push.rocks/smartpromise';
import * as smartrequest from '@push.rocks/smartrequest'; import * as smartrequest from '@push.rocks/smartrequest';
import * as smartstream from '@push.rocks/smartstream'; import * as smartstream from '@push.rocks/smartstream';
import * as smartunique from '@push.rocks/smartunique';
export { export {
lik,
qenv, qenv,
smartarchive, smartarchive,
smartbrowser, smartbrowser,
@ -28,6 +31,7 @@ export {
smartpromise, smartpromise,
smartrequest, smartrequest,
smartstream, smartstream,
smartunique,
} }
// @tsclass scope // @tsclass scope