From c1771934388565539d057bdf668bbad8d63ff36d Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Sat, 4 Jan 2025 02:27:53 +0100 Subject: [PATCH] feat(HandelsRegister): Add file download functionality to HandelsRegister --- changelog.md | 8 + package.json | 2 + pnpm-lock.yaml | 6 + ts/00_commitinfo_data.ts | 2 +- ts/classes.handelsregister.ts | 363 +++++++++++++++++++--------------- ts/paths.ts | 4 + ts/plugins.ts | 4 + 7 files changed, 229 insertions(+), 160 deletions(-) diff --git a/changelog.md b/changelog.md index f56dab5..3c5297b 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,13 @@ # Changelog +## 2025-01-04 - 1.4.0 - feat(HandelsRegister) +Add file download functionality to HandelsRegister + +- Implemented file download feature in the HandelsRegister class. +- Configured pages in Puppeteer to allow downloads and set download paths. +- Parsed German registration information with more robust error handling. +- Added specific methods for downloading and handling 'SI' and 'AD' files. + ## 2025-01-03 - 1.3.1 - fix(HandelsRegister) Refined HandelsRegister functionality for better error handling and response capture. diff --git a/package.json b/package.json index f41f968..c9cceb4 100644 --- a/package.json +++ b/package.json @@ -22,6 +22,7 @@ "@types/node": "^22.10.4" }, "dependencies": { + "@push.rocks/lik": "^6.1.0", "@push.rocks/qenv": "^6.1.0", "@push.rocks/smartarchive": "^4.0.39", "@push.rocks/smartbrowser": "^2.0.8", @@ -32,6 +33,7 @@ "@push.rocks/smartpromise": "^4.0.4", "@push.rocks/smartrequest": "^2.0.23", "@push.rocks/smartstream": "^3.2.5", + "@push.rocks/smartunique": "^3.0.9", "@tsclass/tsclass": "^4.2.0" }, "repository": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 4787c15..c329c97 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,6 +8,9 @@ importers: .: dependencies: + '@push.rocks/lik': + specifier: ^6.1.0 + version: 6.1.0 '@push.rocks/qenv': specifier: ^6.1.0 version: 6.1.0 @@ -38,6 +41,9 @@ importers: '@push.rocks/smartstream': specifier: ^3.2.5 version: 3.2.5 + '@push.rocks/smartunique': + specifier: ^3.0.9 + version: 3.0.9 '@tsclass/tsclass': specifier: ^4.2.0 version: 4.2.0 diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts index 0b71923..f0f20c7 100644 --- a/ts/00_commitinfo_data.ts +++ b/ts/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: '@fin.cx/opendata', - version: '1.3.1', + version: '1.4.0', description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.' } diff --git a/ts/classes.handelsregister.ts b/ts/classes.handelsregister.ts index 11bb7c1..38ff20a 100644 --- a/ts/classes.handelsregister.ts +++ b/ts/classes.handelsregister.ts @@ -1,13 +1,17 @@ import type { BusinessRecord } from './classes.businessrecord.js'; import type { OpenData } from './classes.main.opendata.js'; import * as plugins from './plugins.js'; +import * as paths from './paths.js'; /** * the HandlesRegister exposed as a class */ export class HandelsRegister { private openDataRef: OpenData; + private asyncExecutionStack = new plugins.lik.AsyncExecutionStack(); + private uniqueDowloadFolder = plugins.path.join(paths.downloadDir, plugins.smartunique.uniSimple()); + // Puppeteer wrapper instance public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser(); constructor(openDataRef: OpenData) { @@ -15,16 +19,34 @@ export class HandelsRegister { } public async start() { + // Start the browser + await plugins.smartfile.fs.ensureDir(this.uniqueDowloadFolder); await this.smartbrowserInstance.start(); } public async stop() { + // Stop the browser + await plugins.smartfile.fs.remove(this.uniqueDowloadFolder); await this.smartbrowserInstance.stop(); } - // page stuff + /** + * Creates a new page and configures it to allow file downloads + * to a predefined path. + */ public getNewPage = async () => { const page = await this.smartbrowserInstance.headlessBrowser.newPage(); + + // 1) Create a DevTools session for this page + const cdpSession = await page.target().createCDPSession(); + + // 2) Allow file downloads and set the download path + await cdpSession.send('Page.setDownloadBehavior', { + behavior: 'allow', + downloadPath: this.uniqueDowloadFolder, // <-- Change this to your desired absolute path + }); + + // Optionally set viewport and go to page await page.setViewport({ width: 1920, height: 1080 }); await page.goto('https://www.handelsregister.de/'); return page; @@ -49,9 +71,14 @@ export class HandelsRegister { }; private waitForResults = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page) => { - await pageArg.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', { - timeout: 30000, - }); + await pageArg + .waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', { + timeout: 30000, + }) + .catch(async (err) => { + await pageArg.screenshot({ path: paths.downloadDir + '/error.png' }); + throw err; + }); const businessRecords: BusinessRecord['data'][] = await pageArg.evaluate(() => { const rows = document.querySelectorAll( @@ -110,152 +137,12 @@ export class HandelsRegister { } }; - // parsing stuff - private async parseGermanRegistration( - input: string - ): Promise { - const regex = - /District court (\p{L}[\p{L}\s-]*?(?:\s*\([\p{L}\s-]+\))?)\s+(HRA|HRB|GnR|VR|PR|GsR)\s+(\d+)/u; - const match = input.match(regex); - - if (match) { - return { - court: match[1], // Extracts the court name - type: match[2] as 'HRA' | 'HRB', // Extracts the type and ensures it matches the specified types - number: match[3], // Extracts the number - }; - } - } - - /** - * Search for a company by name - */ - public async searchCompany(companyNameArg: string) { - const page = await this.getNewPage(); - await this.navigateToPage(page, 'Normal search'); - - try { - // Wait for the textarea to appear - await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 }); - - // Enter text into the textarea using page.evaluate - const inputText = companyNameArg; - await page.evaluate((text) => { - const textarea = document.querySelector('#form\\:schlagwoerter'); - if (textarea) { - textarea.value = text; // Set the value - // Trigger the change event manually if required - const event = new Event('change', { bubbles: true }); - textarea.dispatchEvent(event); - } - }, inputText); - - console.log('Text entered successfully!'); - } catch (error) { - console.error('Failed to find or enter text into the textarea:', error); - } - - try { - // Wait for the radio button's label to appear - await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 }); - - // Click the label to select the radio button - await page.evaluate(() => { - const label = document.querySelector( - 'label[for="form:schlagwortOptionen:0"]' - ); - if (label) { - label.click(); - } - }); - - console.log('Radio button clicked successfully!'); - } catch (error) { - console.error('Failed to find or click the radio button:', error); - } - - await this.clickFindButton(page); - - const businessRecords = await this.waitForResults(page); - - for (const record of businessRecords) { - record.germanParsedRegistration = await this.parseGermanRegistration(record.registrationId); - } - - await page.close(); - - // Finally, we return an object, which triggers a JSON file download - return businessRecords; - } - - public async getSpecificCompany(companyArg: BusinessRecord['data']['germanParsedRegistration']) { - const page = await this.getNewPage(); - await this.navigateToPage(page, 'Normal search'); - await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 }); - - // 1) Type of Register: - // Open the dropdown to reveal options - await page.waitForSelector('#form\\:registerArt_label'); - await page.click('#form\\:registerArt_label'); // Open the dropdown - - // Wait for the options and select the one matching companyArg.type - await page.waitForSelector('#form\\:registerArt_items'); // Ensure dropdown options are loaded - await page.evaluate((type) => { - const options = Array.from(document.querySelectorAll('#form\\:registerArt_items li')); - const targetOption = options.find((option) => option.textContent?.trim() === type); // Match type dynamically - (targetOption as any)?.click(); - }, companyArg.type); // Pass companyArg.type to the browser context - - // 2) Register number: - // Fill in the register number - await page.waitForSelector('#form\\:registerNummer'); - await page.type('#form\\:registerNummer', companyArg.number); - - // 3) Register court: - // Open the dropdown for the register court - await page.waitForSelector('#form\\:registergericht_label'); - await page.click('#form\\:registergericht_label'); // Open the dropdown - - // Wait for the options and select the one matching companyArg.court - await page.waitForSelector('#form\\:registergericht_items'); // Ensure dropdown options are loaded - await page.evaluate((court) => { - const options = Array.from(document.querySelectorAll('#form\\:registergericht_items li')); - const targetOption = options.find((option) => option.textContent?.trim() === court); // Match court dynamically - (targetOption as any)?.click(); - }, companyArg.court); // Pass companyArg.court to the browser context - - await this.clickFindButton(page); - - const businessRecords = await this.waitForResults(page); - console.log(businessRecords); - - // Define the response listener - const responseListener = async ( - response: plugins.smartbrowser.smartpuppeteer.puppeteer.HTTPResponse - ) => { - // Ignore preflight (OPTIONS) requests - if (response.request().method() === 'OPTIONS') { - console.log(`Ignoring preflight request: ${response.url()}`); - return; - } - - // Check for downloads (Content-Disposition header) - const contentDisposition = response.headers()['content-disposition']; - - if (contentDisposition && contentDisposition.includes('attachment')) { - console.log(`Download detected: ${response.url()}`); - try { - const buffer = await response.buffer(); - console.log(`Downloaded file size: ${buffer.length} bytes`); - } catch (error) { - console.error('Error downloading file:', error); - } - } - }; - page.on('response', responseListener); - - // Click the element - await page.evaluate(() => { + private async downloadFile( + pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page, + typeArg: 'SI' | 'AD' + ) { + // Trigger the file download by clicking on the relevant link + await pageArg.evaluate((typeArg2) => { // Locate the table body const tableBody = document.querySelector( '#ergebnissForm\\:selectedSuchErgebnisFormTable_data' @@ -276,19 +163,177 @@ export class HandelsRegister { throw new Error('Last cell not found in the first row'); } - // Locate the last element in the last cell - const lastLink = lastCell.querySelector('a:last-of-type'); - if (!lastLink) { - throw new Error('Last link not found in the last cell'); + // Locate the download links + const adLink = lastCell.querySelector('a:first-of-type'); + const siLink = lastCell.querySelector('a:last-of-type'); + if (!siLink) { + throw new Error('SI link not found in the last cell'); } // Simulate a click on the last element - (lastLink as HTMLElement).click(); - }); + switch (typeArg2) { + case 'AD': + (adLink as HTMLElement).click(); + break; + case 'SI': + (siLink as HTMLElement).click(); + break; + default: + throw new Error('Invalid file type'); + } + }, typeArg); - // Optional: Wait for some response or navigation triggered by the click - await page.waitForTimeout(10000); + // Wait a bit for the download to complete (you might want to implement + // a more robust file-exists check or a wait-for-download library) + await pageArg.waitForTimeout(10000); - page.off('response', responseListener); + const files = await plugins.smartfile.fs.fileTreeToObject(this.uniqueDowloadFolder, '**/*'); + await plugins.smartfile.fs.ensureEmptyDir(this.uniqueDowloadFolder); + + return files [0]; + } + + /** + * Helper method to parse the German registration string + */ + private async parseGermanRegistration( + input: string + ): Promise { + // e.g. District court Berlin (Charlottenburg) HRB 123456 + const regex = + /District court (\p{L}[\p{L}\s-]*?(?:\s*\([\p{L}\s-]+\))?)\s+(HRA|HRB|GnR|VR|PR|GsR)\s+(\d+)/u; + const match = input.match(regex); + + if (match) { + return { + court: match[1], + type: match[2] as 'HRA' | 'HRB', // Adjust if needed + number: match[3], + }; + } + } + + /** + * Search for a company by name and return basic info + */ + public async searchCompany(companyNameArg: string) { + return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => { + const page = await this.getNewPage(); + await this.navigateToPage(page, 'Normal search'); + + try { + // Wait for the textarea to appear + await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 }); + + // Enter text into the textarea + const inputText = companyNameArg; + await page.evaluate((text) => { + const textarea = document.querySelector('#form\\:schlagwoerter'); + if (textarea) { + textarea.value = text; // Set the value + // Trigger the change event manually if required + const event = new Event('change', { bubbles: true }); + textarea.dispatchEvent(event); + } + }, inputText); + + console.log('Text entered successfully!'); + } catch (error) { + console.error('Failed to find or enter text into the textarea:', error); + } + + try { + // Wait for the radio button's label to appear + await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 }); + + // Click the label to select the radio button + await page.evaluate(() => { + const label = document.querySelector( + 'label[for="form:schlagwortOptionen:0"]' + ); + if (label) { + label.click(); + } + }); + + console.log('Radio button clicked successfully!'); + } catch (error) { + console.error('Failed to find or click the radio button:', error); + } + + await this.clickFindButton(page); + + const businessRecords = await this.waitForResults(page); + + // Parse out the registration info + for (const record of businessRecords) { + if (record.registrationId) { + record.germanParsedRegistration = await this.parseGermanRegistration( + record.registrationId + ); + } + } + + await page.close(); + return businessRecords; + }, 60000); + } + + /** + * Search for a specific company (known register type/number/court), + * then click on an element that triggers a file download. + */ + public async getSpecificCompany(companyArg: BusinessRecord['data']['germanParsedRegistration']) { + return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => { + const page = await this.getNewPage(); + await this.navigateToPage(page, 'Normal search'); + await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 }); + + // 1) Type of Register (e.g. HRB, HRA, etc.) + await page.waitForSelector('#form\\:registerArt_label'); + await page.click('#form\\:registerArt_label'); + await page.waitForSelector('#form\\:registerArt_items'); + await page.evaluate((type) => { + const options = Array.from(document.querySelectorAll('#form\\:registerArt_items li')); + const targetOption = options.find((option) => option.textContent?.trim() === type); + (targetOption as HTMLElement)?.click(); + }, companyArg.type); + + // 2) Register number + await page.waitForSelector('#form\\:registerNummer'); + await page.type('#form\\:registerNummer', companyArg.number); + + // 3) Register court + await page.waitForSelector('#form\\:registergericht_label'); + await page.click('#form\\:registergericht_label'); + await page.waitForSelector('#form\\:registergericht_items'); + await page.evaluate((court) => { + const options = Array.from(document.querySelectorAll('#form\\:registergericht_items li')); + const targetOption = options.find((option) => option.textContent?.trim() === court); + (targetOption as HTMLElement)?.click(); + }, companyArg.court); + + // Click 'Find' + await this.clickFindButton(page); + + // Optionally grab the results, just for logging + const businessRecords = await this.waitForResults(page); + console.log(businessRecords); + + const files: plugins.smartfile.SmartFile[] = []; + + // download files + files.push(await this.downloadFile(page, 'SI')); + files.push(await this.downloadFile(page, 'AD')); + + // At this point, the file should have been downloaded automatically + // to the path specified by `Page.setDownloadBehavior` + await page.close(); + + return { + businessRecords, + files, + }; + }, 60000); } } diff --git a/ts/paths.ts b/ts/paths.ts index 30ed203..7ae78a1 100644 --- a/ts/paths.ts +++ b/ts/paths.ts @@ -8,4 +8,8 @@ export const packageDir = plugins.path.join( export const nogitDir = plugins.path.join(packageDir, './.nogit/'); plugins.smartfile.fs.ensureDirSync(nogitDir); +export const downloadDir = plugins.path.join(nogitDir, 'downloads'); +plugins.smartfile.fs.ensureDirSync(downloadDir); + + export const germanBusinessDataDir = plugins.path.join(nogitDir, 'germanbusinessdata'); \ No newline at end of file diff --git a/ts/plugins.ts b/ts/plugins.ts index ce80539..608004a 100644 --- a/ts/plugins.ts +++ b/ts/plugins.ts @@ -6,6 +6,7 @@ export { } // @push.rocks scope +import * as lik from '@push.rocks/lik'; import * as qenv from '@push.rocks/qenv'; import * as smartarchive from '@push.rocks/smartarchive'; import * as smartbrowser from '@push.rocks/smartbrowser'; @@ -16,8 +17,10 @@ import * as smartpath from '@push.rocks/smartpath'; import * as smartpromise from '@push.rocks/smartpromise'; import * as smartrequest from '@push.rocks/smartrequest'; import * as smartstream from '@push.rocks/smartstream'; +import * as smartunique from '@push.rocks/smartunique'; export { + lik, qenv, smartarchive, smartbrowser, @@ -28,6 +31,7 @@ export { smartpromise, smartrequest, smartstream, + smartunique, } // @tsclass scope