feat(HandelsRegister): Add file download functionality to HandelsRegister
This commit is contained in:
@ -1,13 +1,17 @@
|
||||
import type { BusinessRecord } from './classes.businessrecord.js';
|
||||
import type { OpenData } from './classes.main.opendata.js';
|
||||
import * as plugins from './plugins.js';
|
||||
import * as paths from './paths.js';
|
||||
|
||||
/**
|
||||
* the HandelsRegister exposed as a class
|
||||
*/
|
||||
export class HandelsRegister {
|
||||
private openDataRef: OpenData;
|
||||
private asyncExecutionStack = new plugins.lik.AsyncExecutionStack();
|
||||
private uniqueDowloadFolder = plugins.path.join(paths.downloadDir, plugins.smartunique.uniSimple());
|
||||
|
||||
// Puppeteer wrapper instance
|
||||
public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser();
|
||||
|
||||
constructor(openDataRef: OpenData) {
|
||||
@ -15,16 +19,34 @@ export class HandelsRegister {
|
||||
}
|
||||
|
||||
/**
 * Prepares this instance for use: makes sure its download folder exists,
 * then starts the wrapped browser.
 */
public async start() {
  const downloadFolder = this.uniqueDowloadFolder;

  // getNewPage() configures pages to download into this folder
  await plugins.smartfile.fs.ensureDir(downloadFolder);

  // Start the browser
  await this.smartbrowserInstance.start();
}
|
||||
|
||||
/**
 * Shuts this instance down: removes its download folder and stops the
 * wrapped browser.
 *
 * The browser is stopped even when removing the folder fails, so a
 * filesystem error cannot leave the browser running. The removal error,
 * if any, still propagates to the caller.
 */
public async stop() {
  try {
    await plugins.smartfile.fs.remove(this.uniqueDowloadFolder);
  } finally {
    // Stop the browser regardless of the cleanup outcome
    await this.smartbrowserInstance.stop();
  }
}
|
||||
|
||||
// page stuff
|
||||
/**
|
||||
* Creates a new page and configures it to allow file downloads
|
||||
* to a predefined path.
|
||||
*/
|
||||
public getNewPage = async () => {
|
||||
const page = await this.smartbrowserInstance.headlessBrowser.newPage();
|
||||
|
||||
// 1) Create a DevTools session for this page
|
||||
const cdpSession = await page.target().createCDPSession();
|
||||
|
||||
// 2) Allow file downloads and set the download path
|
||||
await cdpSession.send('Page.setDownloadBehavior', {
|
||||
behavior: 'allow',
|
||||
downloadPath: this.uniqueDowloadFolder, // <-- Change this to your desired absolute path
|
||||
});
|
||||
|
||||
// Optionally set viewport and go to page
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
await page.goto('https://www.handelsregister.de/');
|
||||
return page;
|
||||
@ -49,9 +71,14 @@ export class HandelsRegister {
|
||||
};
|
||||
|
||||
private waitForResults = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page) => {
|
||||
await pageArg.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
|
||||
timeout: 30000,
|
||||
});
|
||||
await pageArg
|
||||
.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
|
||||
timeout: 30000,
|
||||
})
|
||||
.catch(async (err) => {
|
||||
await pageArg.screenshot({ path: paths.downloadDir + '/error.png' });
|
||||
throw err;
|
||||
});
|
||||
|
||||
const businessRecords: BusinessRecord['data'][] = await pageArg.evaluate(() => {
|
||||
const rows = document.querySelectorAll(
|
||||
@ -110,152 +137,12 @@ export class HandelsRegister {
|
||||
}
|
||||
};
|
||||
|
||||
// parsing stuff
|
||||
/**
 * Splits a registration string into court, register type and number.
 *
 * The text must contain "District court <court> <type> <number>", where
 * the court may carry a parenthesised suffix and the type is one of
 * HRA, HRB, GnR, VR, PR or GsR.
 *
 * @param input the registration text to parse
 * @returns the parsed parts; resolves to undefined when nothing matches
 */
private async parseGermanRegistration(
  input: string
): Promise<BusinessRecord['data']['germanParsedRegistration']> {
  const registrationPattern =
    /District court (\p{L}[\p{L}\s-]*?(?:\s*\([\p{L}\s-]+\))?)\s+(HRA|HRB|GnR|VR|PR|GsR)\s+(\d+)/u;
  const parts = registrationPattern.exec(input);

  if (parts) {
    // Index 0 is the whole match; the capture groups follow in order
    const [, court, registerType, registerNumber] = parts;
    return {
      court,
      type: registerType as 'HRA' | 'HRB',
      number: registerNumber,
    };
  }
}
|
||||
|
||||
/**
 * Search for a company by name.
 *
 * Opens a fresh page, navigates to the 'Normal search' form, enters the
 * name into the keyword field, clicks the label of the radio button
 * `form:schlagwortOptionen:0`, submits the search and parses the
 * registration string of every returned record that has one.
 *
 * Failures while filling the keyword field or clicking the radio label
 * are logged and the search continues. The page is closed on every exit
 * path, including when the search itself throws.
 *
 * @param companyNameArg the company name to use as search keyword
 * @returns the records read from the result table
 */
public async searchCompany(companyNameArg: string) {
  const page = await this.getNewPage();

  try {
    await this.navigateToPage(page, 'Normal search');

    try {
      // Wait for the textarea to appear
      await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

      // Enter text into the textarea using page.evaluate
      await page.evaluate((text) => {
        const textarea = document.querySelector<HTMLTextAreaElement>('#form\\:schlagwoerter');
        if (textarea) {
          textarea.value = text;
          // Assigning .value fires no event, so dispatch 'change' manually
          const event = new Event('change', { bubbles: true });
          textarea.dispatchEvent(event);
        }
      }, companyNameArg);

      console.log('Text entered successfully!');
    } catch (error) {
      console.error('Failed to find or enter text into the textarea:', error);
    }

    try {
      // Wait for the radio button's label to appear
      await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 });

      // Click the label to select the radio button
      await page.evaluate(() => {
        const label = document.querySelector<HTMLLabelElement>(
          'label[for="form:schlagwortOptionen:0"]'
        );
        if (label) {
          label.click();
        }
      });

      console.log('Radio button clicked successfully!');
    } catch (error) {
      console.error('Failed to find or click the radio button:', error);
    }

    await this.clickFindButton(page);

    const businessRecords = await this.waitForResults(page);

    for (const record of businessRecords) {
      // Records without a registration string have nothing to parse
      if (record.registrationId) {
        record.germanParsedRegistration = await this.parseGermanRegistration(
          record.registrationId
        );
      }
    }

    return businessRecords;
  } finally {
    // Release the page even when the search above failed
    await page.close();
  }
}
|
||||
|
||||
public async getSpecificCompany(companyArg: BusinessRecord['data']['germanParsedRegistration']) {
|
||||
const page = await this.getNewPage();
|
||||
await this.navigateToPage(page, 'Normal search');
|
||||
await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });
|
||||
|
||||
// 1) Type of Register:
|
||||
// Open the dropdown to reveal options
|
||||
await page.waitForSelector('#form\\:registerArt_label');
|
||||
await page.click('#form\\:registerArt_label'); // Open the dropdown
|
||||
|
||||
// Wait for the options and select the one matching companyArg.type
|
||||
await page.waitForSelector('#form\\:registerArt_items'); // Ensure dropdown options are loaded
|
||||
await page.evaluate((type) => {
|
||||
const options = Array.from(document.querySelectorAll('#form\\:registerArt_items li'));
|
||||
const targetOption = options.find((option) => option.textContent?.trim() === type); // Match type dynamically
|
||||
(targetOption as any)?.click();
|
||||
}, companyArg.type); // Pass companyArg.type to the browser context
|
||||
|
||||
// 2) Register number:
|
||||
// Fill in the register number
|
||||
await page.waitForSelector('#form\\:registerNummer');
|
||||
await page.type('#form\\:registerNummer', companyArg.number);
|
||||
|
||||
// 3) Register court:
|
||||
// Open the dropdown for the register court
|
||||
await page.waitForSelector('#form\\:registergericht_label');
|
||||
await page.click('#form\\:registergericht_label'); // Open the dropdown
|
||||
|
||||
// Wait for the options and select the one matching companyArg.court
|
||||
await page.waitForSelector('#form\\:registergericht_items'); // Ensure dropdown options are loaded
|
||||
await page.evaluate((court) => {
|
||||
const options = Array.from(document.querySelectorAll('#form\\:registergericht_items li'));
|
||||
const targetOption = options.find((option) => option.textContent?.trim() === court); // Match court dynamically
|
||||
(targetOption as any)?.click();
|
||||
}, companyArg.court); // Pass companyArg.court to the browser context
|
||||
|
||||
await this.clickFindButton(page);
|
||||
|
||||
const businessRecords = await this.waitForResults(page);
|
||||
console.log(businessRecords);
|
||||
|
||||
// Define the response listener
|
||||
const responseListener = async (
|
||||
response: plugins.smartbrowser.smartpuppeteer.puppeteer.HTTPResponse
|
||||
) => {
|
||||
// Ignore preflight (OPTIONS) requests
|
||||
if (response.request().method() === 'OPTIONS') {
|
||||
console.log(`Ignoring preflight request: ${response.url()}`);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for downloads (Content-Disposition header)
|
||||
const contentDisposition = response.headers()['content-disposition'];
|
||||
|
||||
if (contentDisposition && contentDisposition.includes('attachment')) {
|
||||
console.log(`Download detected: ${response.url()}`);
|
||||
try {
|
||||
const buffer = await response.buffer();
|
||||
console.log(`Downloaded file size: ${buffer.length} bytes`);
|
||||
} catch (error) {
|
||||
console.error('Error downloading file:', error);
|
||||
}
|
||||
}
|
||||
};
|
||||
page.on('response', responseListener);
|
||||
|
||||
// Click the element
|
||||
await page.evaluate(() => {
|
||||
private async downloadFile(
|
||||
pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page,
|
||||
typeArg: 'SI' | 'AD'
|
||||
) {
|
||||
// Trigger the file download by clicking on the relevant link
|
||||
await pageArg.evaluate((typeArg2) => {
|
||||
// Locate the table body
|
||||
const tableBody = document.querySelector(
|
||||
'#ergebnissForm\\:selectedSuchErgebnisFormTable_data'
|
||||
@ -276,19 +163,177 @@ export class HandelsRegister {
|
||||
throw new Error('Last cell not found in the first row');
|
||||
}
|
||||
|
||||
// Locate the last <a> element in the last cell
|
||||
const lastLink = lastCell.querySelector('a:last-of-type');
|
||||
if (!lastLink) {
|
||||
throw new Error('Last link not found in the last cell');
|
||||
// Locate the download links
|
||||
const adLink = lastCell.querySelector('a:first-of-type');
|
||||
const siLink = lastCell.querySelector('a:last-of-type');
|
||||
if (!siLink) {
|
||||
throw new Error('SI link not found in the last cell');
|
||||
}
|
||||
|
||||
// Simulate a click on the last <a> element
|
||||
(lastLink as HTMLElement).click();
|
||||
});
|
||||
switch (typeArg2) {
|
||||
case 'AD':
|
||||
(adLink as HTMLElement).click();
|
||||
break;
|
||||
case 'SI':
|
||||
(siLink as HTMLElement).click();
|
||||
break;
|
||||
default:
|
||||
throw new Error('Invalid file type');
|
||||
}
|
||||
}, typeArg);
|
||||
|
||||
// Optional: Wait for some response or navigation triggered by the click
|
||||
await page.waitForTimeout(10000);
|
||||
// Wait a bit for the download to complete (you might want to implement
|
||||
// a more robust file-exists check or a wait-for-download library)
|
||||
await pageArg.waitForTimeout(10000);
|
||||
|
||||
page.off('response', responseListener);
|
||||
const files = await plugins.smartfile.fs.fileTreeToObject(this.uniqueDowloadFolder, '**/*');
|
||||
await plugins.smartfile.fs.ensureEmptyDir(this.uniqueDowloadFolder);
|
||||
|
||||
return files [0];
|
||||
}
|
||||
|
||||
/**
 * Helper method to parse the German registration string,
 * e.g. "District court Berlin (Charlottenburg) HRB 123456".
 *
 * @param input the registration text to parse
 * @returns court, register type and register number; resolves to
 *          undefined when the text does not match the expected pattern
 */
private async parseGermanRegistration(
  input: string
): Promise<BusinessRecord['data']['germanParsedRegistration']> {
  const registrationPattern =
    /District court (\p{L}[\p{L}\s-]*?(?:\s*\([\p{L}\s-]+\))?)\s+(HRA|HRB|GnR|VR|PR|GsR)\s+(\d+)/u;
  const parts = registrationPattern.exec(input);

  if (parts) {
    // Skip index 0 (the full match) and name the three capture groups
    const [, court, registerType, registerNumber] = parts;
    return {
      court,
      type: registerType as 'HRA' | 'HRB', // Adjust if needed
      number: registerNumber,
    };
  }
}
|
||||
|
||||
/**
 * Search for a company by name and return basic info.
 *
 * The work runs inside an exclusive slot of `asyncExecutionStack`
 * (second argument 60000, presumably a timeout in milliseconds; confirm
 * against the AsyncExecutionStack API). Within the slot a fresh page is
 * opened, the 'Normal search' form is filled and submitted, and the
 * registration string of every returned record that has one is parsed.
 *
 * Failures while filling the keyword field or clicking the radio label
 * are logged and the search continues. The page is closed on every exit
 * path, including when the search itself throws.
 *
 * @param companyNameArg the company name to use as search keyword
 * @returns the records read from the result table
 */
public async searchCompany(companyNameArg: string) {
  return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => {
    const page = await this.getNewPage();

    try {
      await this.navigateToPage(page, 'Normal search');

      try {
        // Wait for the textarea to appear
        await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

        // Enter text into the textarea
        await page.evaluate((text) => {
          const textarea = document.querySelector<HTMLTextAreaElement>('#form\\:schlagwoerter');
          if (textarea) {
            textarea.value = text;
            // Assigning .value fires no event, so dispatch 'change' manually
            const event = new Event('change', { bubbles: true });
            textarea.dispatchEvent(event);
          }
        }, companyNameArg);

        console.log('Text entered successfully!');
      } catch (error) {
        console.error('Failed to find or enter text into the textarea:', error);
      }

      try {
        // Wait for the radio button's label to appear
        await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 });

        // Click the label to select the radio button
        await page.evaluate(() => {
          const label = document.querySelector<HTMLLabelElement>(
            'label[for="form:schlagwortOptionen:0"]'
          );
          if (label) {
            label.click();
          }
        });

        console.log('Radio button clicked successfully!');
      } catch (error) {
        console.error('Failed to find or click the radio button:', error);
      }

      await this.clickFindButton(page);

      const businessRecords = await this.waitForResults(page);

      // Parse out the registration info
      for (const record of businessRecords) {
        if (record.registrationId) {
          record.germanParsedRegistration = await this.parseGermanRegistration(
            record.registrationId
          );
        }
      }

      return businessRecords;
    } finally {
      // Release the page even when the search above failed
      await page.close();
    }
  }, 60000);
}
|
||||
|
||||
/**
 * Search for a specific company (known register type/number/court),
 * then download its 'SI' and 'AD' files.
 *
 * The work runs inside an exclusive slot of `asyncExecutionStack`
 * (second argument 60000, presumably a timeout in milliseconds; confirm
 * against the AsyncExecutionStack API). Within the slot a fresh page is
 * opened, the 'Normal search' form is filled with the register type,
 * number and court, the search is submitted, and both files are fetched
 * one after the other through `downloadFile`.
 *
 * The page is closed on every exit path, including when the form
 * interaction, the search or a download throws.
 *
 * @param companyArg register type, number and court identifying the company
 * @returns the records read from the result table and the downloaded
 *          files in the order ['SI', 'AD']
 */
public async getSpecificCompany(companyArg: BusinessRecord['data']['germanParsedRegistration']) {
  return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => {
    const page = await this.getNewPage();

    try {
      await this.navigateToPage(page, 'Normal search');
      await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

      // 1) Type of Register (e.g. HRB, HRA, etc.)
      await page.waitForSelector('#form\\:registerArt_label');
      await page.click('#form\\:registerArt_label');
      await page.waitForSelector('#form\\:registerArt_items');
      await page.evaluate((type) => {
        const options = Array.from(document.querySelectorAll('#form\\:registerArt_items li'));
        // Options are matched on their exact trimmed text
        const targetOption = options.find((option) => option.textContent?.trim() === type);
        (targetOption as HTMLElement)?.click();
      }, companyArg.type);

      // 2) Register number
      await page.waitForSelector('#form\\:registerNummer');
      await page.type('#form\\:registerNummer', companyArg.number);

      // 3) Register court
      await page.waitForSelector('#form\\:registergericht_label');
      await page.click('#form\\:registergericht_label');
      await page.waitForSelector('#form\\:registergericht_items');
      await page.evaluate((court) => {
        const options = Array.from(document.querySelectorAll('#form\\:registergericht_items li'));
        const targetOption = options.find((option) => option.textContent?.trim() === court);
        (targetOption as HTMLElement)?.click();
      }, companyArg.court);

      // Click 'Find'
      await this.clickFindButton(page);

      // Grab the results; they are logged and returned to the caller
      const businessRecords = await this.waitForResults(page);
      console.log(businessRecords);

      const files: plugins.smartfile.SmartFile[] = [];

      // Download sequentially: downloadFile reads and then empties the
      // shared download folder after each click
      files.push(await this.downloadFile(page, 'SI'));
      files.push(await this.downloadFile(page, 'AD'));

      return {
        businessRecords,
        files,
      };
    } finally {
      // Release the page even when a step above failed
      await page.close();
    }
  }, 60000);
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user