feat(core): Enhanced data handling capabilities and improved company search functionalities.

This commit is contained in:
2025-01-03 01:36:26 +01:00
parent a67a0993d6
commit c3f6ef531b
10 changed files with 287 additions and 143 deletions

View File

@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: '@fin.cx/opendata',
version: '1.2.1',
version: '1.3.0',
description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.'
}

View File

@ -1,33 +1,45 @@
import * as plugins from './plugins.js';
@plugins.smartdata.Manager()
export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc<BusinessRecord, BusinessRecord> {
export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc<
BusinessRecord,
BusinessRecord
> {
@plugins.smartdata.unI()
id: string;
@plugins.smartdata.svDb()
data: {
name?: string,
address?: string,
postalCode?: string,
city?: string,
country?: string,
phone?: string,
fax?: string,
email?: string,
website?: string,
businessType?: string,
registrationType?: 'HRA' | 'HRB';
registrationNumber?: string,
registrationCourt?: string,
legalForm?: string,
managingDirectors?: string[],
boardOfDirectors?: string[],
supervisoryBoard?: string[],
foundingDate?: string,
capital?: string,
purpose?: string,
lastUpdate?: string
name?: string;
address?: string;
postalCode?: string;
city?: string;
country?: string;
phone?: string;
fax?: string;
email?: string;
website?: string;
businessType?: string;
registrationId?: string;
germanParsedRegistration?: {
court?: string;
type?: 'HRA' | 'HRB' | 'GnR' | 'PR' | 'VR' | 'GsR';
number?: string;
};
legalForm?:
| 'GmbH'
| 'GmbH & Co. KG'
| 'AG'
| 'LLC'
| 'LLP'
| 'GmbH & Co. KGaA'
| 'GmbH & Co. KGaA, LLC';
managingDirectors?: string[];
boardOfDirectors?: string[];
supervisoryBoard?: string[];
foundingDate?: string;
capital?: string;
purpose?: string;
lastUpdate?: string;
} = {};
}
}

View File

@ -1,3 +1,4 @@
import type { BusinessRecord } from './classes.businessrecord.js';
import type { OpenData } from './classes.main.opendata.js';
import * as plugins from './plugins.js';
@ -21,21 +22,117 @@ export class HandelsRegister {
await this.smartbrowserInstance.stop();
}
/**
* Search for a company by name
*/
public async getDataForCompany(companyNameArg: string) {
// page stuff
public getNewPage = async () => {
const page = await this.smartbrowserInstance.headlessBrowser.newPage();
await page.setViewport({ width: 1920, height: 1080 });
await page.goto('https://www.handelsregister.de/');
await page.evaluate(() => {
const elements = Array.from(document.querySelectorAll('.ui-menuitem-text > span'));
const targetElement = elements.find((el) => el.textContent?.trim() === 'Normal search');
if (targetElement) {
(targetElement as HTMLElement).click();
}
return page;
};
/**
 * Clicks the menu item (`.ui-menuitem-text > span`) whose trimmed text equals `pageNameArg`.
 *
 * The lookup and click run inside the browser context via `page.evaluate`.
 * This is best-effort: failures are logged and never thrown, so callers keep running.
 * A missing menu entry is reported as a failure instead of being logged as a success.
 *
 * @param pageArg - the puppeteer page to operate on
 * @param pageNameArg - exact label of the menu entry to click, e.g. 'Normal search'
 */
private navigateToPage = async (
  pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page,
  pageNameArg: string
) => {
  try {
    // The callback is serialized into the page, so it must not capture outer variables;
    // the label is handed over as an explicit argument instead.
    const wasClicked = await pageArg.evaluate((pageNameArg2) => {
      const elements = Array.from(document.querySelectorAll('.ui-menuitem-text > span'));
      const targetElement = elements.find((el) => el.textContent?.trim() === pageNameArg2);
      if (!targetElement) {
        return false;
      }
      (targetElement as HTMLElement).click();
      return true;
    }, pageNameArg);

    if (wasClicked) {
      console.log(`Navigated to the ${pageNameArg} page successfully.`);
    } else {
      console.error(`Failed to navigate to the ${pageNameArg} page: menu entry not found.`);
    }
  } catch (error) {
    console.error(`Failed to navigate to the ${pageNameArg} page:`, error);
  }
};
/**
 * Waits for the search result table body and converts every row into a record.
 *
 * The wait for `#ergebnissForm:selectedSuchErgebnisFormTable_data` uses a 30 second timeout.
 * Each direct `tr` child contributes one entry; a cell that is missing simply
 * leaves the corresponding field `undefined`.
 *
 * @param pageArg - the puppeteer page that is showing (or about to show) the results
 * @returns one record per table row with `name`, `city`, `registrationId` and `businessType`
 */
private waitForResults = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page) => {
  await pageArg.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
    timeout: 30000,
  });

  const businessRecords: BusinessRecord['data'][] = await pageArg.evaluate(() => {
    const resultRows = Array.from(
      document.querySelectorAll('#ergebnissForm\\:selectedSuchErgebnisFormTable_data > tr')
    );

    // One record per row; all values are the trimmed text content of their cell.
    return resultRows.map((resultRow) => {
      const name = resultRow
        .querySelector('td.ui-panelgrid-cell span.marginLeft20')
        ?.textContent?.trim();
      const city = resultRow
        .querySelector('td.ui-panelgrid-cell.sitzSuchErgebnisse span')
        ?.textContent?.trim();
      const registrationId = resultRow
        .querySelector('td.ui-panelgrid-cell.fontTableNameSize')
        ?.textContent?.trim();
      // The vertical status text is stored as the record's business type.
      const businessType = resultRow
        .querySelector('td.ui-panelgrid-cell span.verticalText')
        ?.textContent?.trim();

      return { name, city, registrationId, businessType };
    });
  });

  return businessRecords;
};
/**
 * Sets the result page size to 100 and clicks the button labelled "Find".
 *
 * This is best-effort: every failure is logged and swallowed so the caller can continue.
 * The success message is only logged when a button was actually clicked;
 * a page without a "Find" button is reported as a failure.
 *
 * @param pageArg - the puppeteer page showing the search form
 */
private clickFindButton = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page) => {
  try {
    // Wait until at least one button label has been rendered.
    await pageArg.waitForSelector('span.ui-button-text.ui-c', { timeout: 5000 });

    // Adjust to 100 results per page.
    await pageArg.select('#form\\:ergebnisseProSeite_input', '100');

    // Locate the button by its visible text and click it inside the browser context.
    const wasClicked = await pageArg.evaluate(() => {
      const buttons = Array.from(document.querySelectorAll('span.ui-button-text.ui-c'));
      const targetButton = buttons.find((button) => button.textContent?.trim() === 'Find');
      if (!targetButton) {
        return false;
      }
      // The text lives in a span; prefer clicking the enclosing <button> when there is one.
      const parentButton = targetButton.closest('button') ?? targetButton;
      (parentButton as HTMLElement).click();
      return true;
    });

    if (wasClicked) {
      console.log('Find button clicked successfully!');
    } else {
      console.error('Failed to find or click the "Find" button:', 'no button with that label on the page');
    }
  } catch (error) {
    console.error('Failed to find or click the "Find" button:', error);
  }
};
// parsing stuff
/**
 * Parses a registration string of the form
 * `District court <court name> <HRA|HRB|GnR|VR|PR|GsR> <number>` into its parts.
 *
 * @param input - the raw registration text; may be missing at runtime because
 *   `registrationId` is optional on the scraped records
 * @returns the court, register type and register number, or `undefined` when the
 *   input is empty or does not match the expected pattern
 */
private async parseGermanRegistration(
  input: string
): Promise<BusinessRecord['data']['germanParsedRegistration']> {
  // Scraped rows can lack a registration cell; avoid calling `.match` on undefined.
  if (!input) {
    return undefined;
  }

  const regex = /District court (\p{L}[\p{L}\s-]*?(?:\s*\([\p{L}\s-]+\))?)\s+(HRA|HRB|GnR|VR|PR|GsR)\s+(\d+)/u;
  const match = input.match(regex);
  if (!match) {
    return undefined;
  }

  return {
    court: match[1], // the court name, optionally followed by a parenthesised suffix
    // The regex admits exactly these six register types, so the cast covers all of them.
    type: match[2] as 'HRA' | 'HRB' | 'GnR' | 'PR' | 'VR' | 'GsR',
    number: match[3], // the numeric register number, kept as a string
  };
}
/**
* Search for a company by name
*/
public async searchCompany(companyNameArg: string) {
const page = await this.getNewPage();
await this.navigateToPage(page, 'Normal search');
try {
// Wait for the textarea to appear
await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });
@ -76,63 +173,59 @@ export class HandelsRegister {
console.error('Failed to find or click the radio button:', error);
}
try {
// Wait for the button with the text "Find" to appear
await page.waitForSelector('span.ui-button-text.ui-c', { timeout: 5000 });
await this.clickFindButton(page);
// Locate and click the button using its text
await page.evaluate(() => {
const buttons = Array.from(document.querySelectorAll('span.ui-button-text.ui-c'));
const targetButton = buttons.find((button) => button.textContent?.trim() === 'Find');
if (targetButton) {
const parentButton = targetButton.closest('button') || targetButton;
(parentButton as HTMLElement).click();
}
});
const businessRecords = await this.waitForResults(page);
console.log('Find button clicked successfully!');
} catch (error) {
console.error('Failed to find or click the "Find" button:', error);
for (const record of businessRecords) {
record.germanParsedRegistration = await this.parseGermanRegistration(record.registrationId);
}
await page.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
timeout: 10000,
});
const businessRecords = await page.evaluate(() => {
const rows = document.querySelectorAll(
'#ergebnissForm\\:selectedSuchErgebnisFormTable_data > tr'
);
const records = [];
rows.forEach((row) => {
const nameElement = row.querySelector('td.ui-panelgrid-cell span.marginLeft20');
const cityElement = row.querySelector('td.ui-panelgrid-cell.sitzSuchErgebnisse span');
const statusElement = row.querySelector('td.ui-panelgrid-cell span.verticalText');
const registrationCourtElement = row.querySelector(
'td.ui-panelgrid-cell.fontTableNameSize'
);
const name = nameElement?.textContent?.trim();
const city = cityElement?.textContent?.trim();
const status = statusElement?.textContent?.trim();
const registrationCourt = registrationCourtElement?.textContent?.trim();
// Push parsed data into records array
records.push({
name,
city,
registrationCourt,
businessType: status,
});
});
return records;
});
await page.close();
// Finally, we return an object, which triggers a JSON file download
return businessRecords;
}
/**
 * Looks up one company through the "Normal search" form using its parsed registration
 * (register type, register number and register court).
 *
 * The page opened for the lookup is always closed again, even when a step fails.
 *
 * @param companyArg - parsed registration as produced by `parseGermanRegistration`
 * @returns the records read from the result table (they are also logged to the console)
 * @throws Error when `companyArg` or its register number is missing
 */
public async getSpecificCompany(companyArg: BusinessRecord['data']['germanParsedRegistration']) {
  // Fail early with a clear message instead of a TypeError deep inside the form handling.
  if (!companyArg || companyArg.number === undefined) {
    throw new Error('getSpecificCompany requires a parsed registration with a register number.');
  }

  const page = await this.getNewPage();
  try {
    await this.navigateToPage(page, 'Normal search');

    await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

    // 1) Type of register: open the dropdown and pick the option matching companyArg.type.
    await page.waitForSelector('#form\\:registerArt_label');
    await page.click('#form\\:registerArt_label');
    await page.waitForSelector('#form\\:registerArt_items'); // ensure the options are loaded
    await page.evaluate((type) => {
      const options = Array.from(document.querySelectorAll('#form\\:registerArt_items li'));
      const targetOption = options.find((option) => option.textContent?.trim() === type);
      (targetOption as HTMLElement | undefined)?.click();
    }, companyArg.type); // handed over explicitly because the callback runs in the browser context

    // 2) Register number: type it into the number field.
    await page.waitForSelector('#form\\:registerNummer');
    await page.type('#form\\:registerNummer', companyArg.number);

    // 3) Register court: open the dropdown and pick the option matching companyArg.court.
    await page.waitForSelector('#form\\:registergericht_label');
    await page.click('#form\\:registergericht_label');
    await page.waitForSelector('#form\\:registergericht_items'); // ensure the options are loaded
    await page.evaluate((court) => {
      const options = Array.from(document.querySelectorAll('#form\\:registergericht_items li'));
      const targetOption = options.find((option) => option.textContent?.trim() === court);
      (targetOption as HTMLElement | undefined)?.click();
    }, companyArg.court);

    await this.clickFindButton(page);

    const businessRecords = await this.waitForResults(page);
    console.log(businessRecords);
    return businessRecords;
  } finally {
    // Release the browser page whether or not the lookup succeeded.
    await page.close();
  }
}
}

View File

@ -44,6 +44,8 @@ export class JsonlDataProcessor {
if (!line) continue;
try {
entry = JSON.parse(line);
console.log(JSON.stringify(entry, null, 2));
process.exit(0);
} catch (err) {
console.log(line);
await plugins.smartdelay.delayFor(10000);

View File

@ -25,6 +25,11 @@ export class OpenData {
this.handelsregister = new HandelsRegister(this);
await this.handelsregister.start();
}
/**
 * Runs `processDataFromUrl()` on the JSONL data processor and resolves once it has finished.
 */
public async buildInitialDb(): Promise<void> {
  const dataProcessor = this.jsonLDataProcessor;
  await dataProcessor.processDataFromUrl();
}
public async stop() {
await this.db.close();
await this.handelsregister.stop();

View File

@ -28,4 +28,11 @@ export {
smartpromise,
smartrequest,
smartstream,
}
}
// @tsclass scope
import * as tsclass from '@tsclass/tsclass';
export {
tsclass,
}