feat(core): Integrate Handelsregister search for company data retrieval

This commit is contained in:
Philipp Kunz 2025-01-02 01:26:34 +01:00
parent 506a644c6b
commit 1eda50ad13
9 changed files with 180 additions and 18 deletions

View File

@ -1,5 +1,12 @@
# Changelog
## 2025-01-02 - 1.2.0 - feat(core)
Integrate Handelsregister search for company data retrieval
- Added support for searching company data via Handelsregister.
- Replaced GermanBusinessData functionality with JsonlDataProcessor.
- Included smartbrowser dependency for handling web requests to Handelsregister.
## 2025-01-01 - 1.1.5 - fix(GermanBusinessData)
Add console log for total records processed at the end of the stream.

View File

@ -24,6 +24,7 @@
"dependencies": {
"@push.rocks/qenv": "^6.1.0",
"@push.rocks/smartarchive": "^4.0.39",
"@push.rocks/smartbrowser": "^2.0.6",
"@push.rocks/smartdata": "^5.2.10",
"@push.rocks/smartdelay": "^3.0.5",
"@push.rocks/smartfile": "^11.0.23",

5
pnpm-lock.yaml generated
View File

@ -14,6 +14,9 @@ importers:
'@push.rocks/smartarchive':
specifier: ^4.0.39
version: 4.0.39
'@push.rocks/smartbrowser':
specifier: ^2.0.6
version: 2.0.6
'@push.rocks/smartdata':
specifier: ^5.2.10
version: 5.2.10(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0))(@aws-sdk/credential-providers@3.716.0(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0)))(socks@2.8.3)
@ -7496,7 +7499,7 @@ snapshots:
extract-zip@2.0.1:
dependencies:
debug: 4.3.4
debug: 4.4.0
get-stream: 5.2.0
yauzl: 2.10.0
optionalDependencies:

View File

@ -12,4 +12,14 @@ tap.test('should start the instance', async () => {
await testOpenDataInstance.start();
})
tap.test('should get the data for a company', async () => {
const result = await testOpenDataInstance.handelsregister.getDataForCompany('Volkswagen');
console.log(result);
});
tap.test('should stop the instance', async () => {
await testOpenDataInstance.stop();
});
tap.start()

View File

@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: '@fin.cx/opendata',
version: '1.1.5',
version: '1.2.0',
description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.'
}

View File

@ -0,0 +1,138 @@
import type { OpenData } from './classes.main.opendata.js';
import * as plugins from './plugins.js';
/**
 * One row scraped from the search result table on handelsregister.de.
 * Every field is `undefined` when the corresponding cell is not found in the row.
 */
export interface HandelsRegisterSearchResult {
  name: string | undefined;
  city: string | undefined;
  registrationCourt: string | undefined;
  businessType: string | undefined;
}

/**
 * the HandelsRegister exposed as a class
 *
 * Drives a headless browser (via smartbrowser) through the search form on
 * https://www.handelsregister.de/ and scrapes the result table.
 */
export class HandelsRegister {
  private openDataRef: OpenData;
  public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser();

  constructor(openDataRef: OpenData) {
    this.openDataRef = openDataRef;
  }

  /**
   * Starts the underlying browser instance. Must be called before searching.
   */
  public async start() {
    await this.smartbrowserInstance.start();
  }

  /**
   * Stops the underlying browser instance.
   */
  public async stop() {
    await this.smartbrowserInstance.stop();
  }

  /**
   * Search for a company by name
   *
   * The individual form interactions (entering the text, selecting the keyword
   * option, pressing "Find") are best-effort: a failure is logged and the flow
   * continues. The final wait for the result table is what ultimately rejects
   * when the search did not go through.
   *
   * @param companyNameArg the search text entered into the keyword textarea
   * @returns one record per row of the result table
   * @throws when navigation fails or the result table does not appear within 10 seconds
   */
  public async getDataForCompany(
    companyNameArg: string
  ): Promise<HandelsRegisterSearchResult[]> {
    const page = await this.smartbrowserInstance.headlessBrowser.newPage();
    // Everything after newPage() runs inside try/finally so the page is closed
    // even when navigation or the result-table wait rejects.
    try {
      await page.setViewport({ width: 1920, height: 1080 });
      await page.goto('https://www.handelsregister.de/');

      // Open the search form through the menu entry labelled "Normal search".
      await page.evaluate(() => {
        const elements = Array.from(document.querySelectorAll('.ui-menuitem-text > span'));
        const targetElement = elements.find((el) => el.textContent?.trim() === 'Normal search');
        if (targetElement) {
          (targetElement as HTMLElement).click();
        }
      });

      try {
        // Wait for the textarea to appear
        await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

        // Set the value inside the page and dispatch a change event manually,
        // because assigning `.value` does not emit one by itself.
        await page.evaluate((text) => {
          const textarea = document.querySelector<HTMLTextAreaElement>('#form\\:schlagwoerter');
          if (textarea) {
            textarea.value = text;
            textarea.dispatchEvent(new Event('change', { bubbles: true }));
          }
        }, companyNameArg);
        console.log('Text entered successfully!');
      } catch (error) {
        console.error('Failed to find or enter text into the textarea:', error);
      }

      try {
        // Wait for the radio button's label to appear
        await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 });

        // Click the label to select the first keyword-option radio button
        await page.evaluate(() => {
          const label = document.querySelector<HTMLLabelElement>(
            'label[for="form:schlagwortOptionen:0"]'
          );
          if (label) {
            label.click();
          }
        });
        console.log('Radio button clicked successfully!');
      } catch (error) {
        console.error('Failed to find or click the radio button:', error);
      }

      try {
        // Wait for the button with the text "Find" to appear
        await page.waitForSelector('span.ui-button-text.ui-c', { timeout: 5000 });

        // The selector matches the button caption spans; click the enclosing
        // <button> when there is one, otherwise the span itself.
        await page.evaluate(() => {
          const buttons = Array.from(document.querySelectorAll('span.ui-button-text.ui-c'));
          const targetButton = buttons.find((button) => button.textContent?.trim() === 'Find');
          if (targetButton) {
            const parentButton = targetButton.closest('button') || targetButton;
            (parentButton as HTMLElement).click();
          }
        });
        console.log('Find button clicked successfully!');
      } catch (error) {
        console.error('Failed to find or click the "Find" button:', error);
      }

      // Not wrapped in a local try/catch: without a result table there is nothing to return.
      await page.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
        timeout: 10000,
      });

      // Runs inside the page: turn each result row into a plain serializable record.
      const businessRecords = await page.evaluate(() => {
        const rows = document.querySelectorAll(
          '#ergebnissForm\\:selectedSuchErgebnisFormTable_data > tr'
        );
        const records: HandelsRegisterSearchResult[] = Array.from(rows, (row) => {
          const nameElement = row.querySelector('td.ui-panelgrid-cell span.marginLeft20');
          const cityElement = row.querySelector('td.ui-panelgrid-cell.sitzSuchErgebnisse span');
          const statusElement = row.querySelector('td.ui-panelgrid-cell span.verticalText');
          const registrationCourtElement = row.querySelector(
            'td.ui-panelgrid-cell.fontTableNameSize'
          );
          return {
            name: nameElement?.textContent?.trim(),
            city: cityElement?.textContent?.trim(),
            registrationCourt: registrationCourtElement?.textContent?.trim(),
            // the text of the `verticalText` cell is exposed as businessType
            businessType: statusElement?.textContent?.trim(),
          };
        });
        return records;
      });

      // Return the scraped records to the caller.
      return businessRecords;
    } finally {
      await page.close();
    }
  }
}

View File

@ -2,28 +2,23 @@ import * as plugins from './plugins.js';
import * as paths from './paths.js';
import type { OpenData } from './classes.main.opendata.js';
export class GermanBusinessData {
export class JsonlDataProcessor {
public openDataRef: OpenData;
constructor(openDataRefArg: OpenData) {
this.openDataRef = openDataRefArg;
}
public async start() {
await this.update();
}
public async stop() {}
public async update() {
// TODO: define a mapper as argument instead of hard-coding it
public async processDataFromUrl(dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2') {
const done = plugins.smartpromise.defer();
const promiseArray: Promise<any>[] = [];
const dataUrl = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2';
const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir);
if (!dataExists) {
await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);
} else {
}
const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrl);
const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg);
promiseArray
.push
// smartarchive.exportToFs(paths.germanBusinessDataDir, 'de_companies_ocdata.jsonl')
@ -60,7 +55,6 @@ export class GermanBusinessData {
businessRecord.id = await this.openDataRef.CBusinessRecord.getNewId();
businessRecord.data.name = entry.name;
await businessRecord.save();
// console.log(`stored ${businessRecord.data.name}`);
}
},
finalFunction: async (streamToolsArg) => {

View File

@ -1,13 +1,16 @@
import { BusinessRecord } from './classes.businessrecord.js';
import { GermanBusinessData } from './classes.germanbusinessdata.js';
import { HandelsRegister } from './classes.handelsregister.js';
import { JsonlDataProcessor } from './classes.jsonldata.js';
import * as paths from './paths.js';
import * as plugins from './plugins.js';
export class OpenData {
db: plugins.smartdata.SmartdataDb;
germanBusinesses: GermanBusinessData;
public db: plugins.smartdata.SmartdataDb;
private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir);
public jsonLDataProcessor: JsonlDataProcessor;
public handelsregister: HandelsRegister;
public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord);
public async start() {
@ -18,8 +21,12 @@ export class OpenData {
mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
});
await this.db.init();
this.germanBusinesses = new GermanBusinessData(this);
await this.germanBusinesses.start();
this.jsonLDataProcessor = new JsonlDataProcessor(this);
this.handelsregister = new HandelsRegister(this);
await this.handelsregister.start();
}
public async stop() {
await this.db.close();
await this.handelsregister.stop();
}
public async stop() {}
}

View File

@ -8,6 +8,7 @@ export {
// @push.rocks scope
import * as qenv from '@push.rocks/qenv';
import * as smartarchive from '@push.rocks/smartarchive';
import * as smartbrowser from '@push.rocks/smartbrowser';
import * as smartdata from '@push.rocks/smartdata';
import * as smartdelay from '@push.rocks/smartdelay';
import * as smartfile from '@push.rocks/smartfile';
@ -19,6 +20,7 @@ import * as smartstream from '@push.rocks/smartstream';
export {
qenv,
smartarchive,
smartbrowser,
smartdata,
smartdelay,
smartfile,