From 1eda50ad13786a9c95868aef2b421d91be40ddd5 Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Thu, 2 Jan 2025 01:26:34 +0100 Subject: [PATCH] feat(core): Integrate Handelsregister search for company data retrieval --- changelog.md | 7 + package.json | 1 + pnpm-lock.yaml | 5 +- test/test.ts | 10 ++ ts/00_commitinfo_data.ts | 2 +- ts/classes.handelsregister.ts | 138 ++++++++++++++++++ ...anbusinessdata.ts => classes.jsonldata.ts} | 14 +- ts/classes.main.opendata.ts | 19 ++- ts/plugins.ts | 2 + 9 files changed, 180 insertions(+), 18 deletions(-) create mode 100644 ts/classes.handelsregister.ts rename ts/{classes.germanbusinessdata.ts => classes.jsonldata.ts} (90%) diff --git a/changelog.md b/changelog.md index edf66f0..e834c32 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,12 @@ # Changelog +## 2025-01-02 - 1.2.0 - feat(core) +Integrate Handelsregister search for company data retrieval + +- Added support for searching company data via Handelsregister. +- Replaced GermanBusinessData functionality with JsonlDataProcessor. +- Included smartbrowser dependency for handling web requests to Handelsregister. + ## 2025-01-01 - 1.1.5 - fix(GermanBusinessData) Add console log for total records processed at the end of the stream. 
diff --git a/package.json b/package.json index 0ae8398..dfdb396 100644 --- a/package.json +++ b/package.json @@ -24,6 +24,7 @@ "dependencies": { "@push.rocks/qenv": "^6.1.0", "@push.rocks/smartarchive": "^4.0.39", + "@push.rocks/smartbrowser": "^2.0.6", "@push.rocks/smartdata": "^5.2.10", "@push.rocks/smartdelay": "^3.0.5", "@push.rocks/smartfile": "^11.0.23", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 04a0251..2b667ea 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -14,6 +14,9 @@ importers: '@push.rocks/smartarchive': specifier: ^4.0.39 version: 4.0.39 + '@push.rocks/smartbrowser': + specifier: ^2.0.6 + version: 2.0.6 '@push.rocks/smartdata': specifier: ^5.2.10 version: 5.2.10(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0))(@aws-sdk/credential-providers@3.716.0(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0)))(socks@2.8.3) @@ -7496,7 +7499,7 @@ snapshots: extract-zip@2.0.1: dependencies: - debug: 4.3.4 + debug: 4.4.0 get-stream: 5.2.0 yauzl: 2.10.0 optionalDependencies: diff --git a/test/test.ts b/test/test.ts index fc5d523..1bdc60c 100644 --- a/test/test.ts +++ b/test/test.ts @@ -12,4 +12,14 @@ tap.test('should start the instance', async () => { await testOpenDataInstance.start(); }) +tap.test('should get the data for a company', async () => { + const result = await testOpenDataInstance.handelsregister.getDataForCompany('Volkswagen'); + console.log(result); +}); + +tap.test('should stop the instance', async () => { + await testOpenDataInstance.stop(); +}); + + tap.start() diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts index ad6bef0..cb8700f 100644 --- a/ts/00_commitinfo_data.ts +++ b/ts/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: '@fin.cx/opendata', - version: '1.1.5', + version: '1.2.0', description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.' 
} diff --git a/ts/classes.handelsregister.ts b/ts/classes.handelsregister.ts new file mode 100644 index 0000000..76854ef --- /dev/null +++ b/ts/classes.handelsregister.ts @@ -0,0 +1,138 @@ +import type { OpenData } from './classes.main.opendata.js'; +import * as plugins from './plugins.js'; + +/** + * the HandelsRegister exposed as a class + */ +export class HandelsRegister { + private openDataRef: OpenData; + + public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser(); + + constructor(openDataRef: OpenData) { + this.openDataRef = openDataRef; + } + + public async start() { + await this.smartbrowserInstance.start(); + } + + public async stop() { + await this.smartbrowserInstance.stop(); + } + + /** + * Search for a company by name + */ + public async getDataForCompany(companyNameArg: string) { + const page = await this.smartbrowserInstance.headlessBrowser.newPage(); + await page.setViewport({ width: 1920, height: 1080 }); + await page.goto('https://www.handelsregister.de/'); + await page.evaluate(() => { + const elements = Array.from(document.querySelectorAll('.ui-menuitem-text > span')); + const targetElement = elements.find((el) => el.textContent?.trim() === 'Normal search'); + if (targetElement) { + (targetElement as HTMLElement).click(); + } + }); + + try { + // Wait for the textarea to appear + await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 }); + + // Enter text into the textarea using page.evaluate + const inputText = companyNameArg; + await page.evaluate((text) => { + const textarea = document.querySelector('#form\\:schlagwoerter'); + if (textarea) { + textarea.value = text; // Set the value + // Trigger the change event manually if required + const event = new Event('change', { bubbles: true }); + textarea.dispatchEvent(event); + } + }, inputText); + + console.log('Text entered successfully!'); + } catch (error) { + console.error('Failed to find or enter text into the textarea:', error); + } + + try { + // Wait for the radio 
button's label to appear + await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 }); + + // Click the label to select the radio button + await page.evaluate(() => { + const label = document.querySelector( + 'label[for="form:schlagwortOptionen:0"]' + ); + if (label) { + label.click(); + } + }); + + console.log('Radio button clicked successfully!'); + } catch (error) { + console.error('Failed to find or click the radio button:', error); + } + + try { + // Wait for the button with the text "Find" to appear + await page.waitForSelector('span.ui-button-text.ui-c', { timeout: 5000 }); + + // Locate and click the button using its text + await page.evaluate(() => { + const buttons = Array.from(document.querySelectorAll('span.ui-button-text.ui-c')); + const targetButton = buttons.find((button) => button.textContent?.trim() === 'Find'); + if (targetButton) { + const parentButton = targetButton.closest('button') || targetButton; + (parentButton as HTMLElement).click(); + } + }); + + console.log('Find button clicked successfully!'); + } catch (error) { + console.error('Failed to find or click the "Find" button:', error); + } + + await page.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', { + timeout: 10000, + }); + + const businessRecords = await page.evaluate(() => { + const rows = document.querySelectorAll( + '#ergebnissForm\\:selectedSuchErgebnisFormTable_data > tr' + ); + const records = []; + + rows.forEach((row) => { + const nameElement = row.querySelector('td.ui-panelgrid-cell span.marginLeft20'); + const cityElement = row.querySelector('td.ui-panelgrid-cell.sitzSuchErgebnisse span'); + const statusElement = row.querySelector('td.ui-panelgrid-cell span.verticalText'); + const registrationCourtElement = row.querySelector( + 'td.ui-panelgrid-cell.fontTableNameSize' + ); + + const name = nameElement?.textContent?.trim(); + const city = cityElement?.textContent?.trim(); + const status = statusElement?.textContent?.trim(); 
+ const registrationCourt = registrationCourtElement?.textContent?.trim(); + + // Push parsed data into records array + records.push({ + name, + city, + registrationCourt, + businessType: status, + }); + }); + + return records; + }); + + await page.close(); + + // Return the parsed business records to the caller + return businessRecords; + } +} diff --git a/ts/classes.germanbusinessdata.ts b/ts/classes.jsonldata.ts similarity index 90% rename from ts/classes.germanbusinessdata.ts rename to ts/classes.jsonldata.ts index 00d657f..f934c59 100644 --- a/ts/classes.germanbusinessdata.ts +++ b/ts/classes.jsonldata.ts @@ -2,28 +2,23 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import type { OpenData } from './classes.main.opendata.js'; -export class GermanBusinessData { +export class JsonlDataProcessor { public openDataRef: OpenData; constructor(openDataRefArg: OpenData) { this.openDataRef = openDataRefArg; } - public async start() { - await this.update(); - } - public async stop() {} - - public async update() { + // TODO: define a mapper as argument instead of hard-coding it + public async processDataFromUrl(dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2') { const done = plugins.smartpromise.defer(); const promiseArray: Promise[] = []; - const dataUrl = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2'; const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir); if (!dataExists) { await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir); } else { } - const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrl); + const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg); promiseArray .push // smartarchive.exportToFs(paths.germanBusinessDataDir, 'de_companies_ocdata.jsonl') @@ -60,7 +55,6 @@ export class GermanBusinessData { businessRecord.id = await 
this.openDataRef.CBusinessRecord.getNewId(); businessRecord.data.name = entry.name; await businessRecord.save(); - // console.log(`stored ${businessRecord.data.name}`); } }, finalFunction: async (streamToolsArg) => { diff --git a/ts/classes.main.opendata.ts b/ts/classes.main.opendata.ts index 6115d6f..55a138d 100644 --- a/ts/classes.main.opendata.ts +++ b/ts/classes.main.opendata.ts @@ -1,13 +1,16 @@ import { BusinessRecord } from './classes.businessrecord.js'; -import { GermanBusinessData } from './classes.germanbusinessdata.js'; +import { HandelsRegister } from './classes.handelsregister.js'; +import { JsonlDataProcessor } from './classes.jsonldata.js'; import * as paths from './paths.js'; import * as plugins from './plugins.js'; export class OpenData { - db: plugins.smartdata.SmartdataDb; - germanBusinesses: GermanBusinessData; + public db: plugins.smartdata.SmartdataDb; private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir); + public jsonLDataProcessor: JsonlDataProcessor; + public handelsregister: HandelsRegister; + public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord); public async start() { @@ -18,8 +21,12 @@ export class OpenData { mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'), }); await this.db.init(); - this.germanBusinesses = new GermanBusinessData(this); - await this.germanBusinesses.start(); + this.jsonLDataProcessor = new JsonlDataProcessor(this); + this.handelsregister = new HandelsRegister(this); + await this.handelsregister.start(); + } + public async stop() { + await this.db.close(); + await this.handelsregister.stop(); } - public async stop() {} } \ No newline at end of file diff --git a/ts/plugins.ts b/ts/plugins.ts index 8008aae..82cadd5 100644 --- a/ts/plugins.ts +++ b/ts/plugins.ts @@ -8,6 +8,7 @@ export { // @push.rocks scope import * as qenv from '@push.rocks/qenv'; import * as smartarchive from '@push.rocks/smartarchive'; +import * as smartbrowser from 
'@push.rocks/smartbrowser'; import * as smartdata from '@push.rocks/smartdata'; import * as smartdelay from '@push.rocks/smartdelay'; import * as smartfile from '@push.rocks/smartfile'; @@ -19,6 +20,7 @@ import * as smartstream from '@push.rocks/smartstream'; export { qenv, smartarchive, + smartbrowser, smartdata, smartdelay, smartfile,