feat(core): Integrate Handelsregister search for company data retrieval
This commit is contained in:
parent
506a644c6b
commit
1eda50ad13
@ -1,5 +1,12 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## 2025-01-02 - 1.2.0 - feat(core)
|
||||||
|
Integrate Handelsregister search for company data retrieval
|
||||||
|
|
||||||
|
- Added support for searching company data via Handelsregister.
|
||||||
|
- Replaced GermanBusinessData functionality with JsonlDataProcessor.
|
||||||
|
- Included smartbrowser dependency for handling web requests to Handelsregister.
|
||||||
|
|
||||||
## 2025-01-01 - 1.1.5 - fix(GermanBusinessData)
|
## 2025-01-01 - 1.1.5 - fix(GermanBusinessData)
|
||||||
Add console log for total records processed at the end of the stream.
|
Add console log for total records processed at the end of the stream.
|
||||||
|
|
||||||
|
@ -24,6 +24,7 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@push.rocks/qenv": "^6.1.0",
|
"@push.rocks/qenv": "^6.1.0",
|
||||||
"@push.rocks/smartarchive": "^4.0.39",
|
"@push.rocks/smartarchive": "^4.0.39",
|
||||||
|
"@push.rocks/smartbrowser": "^2.0.6",
|
||||||
"@push.rocks/smartdata": "^5.2.10",
|
"@push.rocks/smartdata": "^5.2.10",
|
||||||
"@push.rocks/smartdelay": "^3.0.5",
|
"@push.rocks/smartdelay": "^3.0.5",
|
||||||
"@push.rocks/smartfile": "^11.0.23",
|
"@push.rocks/smartfile": "^11.0.23",
|
||||||
|
5
pnpm-lock.yaml
generated
5
pnpm-lock.yaml
generated
@ -14,6 +14,9 @@ importers:
|
|||||||
'@push.rocks/smartarchive':
|
'@push.rocks/smartarchive':
|
||||||
specifier: ^4.0.39
|
specifier: ^4.0.39
|
||||||
version: 4.0.39
|
version: 4.0.39
|
||||||
|
'@push.rocks/smartbrowser':
|
||||||
|
specifier: ^2.0.6
|
||||||
|
version: 2.0.6
|
||||||
'@push.rocks/smartdata':
|
'@push.rocks/smartdata':
|
||||||
specifier: ^5.2.10
|
specifier: ^5.2.10
|
||||||
version: 5.2.10(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0))(@aws-sdk/credential-providers@3.716.0(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0)))(socks@2.8.3)
|
version: 5.2.10(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0))(@aws-sdk/credential-providers@3.716.0(@aws-sdk/client-sso-oidc@3.716.0(@aws-sdk/client-sts@3.716.0)))(socks@2.8.3)
|
||||||
@ -7496,7 +7499,7 @@ snapshots:
|
|||||||
|
|
||||||
extract-zip@2.0.1:
|
extract-zip@2.0.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
debug: 4.3.4
|
debug: 4.4.0
|
||||||
get-stream: 5.2.0
|
get-stream: 5.2.0
|
||||||
yauzl: 2.10.0
|
yauzl: 2.10.0
|
||||||
optionalDependencies:
|
optionalDependencies:
|
||||||
|
10
test/test.ts
10
test/test.ts
@ -12,4 +12,14 @@ tap.test('should start the instance', async () => {
|
|||||||
await testOpenDataInstance.start();
|
await testOpenDataInstance.start();
|
||||||
})
|
})
|
||||||
|
|
||||||
|
tap.test('should get the data for a company', async () => {
|
||||||
|
const result = await testOpenDataInstance.handelsregister.getDataForCompany('Volkswagen');
|
||||||
|
console.log(result);
|
||||||
|
});
|
||||||
|
|
||||||
|
tap.test('should stop the instance', async () => {
|
||||||
|
await testOpenDataInstance.stop();
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
tap.start()
|
tap.start()
|
||||||
|
@ -3,6 +3,6 @@
|
|||||||
*/
|
*/
|
||||||
export const commitinfo = {
|
export const commitinfo = {
|
||||||
name: '@fin.cx/opendata',
|
name: '@fin.cx/opendata',
|
||||||
version: '1.1.5',
|
version: '1.2.0',
|
||||||
description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.'
|
description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.'
|
||||||
}
|
}
|
||||||
|
138
ts/classes.handelsregister.ts
Normal file
138
ts/classes.handelsregister.ts
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
import type { OpenData } from './classes.main.opendata.js';
|
||||||
|
import * as plugins from './plugins.js';
|
||||||
|
|
||||||
|
/**
 * One row scraped from the Handelsregister search result table.
 *
 * Every field is `undefined` when the corresponding cell could not be
 * located in the row (the selectors depend on the site's current markup).
 */
export interface HandelsRegisterSearchResult {
  name: string | undefined;
  city: string | undefined;
  registrationCourt: string | undefined;
  /** Text of the row's `span.verticalText` cell. */
  businessType: string | undefined;
}

/**
 * The HandelsRegister exposed as a class.
 *
 * Drives a headless browser (via `plugins.smartbrowser`) against
 * https://www.handelsregister.de/ to look up companies by name.
 * Call `start()` before searching and `stop()` when done.
 */
export class HandelsRegister {
  private openDataRef: OpenData;

  public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser();

  constructor(openDataRef: OpenData) {
    this.openDataRef = openDataRef;
  }

  /** Starts the underlying browser instance. */
  public async start() {
    await this.smartbrowserInstance.start();
  }

  /** Stops the underlying browser instance. */
  public async stop() {
    await this.smartbrowserInstance.stop();
  }

  /**
   * Search for a company by name.
   *
   * Opens a fresh page, navigates to the "Normal search" form, fills in the
   * keyword field, selects the first keyword option, clicks "Find" and then
   * scrapes the result table.
   *
   * The form-filling steps are best-effort: a failure is logged and the flow
   * continues. The final wait for the result table (10 s timeout) is not
   * caught, so a search that never reaches the result page rejects.
   * The page is always closed, whether the search succeeds or fails.
   *
   * NOTE(review): menu and button matching relies on the English labels
   * 'Normal search' and 'Find' - confirm the site is served in English in
   * the environment this runs in.
   *
   * @param companyNameArg the keyword(s) to enter into the search field
   * @returns one record per row of the result table
   */
  public async getDataForCompany(
    companyNameArg: string
  ): Promise<HandelsRegisterSearchResult[]> {
    const page = await this.smartbrowserInstance.headlessBrowser.newPage();

    // try/finally guarantees the page is released even when a step rejects;
    // otherwise every failed search would leak an open browser page.
    try {
      await page.setViewport({ width: 1920, height: 1080 });
      await page.goto('https://www.handelsregister.de/');

      // Open the "Normal search" form through the menu entry with that label.
      // A missing entry is tolerated here; the waits below will surface it.
      await page.evaluate(() => {
        const elements = Array.from(document.querySelectorAll('.ui-menuitem-text > span'));
        const targetElement = elements.find((el) => el.textContent?.trim() === 'Normal search');
        if (targetElement) {
          (targetElement as HTMLElement).click();
        }
      });

      try {
        // Wait for the textarea to appear
        await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

        // Set the value inside the page and report whether the element was found.
        const textEntered = await page.evaluate((text) => {
          const textarea = document.querySelector<HTMLTextAreaElement>('#form\\:schlagwoerter');
          if (!textarea) {
            return false;
          }
          textarea.value = text;
          // Trigger the change event manually so listeners on the form notice the new value
          textarea.dispatchEvent(new Event('change', { bubbles: true }));
          return true;
        }, companyNameArg);

        if (!textEntered) {
          throw new Error('textarea "#form:schlagwoerter" not found');
        }
        console.log('Text entered successfully!');
      } catch (error) {
        console.error('Failed to find or enter text into the textarea:', error);
      }

      try {
        // Wait for the radio button's label to appear
        await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 });

        // Click the label to select the radio button
        const radioClicked = await page.evaluate(() => {
          const label = document.querySelector<HTMLLabelElement>(
            'label[for="form:schlagwortOptionen:0"]'
          );
          if (!label) {
            return false;
          }
          label.click();
          return true;
        });

        if (!radioClicked) {
          throw new Error('label for "form:schlagwortOptionen:0" not found');
        }
        console.log('Radio button clicked successfully!');
      } catch (error) {
        console.error('Failed to find or click the radio button:', error);
      }

      try {
        // The selector matches every button caption, not only "Find", so the
        // wait succeeding does not prove the "Find" button exists.
        await page.waitForSelector('span.ui-button-text.ui-c', { timeout: 5000 });

        // Locate and click the button using its text
        const findClicked = await page.evaluate(() => {
          const buttons = Array.from(document.querySelectorAll('span.ui-button-text.ui-c'));
          const targetButton = buttons.find((button) => button.textContent?.trim() === 'Find');
          if (!targetButton) {
            return false;
          }
          // Prefer the enclosing <button>; fall back to the caption span itself.
          const parentButton = targetButton.closest('button') || targetButton;
          (parentButton as HTMLElement).click();
          return true;
        });

        if (!findClicked) {
          throw new Error('no button with caption "Find" found');
        }
        console.log('Find button clicked successfully!');
      } catch (error) {
        console.error('Failed to find or click the "Find" button:', error);
      }

      // Not caught on purpose: without a result table there is nothing to return.
      await page.waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
        timeout: 10000,
      });

      const businessRecords = await page.evaluate((): HandelsRegisterSearchResult[] => {
        // This callback runs inside the page, so the helper must be defined here.
        const textOf = (row: Element, selector: string): string | undefined =>
          row.querySelector(selector)?.textContent?.trim();

        const rows = document.querySelectorAll(
          '#ergebnissForm\\:selectedSuchErgebnisFormTable_data > tr'
        );

        return Array.from(rows, (row) => ({
          name: textOf(row, 'td.ui-panelgrid-cell span.marginLeft20'),
          city: textOf(row, 'td.ui-panelgrid-cell.sitzSuchErgebnisse span'),
          registrationCourt: textOf(row, 'td.ui-panelgrid-cell.fontTableNameSize'),
          businessType: textOf(row, 'td.ui-panelgrid-cell span.verticalText'),
        }));
      });

      // Return the scraped rows to the caller.
      return businessRecords;
    } finally {
      await page.close();
    }
  }
}
|
@ -2,28 +2,23 @@ import * as plugins from './plugins.js';
|
|||||||
import * as paths from './paths.js';
|
import * as paths from './paths.js';
|
||||||
import type { OpenData } from './classes.main.opendata.js';
|
import type { OpenData } from './classes.main.opendata.js';
|
||||||
|
|
||||||
export class GermanBusinessData {
|
export class JsonlDataProcessor {
|
||||||
public openDataRef: OpenData;
|
public openDataRef: OpenData;
|
||||||
constructor(openDataRefArg: OpenData) {
|
constructor(openDataRefArg: OpenData) {
|
||||||
this.openDataRef = openDataRefArg;
|
this.openDataRef = openDataRefArg;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async start() {
|
// TODO: define a mapper as argument instead of hard-coding it
|
||||||
await this.update();
|
public async processDataFromUrl(dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2') {
|
||||||
}
|
|
||||||
public async stop() {}
|
|
||||||
|
|
||||||
public async update() {
|
|
||||||
const done = plugins.smartpromise.defer();
|
const done = plugins.smartpromise.defer();
|
||||||
const promiseArray: Promise<any>[] = [];
|
const promiseArray: Promise<any>[] = [];
|
||||||
const dataUrl = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2';
|
|
||||||
const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir);
|
const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir);
|
||||||
if (!dataExists) {
|
if (!dataExists) {
|
||||||
await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);
|
await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);
|
||||||
} else {
|
} else {
|
||||||
}
|
}
|
||||||
|
|
||||||
const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrl);
|
const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg);
|
||||||
promiseArray
|
promiseArray
|
||||||
.push
|
.push
|
||||||
// smartarchive.exportToFs(paths.germanBusinessDataDir, 'de_companies_ocdata.jsonl')
|
// smartarchive.exportToFs(paths.germanBusinessDataDir, 'de_companies_ocdata.jsonl')
|
||||||
@ -60,7 +55,6 @@ export class GermanBusinessData {
|
|||||||
businessRecord.id = await this.openDataRef.CBusinessRecord.getNewId();
|
businessRecord.id = await this.openDataRef.CBusinessRecord.getNewId();
|
||||||
businessRecord.data.name = entry.name;
|
businessRecord.data.name = entry.name;
|
||||||
await businessRecord.save();
|
await businessRecord.save();
|
||||||
// console.log(`stored ${businessRecord.data.name}`);
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
finalFunction: async (streamToolsArg) => {
|
finalFunction: async (streamToolsArg) => {
|
@ -1,13 +1,16 @@
|
|||||||
import { BusinessRecord } from './classes.businessrecord.js';
|
import { BusinessRecord } from './classes.businessrecord.js';
|
||||||
import { GermanBusinessData } from './classes.germanbusinessdata.js';
|
import { HandelsRegister } from './classes.handelsregister.js';
|
||||||
|
import { JsonlDataProcessor } from './classes.jsonldata.js';
|
||||||
import * as paths from './paths.js';
|
import * as paths from './paths.js';
|
||||||
import * as plugins from './plugins.js';
|
import * as plugins from './plugins.js';
|
||||||
|
|
||||||
export class OpenData {
|
export class OpenData {
|
||||||
db: plugins.smartdata.SmartdataDb;
|
public db: plugins.smartdata.SmartdataDb;
|
||||||
germanBusinesses: GermanBusinessData;
|
|
||||||
private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir);
|
private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir);
|
||||||
|
|
||||||
|
public jsonLDataProcessor: JsonlDataProcessor;
|
||||||
|
public handelsregister: HandelsRegister;
|
||||||
|
|
||||||
public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord);
|
public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord);
|
||||||
|
|
||||||
public async start() {
|
public async start() {
|
||||||
@ -18,8 +21,12 @@ export class OpenData {
|
|||||||
mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
|
mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
|
||||||
});
|
});
|
||||||
await this.db.init();
|
await this.db.init();
|
||||||
this.germanBusinesses = new GermanBusinessData(this);
|
this.jsonLDataProcessor = new JsonlDataProcessor(this);
|
||||||
await this.germanBusinesses.start();
|
this.handelsregister = new HandelsRegister(this);
|
||||||
|
await this.handelsregister.start();
|
||||||
|
}
|
||||||
|
public async stop() {
|
||||||
|
await this.db.close();
|
||||||
|
await this.handelsregister.stop();
|
||||||
}
|
}
|
||||||
public async stop() {}
|
|
||||||
}
|
}
|
@ -8,6 +8,7 @@ export {
|
|||||||
// @push.rocks scope
|
// @push.rocks scope
|
||||||
import * as qenv from '@push.rocks/qenv';
|
import * as qenv from '@push.rocks/qenv';
|
||||||
import * as smartarchive from '@push.rocks/smartarchive';
|
import * as smartarchive from '@push.rocks/smartarchive';
|
||||||
|
import * as smartbrowser from '@push.rocks/smartbrowser';
|
||||||
import * as smartdata from '@push.rocks/smartdata';
|
import * as smartdata from '@push.rocks/smartdata';
|
||||||
import * as smartdelay from '@push.rocks/smartdelay';
|
import * as smartdelay from '@push.rocks/smartdelay';
|
||||||
import * as smartfile from '@push.rocks/smartfile';
|
import * as smartfile from '@push.rocks/smartfile';
|
||||||
@ -19,6 +20,7 @@ import * as smartstream from '@push.rocks/smartstream';
|
|||||||
export {
|
export {
|
||||||
qenv,
|
qenv,
|
||||||
smartarchive,
|
smartarchive,
|
||||||
|
smartbrowser,
|
||||||
smartdata,
|
smartdata,
|
||||||
smartdelay,
|
smartdelay,
|
||||||
smartfile,
|
smartfile,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user