fix(core): Fix issues with JSONL data processing and improve error handling in business record validation

This commit is contained in:
Philipp Kunz 2025-01-04 13:40:50 +01:00
parent 8ebbc16bcd
commit 9c3f012da7
11 changed files with 161 additions and 53 deletions

View File

@ -1,5 +1,13 @@
# Changelog # Changelog
## 2025-01-04 - 1.4.1 - fix(core)
Fix issues with JSONL data processing and improve error handling in business record validation
- Fixed JSONL data processing by adding concurrent processing for each JSON line to enhance performance.
- Added validation logic in BusinessRecord class to ensure that the mandatory fields are checked.
- Adjusted environment variable loading in OpenData class to ensure correct database initialization.
- Included missing dependencies and exports in the project files to ensure proper functionality.
## 2025-01-04 - 1.4.0 - feat(HandelsRegister) ## 2025-01-04 - 1.4.0 - feat(HandelsRegister)
Add file download functionality to HandelsRegister Add file download functionality to HandelsRegister

View File

@ -30,5 +30,8 @@
"npmci": { "npmci": {
"npmGlobalTools": [], "npmGlobalTools": [],
"npmAccessLevel": "public" "npmAccessLevel": "public"
},
"tsdoc": {
"legal": "\n## License and Legal Information\n\nThis repository contains open-source code that is licensed under the MIT License. A copy of the MIT License can be found in the [license](license) file within this repository. \n\n**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.\n\n### Trademarks\n\nThis project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH and are not included within the scope of the MIT license granted herein. Use of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines, and any usage must be approved in writing by Task Venture Capital GmbH.\n\n### Company Information\n\nTask Venture Capital GmbH \nRegistered at District court Bremen HRB 35230 HB, Germany\n\nFor any legal inquiries or if you require further information, please contact us via email at hello@task.vc.\n\nBy using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works.\n"
} }
} }

View File

@ -25,6 +25,7 @@
"@push.rocks/lik": "^6.1.0", "@push.rocks/lik": "^6.1.0",
"@push.rocks/qenv": "^6.1.0", "@push.rocks/qenv": "^6.1.0",
"@push.rocks/smartarchive": "^4.0.39", "@push.rocks/smartarchive": "^4.0.39",
"@push.rocks/smartarray": "^1.1.0",
"@push.rocks/smartbrowser": "^2.0.8", "@push.rocks/smartbrowser": "^2.0.8",
"@push.rocks/smartdata": "^5.2.10", "@push.rocks/smartdata": "^5.2.10",
"@push.rocks/smartdelay": "^3.0.5", "@push.rocks/smartdelay": "^3.0.5",
@ -34,6 +35,7 @@
"@push.rocks/smartrequest": "^2.0.23", "@push.rocks/smartrequest": "^2.0.23",
"@push.rocks/smartstream": "^3.2.5", "@push.rocks/smartstream": "^3.2.5",
"@push.rocks/smartunique": "^3.0.9", "@push.rocks/smartunique": "^3.0.9",
"@push.rocks/smartxml": "^1.1.1",
"@tsclass/tsclass": "^4.2.0" "@tsclass/tsclass": "^4.2.0"
}, },
"repository": { "repository": {

11
pnpm-lock.yaml generated
View File

@ -17,6 +17,9 @@ importers:
'@push.rocks/smartarchive': '@push.rocks/smartarchive':
specifier: ^4.0.39 specifier: ^4.0.39
version: 4.0.39 version: 4.0.39
'@push.rocks/smartarray':
specifier: ^1.1.0
version: 1.1.0
'@push.rocks/smartbrowser': '@push.rocks/smartbrowser':
specifier: ^2.0.8 specifier: ^2.0.8
version: 2.0.8 version: 2.0.8
@ -44,6 +47,9 @@ importers:
'@push.rocks/smartunique': '@push.rocks/smartunique':
specifier: ^3.0.9 specifier: ^3.0.9
version: 3.0.9 version: 3.0.9
'@push.rocks/smartxml':
specifier: ^1.1.1
version: 1.1.1
'@tsclass/tsclass': '@tsclass/tsclass':
specifier: ^4.2.0 specifier: ^4.2.0
version: 4.2.0 version: 4.2.0
@ -747,6 +753,9 @@ packages:
'@push.rocks/smartarchive@4.0.39': '@push.rocks/smartarchive@4.0.39':
resolution: {integrity: sha512-e8xOOa7h4WlZMhjEd7IjAL/wgLBS3yJ6+Q7eZognHg1cNE/TOZ1kYrAN9eo8xmTtd+37hY9NXayk2JwXdXEvyA==} resolution: {integrity: sha512-e8xOOa7h4WlZMhjEd7IjAL/wgLBS3yJ6+Q7eZognHg1cNE/TOZ1kYrAN9eo8xmTtd+37hY9NXayk2JwXdXEvyA==}
'@push.rocks/smartarray@1.1.0':
resolution: {integrity: sha512-b5YgBmUdglOJH8zeUf2ZWdPCoqySgwvkycRi2BhA9zVZHkpASh39Ej0q0fxFJetlUVyYqGfVoMVjbVrLFfFV7g==}
'@push.rocks/smartbrowser@2.0.8': '@push.rocks/smartbrowser@2.0.8':
resolution: {integrity: sha512-0KWRZj3TuKo/sNwgPbiSE6WL+TMeR19t1JmXBZWh9n8iA2mpc4HhMrQAndEUdRCkx5ofSaHWojIRVFzGChj0Dg==} resolution: {integrity: sha512-0KWRZj3TuKo/sNwgPbiSE6WL+TMeR19t1JmXBZWh9n8iA2mpc4HhMrQAndEUdRCkx5ofSaHWojIRVFzGChj0Dg==}
@ -5364,6 +5373,8 @@ snapshots:
tar-stream: 3.1.7 tar-stream: 3.1.7
through: 2.3.8 through: 2.3.8
'@push.rocks/smartarray@1.1.0': {}
'@push.rocks/smartbrowser@2.0.8': '@push.rocks/smartbrowser@2.0.8':
dependencies: dependencies:
'@push.rocks/smartdelay': 3.0.5 '@push.rocks/smartdelay': 3.0.5

View File

@ -215,4 +215,22 @@ When working with business data, ensuring integrity and accuracy is crucial. Eac
The `@fin.cx/opendata` module provides an extensive toolset for accessing and managing business data, particularly for companies based in Germany. Its functionalities include creating, updating, retrieving, and deleting business records, as well as keeping them current with the latest open data releases. This makes it an invaluable asset for developers aiming to integrate open data seamlessly into their systems, ensuring robust data management capabilities within their applications. The `@fin.cx/opendata` module provides an extensive toolset for accessing and managing business data, particularly for companies based in Germany. Its functionalities include creating, updating, retrieving, and deleting business records, as well as keeping them current with the latest open data releases. This makes it an invaluable asset for developers aiming to integrate open data seamlessly into their systems, ensuring robust data management capabilities within their applications.
Happy exploring and integrating open data into your projects! Happy exploring and integrating open data into your projects!
undefined
## License and Legal Information
This repository contains open-source code that is licensed under the MIT License. A copy of the MIT License can be found in the [license](license) file within this repository.
**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.
### Trademarks
This project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH and are not included within the scope of the MIT license granted herein. Use of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines, and any usage must be approved in writing by Task Venture Capital GmbH.
### Company Information
Task Venture Capital GmbH
Registered at District court Bremen HRB 35230 HB, Germany
For any legal inquiries or if you require further information, please contact us via email at hello@task.vc.
By using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works.

View File

@ -12,7 +12,7 @@ tap.test('should start the instance', async () => {
await testOpenDataInstance.start(); await testOpenDataInstance.start();
}) })
tap.skip.test('should build initial data', async () => { tap.test('should build initial data', async () => {
await testOpenDataInstance.buildInitialDb(); await testOpenDataInstance.buildInitialDb();
}); });

View File

@ -3,6 +3,6 @@
*/ */
export const commitinfo = { export const commitinfo = {
name: '@fin.cx/opendata', name: '@fin.cx/opendata',
version: '1.4.0', version: '1.4.1',
description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.' description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.'
} }

View File

@ -11,6 +11,9 @@ export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc<
@plugins.smartdata.svDb() @plugins.smartdata.svDb()
data: { data: {
name?: string; name?: string;
startDate?: string;
endDate?: string;
status?: 'active' | 'liquidating' | 'closed';
address?: string; address?: string;
postalCode?: string; postalCode?: string;
city?: string; city?: string;
@ -42,4 +45,11 @@ export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc<
purpose?: string; purpose?: string;
lastUpdate?: string; lastUpdate?: string;
} = {}; } = {};
/**
 * Checks that this record carries the mandatory `data.name` field.
 *
 * Resolves without a value when a truthy name is present; otherwise the
 * returned promise rejects with `Error('Name is required.')`.
 *
 * NOTE(review): the previous comment described this as validating the record
 * against the Handelsregister; no registry lookup happens here — only the
 * name presence check below. Confirm whether a lookup is still planned.
 */
public async validate() {
  // Guard clause: a present name means there is nothing left to verify.
  if (this.data.name) {
    return;
  }
  throw new Error('Name is required.');
}
} }

View File

@ -2,16 +2,55 @@ import * as plugins from './plugins.js';
import * as paths from './paths.js'; import * as paths from './paths.js';
import type { OpenData } from './classes.main.opendata.js'; import type { OpenData } from './classes.main.opendata.js';
export class JsonlDataProcessor { export type SeedEntryType = {
public openDataRef: OpenData; all_attributes: {
constructor(openDataRefArg: OpenData) { _registerArt: string;
this.openDataRef = openDataRefArg; _registerNummer: string;
additional_data: {
AD: boolean;
CD: boolean;
DK: boolean;
HD: boolean;
SI: boolean;
UT: boolean;
VÖ: boolean;
};
federal_state: string;
native_company_number: string;
registered_office: string;
registrar: string;
};
company_number: string;
current_status: string;
jurisdiction_code: string;
name: string;
officers: {
name: string;
other_attributes: {
city: string;
firstname: string;
flag: string;
lastname: string;
};
position: string;
start_date: string; // ISO 8601 date string
type: string;
}[];
registered_address: string;
retrieved_at: string; // ISO 8601 date string
};
export class JsonlDataProcessor<T> {
public forEachFunction: (entryArg: T) => Promise<void>;
constructor(forEachFunctionArg: typeof this.forEachFunction) {
this.forEachFunction = forEachFunctionArg;
} }
// TODO: define a mapper as argument instead of hard-coding it // TODO: define a mapper as argument instead of hard-coding it
public async processDataFromUrl(dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2') { public async processDataFromUrl(
dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2'
) {
const done = plugins.smartpromise.defer(); const done = plugins.smartpromise.defer();
const promiseArray: Promise<any>[] = [];
const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir); const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir);
if (!dataExists) { if (!dataExists) {
await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir); await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);
@ -19,10 +58,6 @@ export class JsonlDataProcessor {
} }
const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg); const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg);
promiseArray
.push
// smartarchive.exportToFs(paths.germanBusinessDataDir, 'de_companies_ocdata.jsonl')
();
const jsonlDataStream = await smartarchive.exportToStreamOfStreamFiles(); const jsonlDataStream = await smartarchive.exportToStreamOfStreamFiles();
let totalRecordsCounter = 0; let totalRecordsCounter = 0;
let nextRest: string = ''; let nextRest: string = '';
@ -39,44 +74,37 @@ export class JsonlDataProcessor {
const lines = currentString.split('\n'); const lines = currentString.split('\n');
nextRest = lines.pop(); nextRest = lines.pop();
console.log(`Got another ${lines.length} records.`); console.log(`Got another ${lines.length} records.`);
for (const line of lines) { const concurrentProcessor = new plugins.smartarray.ConcurrentProcessor<string>(
let entry: any; async (line) => {
if (!line) continue; let entry: T;
try { if (!line) return;
entry = JSON.parse(line); try {
console.log(JSON.stringify(entry, null, 2)); entry = JSON.parse(line);
process.exit(0); } catch (err) {
} catch (err) { console.log(line);
console.log(line); await plugins.smartdelay.delayFor(10000);
await plugins.smartdelay.delayFor(10000); }
} if (!entry) return;
if (!entry) continue; totalRecordsCounter++;
totalRecordsCounter++; if (totalRecordsCounter % 10000 === 0)
if (totalRecordsCounter % 10000 === 0) console.log(`${totalRecordsCounter} total records.`); console.log(`${totalRecordsCounter} total records.`);
const businessRecord = new this.openDataRef.CBusinessRecord(); await this.forEachFunction(entry);
businessRecord.id = await this.openDataRef.CBusinessRecord.getNewId(); },
businessRecord.data.name = entry.name; 1000
await businessRecord.save(); );
} await concurrentProcessor.process(lines);
}, },
finalFunction: async (streamToolsArg) => { finalFunction: async (streamToolsArg) => {
console.log(`finished processing ${totalRecordsCounter} records.`); console.log(`finished processing ${totalRecordsCounter} records.`);
if (!nextRest) return; if (!nextRest) return;
JSON.parse(nextRest); JSON.parse(nextRest);
} done.resolve();
},
}) })
); );
}, },
}) })
); );
} await done.promise;
public async getBusinessRecordByName(nameArg: string) {
const businessRecord = await this.openDataRef.CBusinessRecord.getInstance({
data: {
name: { $regex: `${nameArg}`, $options: "i" } as any,
}
});
return businessRecord;
} }
} }

View File

@ -1,6 +1,6 @@
import { BusinessRecord } from './classes.businessrecord.js'; import { BusinessRecord } from './classes.businessrecord.js';
import { HandelsRegister } from './classes.handelsregister.js'; import { HandelsRegister } from './classes.handelsregister.js';
import { JsonlDataProcessor } from './classes.jsonldata.js'; import { JsonlDataProcessor, type SeedEntryType } from './classes.jsonldata.js';
import * as paths from './paths.js'; import * as paths from './paths.js';
import * as plugins from './plugins.js'; import * as plugins from './plugins.js';
@ -8,20 +8,30 @@ export class OpenData {
public db: plugins.smartdata.SmartdataDb; public db: plugins.smartdata.SmartdataDb;
private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir); private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir);
public jsonLDataProcessor: JsonlDataProcessor; public jsonLDataProcessor: JsonlDataProcessor<SeedEntryType>;
public handelsregister: HandelsRegister; public handelsregister: HandelsRegister;
public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord); public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord);
public async start() { public async start() {
this.db = new plugins.smartdata.SmartdataDb({ this.db = new plugins.smartdata.SmartdataDb({
mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'), mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'),
mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'), mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'),
mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'), mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'),
mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'), mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
}); });
await this.db.init(); await this.db.init();
this.jsonLDataProcessor = new JsonlDataProcessor(this); this.jsonLDataProcessor = new JsonlDataProcessor(async (entryArg) => {
const businessRecord = new this.CBusinessRecord();
businessRecord.id = await this.CBusinessRecord.getNewId();
businessRecord.data.name = entryArg.name;
businessRecord.data.germanParsedRegistration = {
court: entryArg.all_attributes.registered_office,
number: entryArg.all_attributes._registerNummer,
type: entryArg.all_attributes._registerArt as 'HRA' | 'HRB',
};
await businessRecord.save();
});
this.handelsregister = new HandelsRegister(this); this.handelsregister = new HandelsRegister(this);
await this.handelsregister.start(); await this.handelsregister.start();
} }
@ -30,6 +40,20 @@ export class OpenData {
await this.jsonLDataProcessor.processDataFromUrl(); await this.jsonLDataProcessor.processDataFromUrl();
} }
/**
 * Unimplemented stub: the body is empty, so awaiting it resolves without
 * doing any work.
 * TODO: implement — presumably a slow validation pass over the stored
 * records, judging by the name only; confirm the intended behavior.
 */
public async slowValidateDb() {
}
/**
 * Unimplemented stub: the body is empty, so awaiting it resolves without
 * doing any work.
 * TODO: implement — the name suggests a validation that searches by
 * business name; confirm the intended behavior and signature.
 */
public async validateSearchByName() {
}
/**
 * Unimplemented stub: the body is empty, so both arguments are currently
 * ignored and awaiting the call resolves to `undefined`.
 * TODO: implement — presumably a database lookup filtered by name and
 * postal code, judging by the method name; confirm before relying on it.
 *
 * @param businessNameArg business name to search for (unused for now)
 * @param postalCodeArg postal code to narrow the search (unused for now)
 */
public async searchDbByBusinessNameAndPostalCode(businessNameArg: string, postalCodeArg: string) {
}
public async stop() { public async stop() {
await this.db.close(); await this.db.close();
await this.handelsregister.stop(); await this.handelsregister.stop();

View File

@ -9,6 +9,7 @@ export {
import * as lik from '@push.rocks/lik'; import * as lik from '@push.rocks/lik';
import * as qenv from '@push.rocks/qenv'; import * as qenv from '@push.rocks/qenv';
import * as smartarchive from '@push.rocks/smartarchive'; import * as smartarchive from '@push.rocks/smartarchive';
import * as smartarray from '@push.rocks/smartarray';
import * as smartbrowser from '@push.rocks/smartbrowser'; import * as smartbrowser from '@push.rocks/smartbrowser';
import * as smartdata from '@push.rocks/smartdata'; import * as smartdata from '@push.rocks/smartdata';
import * as smartdelay from '@push.rocks/smartdelay'; import * as smartdelay from '@push.rocks/smartdelay';
@ -18,11 +19,13 @@ import * as smartpromise from '@push.rocks/smartpromise';
import * as smartrequest from '@push.rocks/smartrequest'; import * as smartrequest from '@push.rocks/smartrequest';
import * as smartstream from '@push.rocks/smartstream'; import * as smartstream from '@push.rocks/smartstream';
import * as smartunique from '@push.rocks/smartunique'; import * as smartunique from '@push.rocks/smartunique';
import * as smartxml from '@push.rocks/smartxml';
export { export {
lik, lik,
qenv, qenv,
smartarchive, smartarchive,
smartarray,
smartbrowser, smartbrowser,
smartdata, smartdata,
smartdelay, smartdelay,
@ -32,6 +35,7 @@ export {
smartrequest, smartrequest,
smartstream, smartstream,
smartunique, smartunique,
smartxml,
} }
// @tsclass scope // @tsclass scope