From 9c3f012da79c1ec1227ac2c1af5e74daeb1d7deb Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Sat, 4 Jan 2025 13:40:50 +0100 Subject: [PATCH] fix(core): Fix issues with JSONL data processing and improve error handling in business record validation --- changelog.md | 8 +++ npmextra.json | 3 + package.json | 2 + pnpm-lock.yaml | 11 ++++ readme.md | 20 ++++++- test/test.ts | 2 +- ts/00_commitinfo_data.ts | 2 +- ts/classes.businessrecord.ts | 10 ++++ ts/classes.jsonldata.ts | 106 ++++++++++++++++++++++------------- ts/classes.main.opendata.ts | 46 +++++++++++---- ts/plugins.ts | 4 ++ 11 files changed, 161 insertions(+), 53 deletions(-) diff --git a/changelog.md b/changelog.md index 3c5297b..6a6ab1b 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,13 @@ # Changelog +## 2025-01-04 - 1.4.1 - fix(core) +Fix issues with JSONL data processing and improve error handling in business record validation + +- Fixed JSONL data processing by adding concurrent processing for each JSON line to enhance performance. +- Added validation logic in BusinessRecord class to ensure that the mandatory fields are checked. +- Adjusted environment variable loading in OpenData class to ensure correct database initialization. +- Included missing dependencies and exports in the project files to ensure proper functionality. + ## 2025-01-04 - 1.4.0 - feat(HandelsRegister) Add file download functionality to HandelsRegister diff --git a/npmextra.json b/npmextra.json index a8ccf7c..770832d 100644 --- a/npmextra.json +++ b/npmextra.json @@ -30,5 +30,8 @@ "npmci": { "npmGlobalTools": [], "npmAccessLevel": "public" + }, + "tsdoc": { + "legal": "\n## License and Legal Information\n\nThis repository contains open-source code that is licensed under the MIT License. A copy of the MIT License can be found in the [license](license) file within this repository. \n\n**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.\n\n### Trademarks\n\nThis project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH and are not included within the scope of the MIT license granted herein. Use of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines, and any usage must be approved in writing by Task Venture Capital GmbH.\n\n### Company Information\n\nTask Venture Capital GmbH \nRegistered at District court Bremen HRB 35230 HB, Germany\n\nFor any legal inquiries or if you require further information, please contact us via email at hello@task.vc.\n\nBy using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works.\n" } } \ No newline at end of file diff --git a/package.json b/package.json index 3e51204..548ffb3 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ "@push.rocks/lik": "^6.1.0", "@push.rocks/qenv": "^6.1.0", "@push.rocks/smartarchive": "^4.0.39", + "@push.rocks/smartarray": "^1.1.0", "@push.rocks/smartbrowser": "^2.0.8", "@push.rocks/smartdata": "^5.2.10", "@push.rocks/smartdelay": "^3.0.5", @@ -34,6 +35,7 @@ "@push.rocks/smartrequest": "^2.0.23", "@push.rocks/smartstream": "^3.2.5", "@push.rocks/smartunique": "^3.0.9", + "@push.rocks/smartxml": "^1.1.1", "@tsclass/tsclass": "^4.2.0" }, "repository": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c329c97..2e37c41 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: '@push.rocks/smartarchive': specifier: ^4.0.39 version: 4.0.39 + '@push.rocks/smartarray': + specifier: ^1.1.0 + version: 1.1.0 '@push.rocks/smartbrowser': specifier: ^2.0.8 version: 2.0.8 @@ -44,6 +47,9 @@ importers: '@push.rocks/smartunique': specifier: ^3.0.9 version: 3.0.9 + '@push.rocks/smartxml': + specifier: ^1.1.1 + version: 1.1.1 '@tsclass/tsclass': specifier: ^4.2.0 version: 4.2.0 @@ -747,6 +753,9 @@ packages: '@push.rocks/smartarchive@4.0.39': resolution: {integrity: sha512-e8xOOa7h4WlZMhjEd7IjAL/wgLBS3yJ6+Q7eZognHg1cNE/TOZ1kYrAN9eo8xmTtd+37hY9NXayk2JwXdXEvyA==} + '@push.rocks/smartarray@1.1.0': + resolution: {integrity: sha512-b5YgBmUdglOJH8zeUf2ZWdPCoqySgwvkycRi2BhA9zVZHkpASh39Ej0q0fxFJetlUVyYqGfVoMVjbVrLFfFV7g==} + '@push.rocks/smartbrowser@2.0.8': resolution: {integrity: sha512-0KWRZj3TuKo/sNwgPbiSE6WL+TMeR19t1JmXBZWh9n8iA2mpc4HhMrQAndEUdRCkx5ofSaHWojIRVFzGChj0Dg==} @@ -5364,6 +5373,8 @@ snapshots: tar-stream: 3.1.7 through: 2.3.8 + '@push.rocks/smartarray@1.1.0': {} + '@push.rocks/smartbrowser@2.0.8': dependencies: '@push.rocks/smartdelay': 3.0.5 diff --git a/readme.md b/readme.md index 926d57b..793e9e5 100644 --- a/readme.md +++ b/readme.md @@ -215,4 +215,22 @@ When working with business data, ensuring integrity and accuracy is crucial. Eac The `@fin.cx/opendata` module provides an extensive toolset for accessing and managing business data, particularly for companies based in Germany. Its functionalities include creating, updating, retrieving, and deleting business records, as well as keeping them current with the latest open data releases. This makes it an invaluable asset for developers aiming to integrate open data seamlessly into their systems, ensuring robust data management capabilities within their applications. Happy exploring and integrating open data into your projects! -undefined \ No newline at end of file + +## License and Legal Information + +This repository contains open-source code that is licensed under the MIT License. A copy of the MIT License can be found in the [license](license) file within this repository. + +**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file. + +### Trademarks + +This project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH and are not included within the scope of the MIT license granted herein. Use of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines, and any usage must be approved in writing by Task Venture Capital GmbH. + +### Company Information + +Task Venture Capital GmbH +Registered at District court Bremen HRB 35230 HB, Germany + +For any legal inquiries or if you require further information, please contact us via email at hello@task.vc. + +By using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works. \ No newline at end of file diff --git a/test/test.ts b/test/test.ts index ac0eb48..b55e57e 100644 --- a/test/test.ts +++ b/test/test.ts @@ -12,7 +12,7 @@ tap.test('should start the instance', async () => { await testOpenDataInstance.start(); }) -tap.skip.test('should build initial data', async () => { +tap.test('should build initial data', async () => { await testOpenDataInstance.buildInitialDb(); }); diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts index f0f20c7..79fe2c6 100644 --- a/ts/00_commitinfo_data.ts +++ b/ts/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: '@fin.cx/opendata', - version: '1.4.0', + version: '1.4.1', description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.' } diff --git a/ts/classes.businessrecord.ts b/ts/classes.businessrecord.ts index cab89d1..dfef4f6 100644 --- a/ts/classes.businessrecord.ts +++ b/ts/classes.businessrecord.ts @@ -11,6 +11,9 @@ export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc< @plugins.smartdata.svDb() data: { name?: string; + startDate?: string; + endDate?: string; + status?: 'active' | 'liquidating' | 'closed'; address?: string; postalCode?: string; city?: string; @@ -42,4 +45,11 @@ export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc< purpose?: string; lastUpdate?: string; } = {}; + + /** + * validates the record against the Handelregister. + */ + public async validate() { + if (!this.data.name) throw new Error('Name is required.'); + } } diff --git a/ts/classes.jsonldata.ts b/ts/classes.jsonldata.ts index e728ad3..4158cb5 100644 --- a/ts/classes.jsonldata.ts +++ b/ts/classes.jsonldata.ts @@ -2,16 +2,55 @@ import * as plugins from './plugins.js'; import * as paths from './paths.js'; import type { OpenData } from './classes.main.opendata.js'; -export class JsonlDataProcessor { - public openDataRef: OpenData; - constructor(openDataRefArg: OpenData) { - this.openDataRef = openDataRefArg; +export type SeedEntryType = { + all_attributes: { + _registerArt: string; + _registerNummer: string; + additional_data: { + AD: boolean; + CD: boolean; + DK: boolean; + HD: boolean; + SI: boolean; + UT: boolean; + VĂ–: boolean; + }; + federal_state: string; + native_company_number: string; + registered_office: string; + registrar: string; + }; + company_number: string; + current_status: string; + jurisdiction_code: string; + name: string; + officers: { + name: string; + other_attributes: { + city: string; + firstname: string; + flag: string; + lastname: string; + }; + position: string; + start_date: string; // ISO 8601 date string + type: string; + }[]; + registered_address: string; + retrieved_at: string; // ISO 8601 date string +}; + +export class JsonlDataProcessor { + public forEachFunction: (entryArg: T) => Promise; + constructor(forEachFunctionArg: typeof this.forEachFunction) { + this.forEachFunction = forEachFunctionArg; } // TODO: define a mapper as argument instead of hard-coding it - public async processDataFromUrl(dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2') { + public async processDataFromUrl( + dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2' + ) { const done = plugins.smartpromise.defer(); - const promiseArray: Promise[] = []; const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir); if (!dataExists) { await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir); @@ -19,10 +58,6 @@ export class JsonlDataProcessor { } const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg); - promiseArray - .push - // smartarchive.exportToFs(paths.germanBusinessDataDir, 'de_companies_ocdata.jsonl') - (); const jsonlDataStream = await smartarchive.exportToStreamOfStreamFiles(); let totalRecordsCounter = 0; let nextRest: string = ''; @@ -39,44 +74,37 @@ export class JsonlDataProcessor { const lines = currentString.split('\n'); nextRest = lines.pop(); console.log(`Got another ${lines.length} records.`); - for (const line of lines) { - let entry: any; - if (!line) continue; - try { - entry = JSON.parse(line); - console.log(JSON.stringify(entry, null, 2)); - process.exit(0); - } catch (err) { - console.log(line); - await plugins.smartdelay.delayFor(10000); - } - if (!entry) continue; - totalRecordsCounter++; - if (totalRecordsCounter % 10000 === 0) console.log(`${totalRecordsCounter} total records.`); - const businessRecord = new this.openDataRef.CBusinessRecord(); - businessRecord.id = await this.openDataRef.CBusinessRecord.getNewId(); - businessRecord.data.name = entry.name; - await businessRecord.save(); - } + const concurrentProcessor = new plugins.smartarray.ConcurrentProcessor( + async (line) => { + let entry: T; + if (!line) return; + try { + entry = JSON.parse(line); + } catch (err) { + console.log(line); + await plugins.smartdelay.delayFor(10000); + } + if (!entry) return; + totalRecordsCounter++; + if (totalRecordsCounter % 10000 === 0) + console.log(`${totalRecordsCounter} total records.`); + await this.forEachFunction(entry); + }, + 1000 + ); + await concurrentProcessor.process(lines); }, finalFunction: async (streamToolsArg) => { console.log(`finished processing ${totalRecordsCounter} records.`); if (!nextRest) return; JSON.parse(nextRest); - } + done.resolve(); + }, }) ); }, }) ); - } - - public async getBusinessRecordByName(nameArg: string) { - const businessRecord = await this.openDataRef.CBusinessRecord.getInstance({ - data: { - name: { $regex: `${nameArg}`, $options: "i" } as any, - } - }); - return businessRecord; + await done.promise; } } diff --git a/ts/classes.main.opendata.ts b/ts/classes.main.opendata.ts index f2d1e50..d4af1b8 100644 --- a/ts/classes.main.opendata.ts +++ b/ts/classes.main.opendata.ts @@ -1,27 +1,37 @@ import { BusinessRecord } from './classes.businessrecord.js'; -import { HandelsRegister } from './classes.handelsregister.js'; -import { JsonlDataProcessor } from './classes.jsonldata.js'; +import { HandelsRegister } from './classes.handelsregister.js'; +import { JsonlDataProcessor, type SeedEntryType } from './classes.jsonldata.js'; import * as paths from './paths.js'; import * as plugins from './plugins.js'; export class OpenData { public db: plugins.smartdata.SmartdataDb; private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir); - - public jsonLDataProcessor: JsonlDataProcessor; + + public jsonLDataProcessor: JsonlDataProcessor; public handelsregister: HandelsRegister; - + public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord); public async start() { this.db = new plugins.smartdata.SmartdataDb({ - mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'), - mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'), - mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'), - mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'), + mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'), + mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'), + mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'), + mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'), }); await this.db.init(); - this.jsonLDataProcessor = new JsonlDataProcessor(this); + this.jsonLDataProcessor = new JsonlDataProcessor(async (entryArg) => { + const businessRecord = new this.CBusinessRecord(); + businessRecord.id = await this.CBusinessRecord.getNewId(); + businessRecord.data.name = entryArg.name; + businessRecord.data.germanParsedRegistration = { + court: entryArg.all_attributes.registered_office, + number: entryArg.all_attributes._registerNummer, + type: entryArg.all_attributes._registerArt as 'HRA' | 'HRB', + }; + await businessRecord.save(); + }); this.handelsregister = new HandelsRegister(this); await this.handelsregister.start(); } @@ -30,8 +40,22 @@ export class OpenData { await this.jsonLDataProcessor.processDataFromUrl(); } + public async slowValidateDb() { + + } + + public async validateSearchByName() { + + } + + public async searchDbByBusinessNameAndPostalCode(businessNameArg: string, postalCodeArg: string) { + + } + + + public async stop() { await this.db.close(); await this.handelsregister.stop(); } -} \ No newline at end of file +} diff --git a/ts/plugins.ts b/ts/plugins.ts index 608004a..063682f 100644 --- a/ts/plugins.ts +++ b/ts/plugins.ts @@ -9,6 +9,7 @@ export { import * as lik from '@push.rocks/lik'; import * as qenv from '@push.rocks/qenv'; import * as smartarchive from '@push.rocks/smartarchive'; +import * as smartarray from '@push.rocks/smartarray'; import * as smartbrowser from '@push.rocks/smartbrowser'; import * as smartdata from '@push.rocks/smartdata'; import * as smartdelay from '@push.rocks/smartdelay'; @@ -18,11 +19,13 @@ import * as smartpromise from '@push.rocks/smartpromise'; import * as smartrequest from '@push.rocks/smartrequest'; import * as smartstream from '@push.rocks/smartstream'; import * as smartunique from '@push.rocks/smartunique'; +import * as smartxml from '@push.rocks/smartxml'; export { lik, qenv, smartarchive, + smartarray, smartbrowser, smartdata, smartdelay, @@ -32,6 +35,7 @@ export { smartrequest, smartstream, smartunique, + smartxml, } // @tsclass scope