fix(core): Fix issues with JSONL data processing and improve error handling in business record validation
This commit is contained in:
@ -3,6 +3,6 @@
|
||||
*/
|
||||
export const commitinfo = {
|
||||
name: '@fin.cx/opendata',
|
||||
version: '1.4.0',
|
||||
version: '1.4.1',
|
||||
description: 'A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.'
|
||||
}
|
||||
|
@ -11,6 +11,9 @@ export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc<
|
||||
@plugins.smartdata.svDb()
|
||||
data: {
|
||||
name?: string;
|
||||
startDate?: string;
|
||||
endDate?: string;
|
||||
status?: 'active' | 'liquidating' | 'closed';
|
||||
address?: string;
|
||||
postalCode?: string;
|
||||
city?: string;
|
||||
@ -42,4 +45,11 @@ export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc<
|
||||
purpose?: string;
|
||||
lastUpdate?: string;
|
||||
} = {};
|
||||
|
||||
/**
|
||||
* validates the record against the Handelregister.
|
||||
*/
|
||||
public async validate() {
|
||||
if (!this.data.name) throw new Error('Name is required.');
|
||||
}
|
||||
}
|
||||
|
@ -2,16 +2,55 @@ import * as plugins from './plugins.js';
|
||||
import * as paths from './paths.js';
|
||||
import type { OpenData } from './classes.main.opendata.js';
|
||||
|
||||
export class JsonlDataProcessor {
|
||||
public openDataRef: OpenData;
|
||||
constructor(openDataRefArg: OpenData) {
|
||||
this.openDataRef = openDataRefArg;
|
||||
export type SeedEntryType = {
|
||||
all_attributes: {
|
||||
_registerArt: string;
|
||||
_registerNummer: string;
|
||||
additional_data: {
|
||||
AD: boolean;
|
||||
CD: boolean;
|
||||
DK: boolean;
|
||||
HD: boolean;
|
||||
SI: boolean;
|
||||
UT: boolean;
|
||||
VÖ: boolean;
|
||||
};
|
||||
federal_state: string;
|
||||
native_company_number: string;
|
||||
registered_office: string;
|
||||
registrar: string;
|
||||
};
|
||||
company_number: string;
|
||||
current_status: string;
|
||||
jurisdiction_code: string;
|
||||
name: string;
|
||||
officers: {
|
||||
name: string;
|
||||
other_attributes: {
|
||||
city: string;
|
||||
firstname: string;
|
||||
flag: string;
|
||||
lastname: string;
|
||||
};
|
||||
position: string;
|
||||
start_date: string; // ISO 8601 date string
|
||||
type: string;
|
||||
}[];
|
||||
registered_address: string;
|
||||
retrieved_at: string; // ISO 8601 date string
|
||||
};
|
||||
|
||||
export class JsonlDataProcessor<T> {
|
||||
public forEachFunction: (entryArg: T) => Promise<void>;
|
||||
constructor(forEachFunctionArg: typeof this.forEachFunction) {
|
||||
this.forEachFunction = forEachFunctionArg;
|
||||
}
|
||||
|
||||
// TODO: define a mapper as argument instead of hard-coding it
|
||||
public async processDataFromUrl(dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2') {
|
||||
public async processDataFromUrl(
|
||||
dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2'
|
||||
) {
|
||||
const done = plugins.smartpromise.defer();
|
||||
const promiseArray: Promise<any>[] = [];
|
||||
const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir);
|
||||
if (!dataExists) {
|
||||
await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);
|
||||
@ -19,10 +58,6 @@ export class JsonlDataProcessor {
|
||||
}
|
||||
|
||||
const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg);
|
||||
promiseArray
|
||||
.push
|
||||
// smartarchive.exportToFs(paths.germanBusinessDataDir, 'de_companies_ocdata.jsonl')
|
||||
();
|
||||
const jsonlDataStream = await smartarchive.exportToStreamOfStreamFiles();
|
||||
let totalRecordsCounter = 0;
|
||||
let nextRest: string = '';
|
||||
@ -39,44 +74,37 @@ export class JsonlDataProcessor {
|
||||
const lines = currentString.split('\n');
|
||||
nextRest = lines.pop();
|
||||
console.log(`Got another ${lines.length} records.`);
|
||||
for (const line of lines) {
|
||||
let entry: any;
|
||||
if (!line) continue;
|
||||
try {
|
||||
entry = JSON.parse(line);
|
||||
console.log(JSON.stringify(entry, null, 2));
|
||||
process.exit(0);
|
||||
} catch (err) {
|
||||
console.log(line);
|
||||
await plugins.smartdelay.delayFor(10000);
|
||||
}
|
||||
if (!entry) continue;
|
||||
totalRecordsCounter++;
|
||||
if (totalRecordsCounter % 10000 === 0) console.log(`${totalRecordsCounter} total records.`);
|
||||
const businessRecord = new this.openDataRef.CBusinessRecord();
|
||||
businessRecord.id = await this.openDataRef.CBusinessRecord.getNewId();
|
||||
businessRecord.data.name = entry.name;
|
||||
await businessRecord.save();
|
||||
}
|
||||
const concurrentProcessor = new plugins.smartarray.ConcurrentProcessor<string>(
|
||||
async (line) => {
|
||||
let entry: T;
|
||||
if (!line) return;
|
||||
try {
|
||||
entry = JSON.parse(line);
|
||||
} catch (err) {
|
||||
console.log(line);
|
||||
await plugins.smartdelay.delayFor(10000);
|
||||
}
|
||||
if (!entry) return;
|
||||
totalRecordsCounter++;
|
||||
if (totalRecordsCounter % 10000 === 0)
|
||||
console.log(`${totalRecordsCounter} total records.`);
|
||||
await this.forEachFunction(entry);
|
||||
},
|
||||
1000
|
||||
);
|
||||
await concurrentProcessor.process(lines);
|
||||
},
|
||||
finalFunction: async (streamToolsArg) => {
|
||||
console.log(`finished processing ${totalRecordsCounter} records.`);
|
||||
if (!nextRest) return;
|
||||
JSON.parse(nextRest);
|
||||
}
|
||||
done.resolve();
|
||||
},
|
||||
})
|
||||
);
|
||||
},
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async getBusinessRecordByName(nameArg: string) {
|
||||
const businessRecord = await this.openDataRef.CBusinessRecord.getInstance({
|
||||
data: {
|
||||
name: { $regex: `${nameArg}`, $options: "i" } as any,
|
||||
}
|
||||
});
|
||||
return businessRecord;
|
||||
await done.promise;
|
||||
}
|
||||
}
|
||||
|
@ -1,27 +1,37 @@
|
||||
import { BusinessRecord } from './classes.businessrecord.js';
|
||||
import { HandelsRegister } from './classes.handelsregister.js';
|
||||
import { JsonlDataProcessor } from './classes.jsonldata.js';
|
||||
import { HandelsRegister } from './classes.handelsregister.js';
|
||||
import { JsonlDataProcessor, type SeedEntryType } from './classes.jsonldata.js';
|
||||
import * as paths from './paths.js';
|
||||
import * as plugins from './plugins.js';
|
||||
|
||||
export class OpenData {
|
||||
public db: plugins.smartdata.SmartdataDb;
|
||||
private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir);
|
||||
|
||||
public jsonLDataProcessor: JsonlDataProcessor;
|
||||
|
||||
public jsonLDataProcessor: JsonlDataProcessor<SeedEntryType>;
|
||||
public handelsregister: HandelsRegister;
|
||||
|
||||
|
||||
public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord);
|
||||
|
||||
public async start() {
|
||||
this.db = new plugins.smartdata.SmartdataDb({
|
||||
mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'),
|
||||
mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'),
|
||||
mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'),
|
||||
mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
|
||||
mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'),
|
||||
mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'),
|
||||
mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'),
|
||||
mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
|
||||
});
|
||||
await this.db.init();
|
||||
this.jsonLDataProcessor = new JsonlDataProcessor(this);
|
||||
this.jsonLDataProcessor = new JsonlDataProcessor(async (entryArg) => {
|
||||
const businessRecord = new this.CBusinessRecord();
|
||||
businessRecord.id = await this.CBusinessRecord.getNewId();
|
||||
businessRecord.data.name = entryArg.name;
|
||||
businessRecord.data.germanParsedRegistration = {
|
||||
court: entryArg.all_attributes.registered_office,
|
||||
number: entryArg.all_attributes._registerNummer,
|
||||
type: entryArg.all_attributes._registerArt as 'HRA' | 'HRB',
|
||||
};
|
||||
await businessRecord.save();
|
||||
});
|
||||
this.handelsregister = new HandelsRegister(this);
|
||||
await this.handelsregister.start();
|
||||
}
|
||||
@ -30,8 +40,22 @@ export class OpenData {
|
||||
await this.jsonLDataProcessor.processDataFromUrl();
|
||||
}
|
||||
|
||||
public async slowValidateDb() {
|
||||
|
||||
}
|
||||
|
||||
public async validateSearchByName() {
|
||||
|
||||
}
|
||||
|
||||
public async searchDbByBusinessNameAndPostalCode(businessNameArg: string, postalCodeArg: string) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
public async stop() {
|
||||
await this.db.close();
|
||||
await this.handelsregister.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ export {
|
||||
import * as lik from '@push.rocks/lik';
|
||||
import * as qenv from '@push.rocks/qenv';
|
||||
import * as smartarchive from '@push.rocks/smartarchive';
|
||||
import * as smartarray from '@push.rocks/smartarray';
|
||||
import * as smartbrowser from '@push.rocks/smartbrowser';
|
||||
import * as smartdata from '@push.rocks/smartdata';
|
||||
import * as smartdelay from '@push.rocks/smartdelay';
|
||||
@ -18,11 +19,13 @@ import * as smartpromise from '@push.rocks/smartpromise';
|
||||
import * as smartrequest from '@push.rocks/smartrequest';
|
||||
import * as smartstream from '@push.rocks/smartstream';
|
||||
import * as smartunique from '@push.rocks/smartunique';
|
||||
import * as smartxml from '@push.rocks/smartxml';
|
||||
|
||||
export {
|
||||
lik,
|
||||
qenv,
|
||||
smartarchive,
|
||||
smartarray,
|
||||
smartbrowser,
|
||||
smartdata,
|
||||
smartdelay,
|
||||
@ -32,6 +35,7 @@ export {
|
||||
smartrequest,
|
||||
smartstream,
|
||||
smartunique,
|
||||
smartxml,
|
||||
}
|
||||
|
||||
// @tsclass scope
|
||||
|
Reference in New Issue
Block a user