Compare commits
38 Commits
Author | SHA1 | Date | |
---|---|---|---|
c344e47ae6 | |||
209af50a4c | |||
1d0d44dc29 | |||
c4d6403721 | |||
84cab94beb | |||
d17683cd67 | |||
39537c0568 | |||
b768b67641 | |||
25147deb7f | |||
4030bef7a8 | |||
c6964f0310 | |||
9a9f203af2 | |||
174086defc | |||
43c9d3b3b6 | |||
39724b61d6 | |||
d9588f8f65 | |||
6ce6153ccf | |||
ec2d4f9fbc | |||
a19be31381 | |||
9c3f012da7 | |||
8ebbc16bcd | |||
c177193438 | |||
7c07bc59e4 | |||
e4a8d371f7 | |||
1c0e04cb0d | |||
c3f6ef531b | |||
a67a0993d6 | |||
bc43e4c44a | |||
9b2dcd7377 | |||
1eda50ad13 | |||
506a644c6b | |||
555e156b5e | |||
b67e18f2fe | |||
09c9712191 | |||
6258dcdff1 | |||
605b050177 | |||
c97c8e711a | |||
d5654a7bc7 |
123
changelog.md
123
changelog.md
@ -1,5 +1,128 @@
|
||||
# Changelog
|
||||
|
||||
## 2025-04-09 - 1.5.3 - fix(test)
|
||||
Await file writes in Handelsregister tests to ensure all downloads complete before test end
|
||||
|
||||
- Replaced array.map with await Promise.all to properly await asynchronous file writes in test/test.handelsregister.ts
|
||||
- Improved robustness of asynchronous operations in test suite
|
||||
|
||||
## 2025-04-09 - 1.5.2 - fix(readme)
|
||||
Improve .env configuration code block formatting in documentation
|
||||
|
||||
- Wrap the .env variables block in triple backticks for clarity
|
||||
- Ensure consistency in the Markdown styling of code snippets
|
||||
|
||||
## 2025-04-09 - 1.5.1 - fix(core)
|
||||
No changes detected in project files or documentation. This commit is a placeholder to record that nothing was updated.
|
||||
|
||||
|
||||
## 2025-04-09 - 1.5.0 - feat(documentation)
|
||||
Enhance project metadata and documentation with comprehensive usage examples, updated descriptions, and improved keywords.
|
||||
|
||||
- Updated npmextra.json and package.json to refine the project description and keyword list.
|
||||
- Expanded readme.md with detailed sections on environment setup, CRUD operations, bulk JSONL processing, and advanced Handelsregister integrations.
|
||||
- Included advanced workflow examples and error handling strategies in the documentation.
|
||||
- Adjusted test cases (e.g. in test/test.handelsregister.ts) to reflect changes in company name usage.
|
||||
|
||||
## 2025-04-08 - 1.4.6 - fix(tests & jsonl)
|
||||
Improve test structure and refine JSONL parsing for incomplete data
|
||||
|
||||
- Refactored test files to remove redundant get-specific-company tests in test.ts and added missing tests in test.handelsregister.ts
|
||||
- Updated JSONL data processor to conditionally parse remaining data when available
|
||||
|
||||
## 2025-04-05 - 1.4.5 - fix(metadata)
|
||||
Update repository, bugs, and homepage URLs to code.foss.global
|
||||
|
||||
- Repository URL updated from gitlab.com to code.foss.global
|
||||
- Bugs URL updated from gitlab.com to code.foss.global
|
||||
- Homepage URL updated to code.foss.global
|
||||
|
||||
## 2025-04-05 - 1.4.4 - fix(dependencies & tests)
|
||||
Update dependency versions and refine test search query
|
||||
|
||||
- Bumped versions for several dependencies in package.json, including @git.zone/tsbuild, @git.zone/tsbundle, @git.zone/tstest, @push.rocks/tapbundle, @push.rocks/smartdata, @push.rocks/smartfile, @push.rocks/smartpromise, @push.rocks/smartrequest, and @tsclass/tsclass
|
||||
- Updated test file to replace the search query 'Volkswagen' with 'LADR'
|
||||
- Re-enabled the build initial data test by removing tap.skip
|
||||
|
||||
## 2025-01-07 - 1.4.3 - fix(test)
|
||||
Corrected index value in test for fetching specific company data
|
||||
|
||||
- Updated the index from 8 to 7 for the germanParsedRegistration fetch in test
|
||||
|
||||
## 2025-01-07 - 1.4.2 - fix(core)
|
||||
Fix concurrency and download handling in HandelsRegister class and adjust test cases
|
||||
|
||||
- Improved the clickFindButton function to include an argument for results limit.
|
||||
- Enhanced the downloadFile function to rename and ensure files are correctly handled.
|
||||
- Updated searchCompany method to allow specifying a limit on the number of search results.
|
||||
- Adjusted test cases to select specific test data indices and output test files to a dedicated directory.
|
||||
|
||||
## 2025-01-04 - 1.4.1 - fix(core)
|
||||
Fix issues with JSONL data processing and improve error handling in business record validation
|
||||
|
||||
- Fixed JSONL data processing by adding concurrent processing for each JSON line to enhance performance.
|
||||
- Added validation logic in BusinessRecord class to ensure that the mandatory fields are checked.
|
||||
- Adjusted environment variable loading in OpenData class to ensure correct database initialization.
|
||||
- Included missing dependencies and exports in the project files to ensure proper functionality.
|
||||
|
||||
## 2025-01-04 - 1.4.0 - feat(HandelsRegister)
|
||||
Add file download functionality to HandelsRegister
|
||||
|
||||
- Implemented file download feature in the HandelsRegister class.
|
||||
- Configured pages in Puppeteer to allow downloads and set download paths.
|
||||
- Parsed German registration information with more robust error handling.
|
||||
- Added specific methods for downloading and handling 'SI' and 'AD' files.
|
||||
|
||||
## 2025-01-03 - 1.3.1 - fix(HandelsRegister)
|
||||
Refined HandelsRegister functionality for better error handling and response capture.
|
||||
|
||||
- Improved parsing logic in parseGermanRegistration function.
|
||||
- Enhanced navigateToPage and clickFindButton methods with error messages for clarity.
|
||||
- Implemented a new responseListener to handle and log HTTP responses correctly.
|
||||
|
||||
## 2025-01-03 - 1.3.0 - feat(core)
|
||||
Enhanced data handling capabilities and improved company search functionalities.
|
||||
|
||||
- Updated business record handling to support more registration types.
|
||||
- Improved search capabilities for fetching company data with refined registration type matching.
|
||||
- Added robust logging for JSONL data processing with early exit on successful parse.
|
||||
- Reorganized test cases to include specific company data retrieval.
|
||||
|
||||
## 2025-01-02 - 1.2.1 - fix(BusinessRecord)
|
||||
Add missing field registrationType to BusinessRecord data
|
||||
|
||||
- Introduced the 'registrationType' field to the BusinessRecord data schema with possible values 'HRA' or 'HRB'.
|
||||
|
||||
## 2025-01-02 - 1.2.0 - feat(core)
|
||||
Integrate Handelsregister search for company data retrieval
|
||||
|
||||
- Added support for searching company data via Handelsregister.
|
||||
- Replaced GermanBusinessData functionality with JsonlDataProcessor.
|
||||
- Included smartbrowser dependency for handling web requests to Handelsregister.
|
||||
|
||||
## 2025-01-01 - 1.1.5 - fix(GermanBusinessData)
|
||||
Add console log for total records processed at the end of the stream.
|
||||
|
||||
- Ensure that the number of records processed is logged at the end of data stream processing.
|
||||
|
||||
## 2024-12-31 - 1.1.4 - fix(documentation)
|
||||
Update description and keywords in package.json
|
||||
|
||||
- Corrected the package description to reflect the focus on managing, accessing, and updating open data with MongoDB integration.
|
||||
- Expanded the keywords in the package metadata to include data integration and processing terms.
|
||||
- Improved README.md with more extensive setup, usage, and introduction of the library's functionalities.
|
||||
|
||||
## 2024-12-31 - 1.1.3 - fix(core)
|
||||
Added missing license file for project completeness.
|
||||
|
||||
- Introduced a LICENSE file to the project, ensuring clarity on software usage permissions.
|
||||
|
||||
## 2024-12-31 - 1.1.2 - fix(GermanBusinessData)
|
||||
Ensure unique ID generation for BusinessRecord
|
||||
|
||||
- Added generation of a new ID for each BusinessRecord in GermanBusinessData.
|
||||
- Ensures each business record has a unique identifier.
|
||||
|
||||
## 2024-12-31 - 1.1.1 - fix(dependencies)
|
||||
Update dependencies and devDependencies to newer versions.
|
||||
|
||||
|
19
license
Normal file
19
license
Normal file
@ -0,0 +1,19 @@
|
||||
Copyright (c) 2022 Task Venture Capital GmbH (hello@task.vc)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -5,26 +5,34 @@
|
||||
"githost": "gitlab.com",
|
||||
"gitscope": "fin.cx",
|
||||
"gitrepo": "opendata",
|
||||
"description": "A TypeScript-based library for accessing and managing open business data, specifically for German companies.",
|
||||
"description": "A comprehensive TypeScript library that manages open business data for German companies by integrating MongoDB, processing JSONL bulk data, and automating browser interactions for Handelsregister data retrieval.",
|
||||
"npmPackagename": "@fin.cx/opendata",
|
||||
"license": "MIT",
|
||||
"projectDomain": "fin.cx",
|
||||
"keywords": [
|
||||
"TypeScript",
|
||||
"open data",
|
||||
"business data",
|
||||
"German companies",
|
||||
"data management",
|
||||
"business registry",
|
||||
"npm package",
|
||||
"database",
|
||||
"business data",
|
||||
"MongoDB",
|
||||
"automation"
|
||||
"JSONL",
|
||||
"bulk processing",
|
||||
"data management",
|
||||
"automation",
|
||||
"browser automation",
|
||||
"Handelsregister",
|
||||
"web scraping",
|
||||
"file processing",
|
||||
"business registry",
|
||||
"data retrieval"
|
||||
]
|
||||
}
|
||||
},
|
||||
"npmci": {
|
||||
"npmGlobalTools": [],
|
||||
"npmAccessLevel": "public"
|
||||
},
|
||||
"tsdoc": {
|
||||
"legal": "\n## License and Legal Information\n\nThis repository contains open-source code that is licensed under the MIT License. A copy of the MIT License can be found in the [license](license) file within this repository. \n\n**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.\n\n### Trademarks\n\nThis project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH and are not included within the scope of the MIT license granted herein. Use of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines, and any usage must be approved in writing by Task Venture Capital GmbH.\n\n### Company Information\n\nTask Venture Capital GmbH \nRegistered at District court Bremen HRB 35230 HB, Germany\n\nFor any legal inquiries or if you require further information, please contact us via email at hello@task.vc.\n\nBy using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works.\n"
|
||||
}
|
||||
}
|
56
package.json
56
package.json
@ -1,8 +1,8 @@
|
||||
{
|
||||
"name": "@fin.cx/opendata",
|
||||
"version": "1.1.1",
|
||||
"version": "1.5.3",
|
||||
"private": false,
|
||||
"description": "A TypeScript-based library for accessing and managing open business data, specifically for German companies.",
|
||||
"description": "A comprehensive TypeScript library that manages open business data for German companies by integrating MongoDB, processing JSONL bulk data, and automating browser interactions for Handelsregister data retrieval.",
|
||||
"main": "dist_ts/index.js",
|
||||
"typings": "dist_ts/index.d.ts",
|
||||
"type": "module",
|
||||
@ -14,32 +14,38 @@
|
||||
"buildDocs": "(tsdoc)"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@git.zone/tsbuild": "^2.2.0",
|
||||
"@git.zone/tsbundle": "^2.1.0",
|
||||
"@git.zone/tsbuild": "^2.3.2",
|
||||
"@git.zone/tsbundle": "^2.2.5",
|
||||
"@git.zone/tsrun": "^1.3.3",
|
||||
"@git.zone/tstest": "^1.0.90",
|
||||
"@push.rocks/tapbundle": "^5.5.4",
|
||||
"@types/node": "^22.10.2"
|
||||
"@git.zone/tstest": "^1.0.96",
|
||||
"@push.rocks/tapbundle": "^5.6.2",
|
||||
"@types/node": "^22.14.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@push.rocks/lik": "^6.1.0",
|
||||
"@push.rocks/qenv": "^6.1.0",
|
||||
"@push.rocks/smartarchive": "^4.0.39",
|
||||
"@push.rocks/smartdata": "^5.2.10",
|
||||
"@push.rocks/smartarray": "^1.1.0",
|
||||
"@push.rocks/smartbrowser": "^2.0.8",
|
||||
"@push.rocks/smartdata": "^5.2.12",
|
||||
"@push.rocks/smartdelay": "^3.0.5",
|
||||
"@push.rocks/smartfile": "^11.0.23",
|
||||
"@push.rocks/smartfile": "^11.2.0",
|
||||
"@push.rocks/smartpath": "^5.0.18",
|
||||
"@push.rocks/smartpromise": "^4.0.4",
|
||||
"@push.rocks/smartrequest": "^2.0.23",
|
||||
"@push.rocks/smartstream": "^3.2.5"
|
||||
"@push.rocks/smartpromise": "^4.2.3",
|
||||
"@push.rocks/smartrequest": "^2.1.0",
|
||||
"@push.rocks/smartstream": "^3.2.5",
|
||||
"@push.rocks/smartunique": "^3.0.9",
|
||||
"@push.rocks/smartxml": "^1.1.1",
|
||||
"@tsclass/tsclass": "^8.2.0"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://gitlab.com/fin.cx/opendata.git"
|
||||
"url": "https://code.foss.global/fin.cx/opendata.git"
|
||||
},
|
||||
"bugs": {
|
||||
"url": "https://gitlab.com/fin.cx/opendata/issues"
|
||||
"url": "https://code.foss.global/fin.cx/opendata/issues"
|
||||
},
|
||||
"homepage": "https://gitlab.com/fin.cx/opendata#readme",
|
||||
"homepage": "https://code.foss.global/fin.cx/opendata#readme",
|
||||
"browserslist": [
|
||||
"last 1 chrome versions"
|
||||
],
|
||||
@ -58,13 +64,19 @@
|
||||
"keywords": [
|
||||
"TypeScript",
|
||||
"open data",
|
||||
"business data",
|
||||
"German companies",
|
||||
"data management",
|
||||
"business registry",
|
||||
"npm package",
|
||||
"database",
|
||||
"business data",
|
||||
"MongoDB",
|
||||
"automation"
|
||||
]
|
||||
"JSONL",
|
||||
"bulk processing",
|
||||
"data management",
|
||||
"automation",
|
||||
"browser automation",
|
||||
"Handelsregister",
|
||||
"web scraping",
|
||||
"file processing",
|
||||
"business registry",
|
||||
"data retrieval"
|
||||
],
|
||||
"packageManager": "pnpm@10.7.0+sha512.6b865ad4b62a1d9842b61d674a393903b871d9244954f652b8842c2b553c72176b278f64c463e52d40fff8aba385c235c8c9ecf5cc7de4fd78b8bb6d49633ab6"
|
||||
}
|
||||
|
2840
pnpm-lock.yaml
generated
2840
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
796
readme.md
796
readme.md
@ -1,32 +1,56 @@
|
||||
```markdown
|
||||
# @fin.cx/opendata
|
||||
open business data
|
||||
A TypeScript library for accessing, managing, and updating open business data, focused on German companies and integrating with MongoDB.
|
||||
|
||||
## Install
|
||||
|
||||
To install the `@fin.cx/opendata` package, you can use npm or yarn as your package manager. Here's how you can do it:
|
||||
To install the @fin.cx/opendata package, you can use npm or yarn as your package manager. The installation process is simple and straightforward.
|
||||
|
||||
Using npm:
|
||||
|
||||
```bash
|
||||
npm install @fin.cx/opendata
|
||||
```
|
||||
npm install @fin.cx/opendata
|
||||
|
||||
Using yarn:
|
||||
|
||||
```bash
|
||||
yarn add @fin.cx/opendata
|
||||
```
|
||||
yarn add @fin.cx/opendata
|
||||
|
||||
## Usage
|
||||
|
||||
The `@fin.cx/opendata` package offers functionalities for handling open business data, with a primary focus on German business data. Let's explore its capabilities through detailed examples.
|
||||
The @fin.cx/opendata library is a versatile tool that empowers developers to integrate comprehensive open business data into their systems. This library is particularly tailored for German companies, offering functionalities that include creating, retrieving, updating, and deleting business records as well as processing large volumes of JSONL data from external sources. In addition to core database operations via MongoDB, the library provides integration with web-based services, primarily through a hands-on Handelsregister processor that utilizes browser automation for searching and downloading documents.
|
||||
|
||||
### Setting Up
|
||||
In this section, we will extensively detail multiple usage scenarios, ensuring that every feature the module offers is thoroughly explored. All examples in this documentation employ ECMAScript Module (ESM) syntax and TypeScript, highlighting proper asynchronous handling, error management, and advanced integration with other dependencies. We will walk you through environment setup, initializing the package, managing business records, processing bulk JSONL data, interacting with the Handelsregister for on-demand document retrieval, and much more. Each example is constructed to expose every nuance of the module's behavior and usage.
|
||||
|
||||
#### Importing the Module
|
||||
For clarity, we will split this section into multiple parts:
|
||||
|
||||
Begin by importing necessary components from the `@fin.cx/opendata` package. You'll also need to set up some environment variables for the MongoDB instance.
|
||||
1. Environment Setup and Initializing the Library
|
||||
2. Managing Business Records (CRUD Operations)
|
||||
3. Bulk Data Processing and Importing via JSONL Streams
|
||||
4. Integrating with the Handelsregister: Detailed Demonstrations
|
||||
5. Advanced Examples: Combined Operations and Edge Cases
|
||||
6. Error Handling and Data Validation
|
||||
7. Testing and Automated Workflows
|
||||
|
||||
Throughout these examples, we will examine how each class and method interacts with the underlying MongoDB database and the system's file structure. We assume you have a running MongoDB instance and that your environment is configured with the necessary variables.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### 1. Environment Setup and Initializing the Library
|
||||
|
||||
Before diving into any operations, ensure that your development environment is properly configured. The @fin.cx/opendata library mandates several environment variables for connecting to your MongoDB instance. For a smooth experience, it is advisable to use a .env file or any secure secrets management tool that suits your workflow. The required environment variables include:
|
||||
|
||||
• MONGODB_URL – The connection string URL for your MongoDB instance.
|
||||
• MONGODB_NAME – The name of the database that the module will interact with.
|
||||
• MONGODB_USER – MongoDB username required for authentication.
|
||||
• MONGODB_PASS – The password for the MongoDB user.
|
||||
|
||||
Below is an example .env file for local development:
|
||||
|
||||
```
|
||||
MONGODB_URL=mongodb://localhost:27017
|
||||
MONGODB_NAME=opendataDB
|
||||
MONGODB_USER=myUser
|
||||
MONGODB_PASS=myPass
|
||||
```
|
||||
|
||||
Once these variables are set, the library can fetch them using the integrated qenv tool. The following code snippet demonstrates how to import and initialize the library:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
@ -34,220 +58,608 @@ import { OpenData } from '@fin.cx/opendata';
|
||||
const startOpenDataInstance = async () => {
|
||||
const openData = new OpenData();
|
||||
|
||||
await openData.start(); // Start the open data instance
|
||||
console.log('OpenData instance started.');
|
||||
try {
|
||||
console.log('Starting OpenData instance...');
|
||||
await openData.start();
|
||||
console.log('OpenData instance started successfully.');
|
||||
|
||||
// your code here
|
||||
// Invoke sample operations:
|
||||
await demonstrateBusinessRecordsOperations(openData);
|
||||
await demonstrateBulkDataProcessing(openData);
|
||||
await demonstrateHandelsregisterOperations(openData);
|
||||
|
||||
await openData.stop();
|
||||
console.log('OpenData instance stopped.');
|
||||
} catch (error) {
|
||||
console.error('Error during initialization:', error);
|
||||
} finally {
|
||||
console.log('Stopping OpenData instance...');
|
||||
await openData.stop();
|
||||
console.log('OpenData instance stopped.');
|
||||
}
|
||||
};
|
||||
|
||||
startOpenDataInstance().catch(console.error);
|
||||
startOpenDataInstance();
|
||||
```
|
||||
|
||||
### BusinessRecord Usage
|
||||
In this snippet, we import the OpenData class from the module and execute its start and stop routines to ensure that the MongoDB connection is properly initialized and terminated. Between those two calls, the snippet invokes separate demonstration functions, each of which showcases an individual feature.
|
||||
|
||||
A `BusinessRecord` is the main entity you'll be working with. Here's how you manage business records.
|
||||
────────────────────────────────────────────
|
||||
### 2. Managing Business Records (CRUD Operations)
|
||||
|
||||
#### Creating a New BusinessRecord
|
||||
Central to the @fin.cx/opendata library is the management of business records. The BusinessRecord class encapsulates data pertaining to companies, allowing you to create new records, retrieve existing ones, update information, and delete entries when necessary. The following examples illustrate each operation within a robust context.
|
||||
|
||||
#### a) Creating a Business Record
|
||||
|
||||
Creating a new business record in the openData instance is straightforward. You instantiate a new record and populate its data properties with relevant details such as company name, address, registration number, managing directors, and much more. The sample below uses the embedded CBusinessRecord manager to generate a new record:
|
||||
|
||||
```typescript
|
||||
import { BusinessRecord } from '@fin.cx/opendata';
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
const createBusinessRecord = async (openData: OpenData) => {
|
||||
export const createBusinessRecordExample = async (openData: OpenData) => {
|
||||
const businessRecord = new openData.CBusinessRecord();
|
||||
businessRecord.data.name = "Example Company";
|
||||
businessRecord.data.address = "Example Street 1";
|
||||
businessRecord.data.postalCode = "12345";
|
||||
businessRecord.data.city = "Example City";
|
||||
businessRecord.data.country = "Germany";
|
||||
businessRecord.data.phone = "+49 123 456789";
|
||||
businessRecord.data.email = "contact@example.com";
|
||||
businessRecord.data.website = "https://example.com";
|
||||
businessRecord.data.businessType = "GmbH";
|
||||
businessRecord.data.registrationNumber = "HRB 123456";
|
||||
businessRecord.data.registrationCourt = "Munich";
|
||||
businessRecord.data.legalForm = "GmbH";
|
||||
businessRecord.data.managingDirectors = ["John Doe", "Jane Smith"];
|
||||
businessRecord.data.foundingDate = new Date().toISOString();
|
||||
businessRecord.data.capital = "50,000 EUR";
|
||||
businessRecord.data.purpose = "Tech Solutions";
|
||||
businessRecord.data.lastUpdate = new Date().toISOString();
|
||||
|
||||
await businessRecord.save();
|
||||
console.log('BusinessRecord saved:', businessRecord);
|
||||
businessRecord.data = {
|
||||
name: "Innovative Solutions GmbH",
|
||||
address: "Musterstraße 1",
|
||||
postalCode: "10115",
|
||||
city: "Berlin",
|
||||
country: "Germany",
|
||||
phone: "+49 30 123456",
|
||||
email: "contact@innovativesolutions.de",
|
||||
website: "https://innovativesolutions.de",
|
||||
businessType: "GmbH",
|
||||
registrationId: "District court Berlin HRB 987654",
|
||||
legalForm: "GmbH",
|
||||
managingDirectors: ["Max Mustermann", "Erika Musterfrau"],
|
||||
foundingDate: new Date("2018-05-10").toISOString(),
|
||||
capital: "250,000 EUR",
|
||||
purpose: "Technology development and consulting services",
|
||||
lastUpdate: new Date().toISOString()
|
||||
};
|
||||
|
||||
try {
|
||||
await businessRecord.save();
|
||||
console.log('BusinessRecord created successfully:', businessRecord);
|
||||
} catch (error) {
|
||||
console.error('Error creating business record:', error);
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
#### Retrieving BusinessRecord
|
||||
In this example, after setting the business record fields, the record is saved to the MongoDB collection using the save method. The system ensures that the newly created record receives a unique identifier by generating a new ID when saving the document.
|
||||
|
||||
Retrieve a business record by querying the database.
|
||||
#### b) Retrieving Business Records
|
||||
|
||||
To retrieve business records, you can search by various fields such as city, business name, or registration details. The system utilizes MongoDB queries to filter and return relevant documents. Below is a sample function that retrieves all records for companies based in a particular city:
|
||||
|
||||
```typescript
|
||||
import { BusinessRecord } from '@fin.cx/opendata';
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
import type { BusinessRecord } from '@fin.cx/opendata';
|
||||
|
||||
const findBusinessRecord = async (openData: OpenData) => {
|
||||
const businessRecords = await openData.db.collection<BusinessRecord>('businessrecords').find().toArray();
|
||||
console.log('Retrieved Business Records:', businessRecords);
|
||||
export const retrieveRecordsByCity = async (openData: OpenData, city: string) => {
|
||||
try {
|
||||
const records = await openData.db
|
||||
.collection<BusinessRecord>('businessrecords')
|
||||
.find({ city })
|
||||
.toArray();
|
||||
|
||||
console.log(`Retrieved ${records.length} records for city ${city}.`);
|
||||
console.log(records);
|
||||
return records;
|
||||
} catch (error) {
|
||||
console.error('Error retrieving business records:', error);
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
### Updating Business Data
|
||||
This method queries the "businessrecords" collection using a simple filter and converts the cursor into an array of records. You can extend the query to filter by more sophisticated criteria as needed.
|
||||
|
||||
The `GermanBusinessData` class handles the specifics of updating and maintaining the data.
|
||||
#### c) Updating an Existing Business Record
|
||||
|
||||
#### Updating German Business Data
|
||||
Modifying the details of an existing record is a common operation. First, you need to retrieve the record from the database. Once the record is loaded, you can make changes to its properties and then save the updated record back to the database. The following example demonstrates this with a change to the company’s phone number and last update timestamp:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
const updateGermanBusinessData = async (openData: OpenData) => {
|
||||
await openData.germanBusinesses.update();
|
||||
console.log('German business data updated.');
|
||||
};
|
||||
|
||||
startOpenDataInstance()
|
||||
.then((openData) => {
|
||||
// Use the instance
|
||||
return updateGermanBusinessData(openData);
|
||||
})
|
||||
.catch(console.error);
|
||||
```
|
||||
|
||||
This function downloads the latest data from the German business data source, processes it, and updates the local database.
|
||||
|
||||
### Detailed Class Structures and Methods
|
||||
|
||||
#### OpenData Class
|
||||
|
||||
The `OpenData` class is the main entry point.
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
class OpenData {
|
||||
db: plugins.smartdata.SmartdataDb;
|
||||
germanBusinesses: GermanBusinessData;
|
||||
|
||||
private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir);
|
||||
|
||||
public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord);
|
||||
|
||||
public async start() {
|
||||
// Initialize smart data DB
|
||||
this.db = new plugins.smartdata.SmartdataDb({
|
||||
mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'),
|
||||
mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'),
|
||||
mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'),
|
||||
mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
|
||||
});
|
||||
|
||||
await this.db.init();
|
||||
this.germanBusinesses = new GermanBusinessData(this);
|
||||
await this.germanBusinesses.start();
|
||||
}
|
||||
|
||||
public async stop() {
|
||||
// Clean up resources if necessary
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### GermanBusinessData Class
|
||||
|
||||
The `GermanBusinessData` class handles the specifics of German business data.
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
import * as plugins from './plugins';
|
||||
import * as paths from './paths';
|
||||
|
||||
class GermanBusinessData {
|
||||
public openDataRef: OpenData;
|
||||
|
||||
constructor(openDataRef: OpenData) {
|
||||
this.openDataRef = openDataRef;
|
||||
}
|
||||
|
||||
public async start() {
|
||||
await this.update();
|
||||
}
|
||||
|
||||
public async stop() {
|
||||
// Stop any ongoing processing
|
||||
}
|
||||
|
||||
public async update() {
|
||||
const dataUrl = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2';
|
||||
const dataExists = await plugins.smartfile.fs.isDirectory(paths.germanBusinessDataDir);
|
||||
|
||||
if (!dataExists) {
|
||||
await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);
|
||||
export const updateBusinessRecordExample = async (openData: OpenData, recordId: string) => {
|
||||
try {
|
||||
// Retrieve the record by its id using the manager’s helper
|
||||
const businessRecord = await openData.CBusinessRecord.getInstance(recordId);
|
||||
if (!businessRecord) {
|
||||
console.log(`No business record found with id: ${recordId}`);
|
||||
return;
|
||||
}
|
||||
|
||||
const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrl);
|
||||
const jsonlDataStream = await smartarchive.exportToStreamOfStreamFiles();
|
||||
// Update some fields
|
||||
businessRecord.data.phone = "+49 30 654321";
|
||||
businessRecord.data.lastUpdate = new Date().toISOString();
|
||||
|
||||
let totalRecordsCounter = 0;
|
||||
let nextRest: string = '';
|
||||
|
||||
jsonlDataStream.pipe(
|
||||
new plugins.smartstream.SmartDuplex({
|
||||
objectMode: true,
|
||||
writeFunction: async (chunkArg: plugins.smartfile.StreamFile, streamToolsArg) => {
|
||||
const readStream = await chunkArg.createReadStream();
|
||||
readStream.pipe(
|
||||
new plugins.smartstream.SmartDuplex({
|
||||
objectMode: true,
|
||||
writeFunction: async (chunkArg: Buffer, streamToolsArg) => {
|
||||
const currentString = nextRest + chunkArg.toString();
|
||||
|
||||
const lines = currentString.split('\n');
|
||||
nextRest = lines.pop();
|
||||
|
||||
for (const line of lines) {
|
||||
let entry: any;
|
||||
try {
|
||||
entry = JSON.parse(line);
|
||||
} catch (err) {
|
||||
console.error('Error parsing line:', err);
|
||||
continue;
|
||||
}
|
||||
|
||||
totalRecordsCounter++;
|
||||
if (totalRecordsCounter % 10000 === 0) {
|
||||
console.log(`${totalRecordsCounter} total records.`);
|
||||
}
|
||||
|
||||
const businessRecord = new this.openDataRef.CBusinessRecord();
|
||||
businessRecord.data.name = entry?.name;
|
||||
|
||||
await businessRecord.save();
|
||||
}
|
||||
},
|
||||
finalFunction: async (streamToolsArg) => {
|
||||
if (nextRest) {
|
||||
try {
|
||||
JSON.parse(nextRest);
|
||||
} catch (err) {
|
||||
console.error('Error parsing final chunk:', err);
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
},
|
||||
})
|
||||
);
|
||||
// Save the updated record into the database
|
||||
await businessRecord.save();
|
||||
console.log("Business record updated successfully:", businessRecord);
|
||||
} catch (error) {
|
||||
console.error('Error updating business record:', error);
|
||||
}
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
### Conclusion
|
||||
This code snippet presents a robust pattern where errors are caught and logged, ensuring that any update issues can be diagnosed easily.
|
||||
|
||||
This module is designed to make it easier to manage open business data, especially focusing on German business data. The examples above demonstrate the core functionalities, including starting and stopping the service, managing business records, and updating data.
|
||||
#### d) Deleting a Business Record
|
||||
|
||||
As you work with `@fin.cx/opendata`, you’ll discover it offers a robust and flexible approach for working with open business data seamlessly. Happy coding!
|
||||
The deletion of a record is as vital as its creation and modification. The library provides a delete method that removes the specified record from the database. Below is a simple function to delete a business record by its identifier:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * Deletes the business record whose `id` equals `recordId`, if one exists.
 *
 * Lookup or deletion errors are logged via console.error and not rethrown.
 *
 * @param openData - Instance whose `CBusinessRecord` manager performs the lookup.
 * @param recordId - Value of the record's unique `id` field.
 */
export const deleteBusinessRecordExample = async (openData: OpenData, recordId: string) => {
  try {
    // getInstance takes a filter object (see BusinessRecord.getByGermanParsedRegistration),
    // so the id is wrapped instead of being passed as a bare string.
    const businessRecord = await openData.CBusinessRecord.getInstance({ id: recordId });
    if (!businessRecord) {
      console.log(`No business record found with id: ${recordId}`);
      return;
    }
    await businessRecord.delete();
    console.log(`Successfully deleted business record with id: ${recordId}`);
  } catch (error) {
    console.error('Error deleting business record:', error);
  }
};
|
||||
```
|
||||
undefined
|
||||
|
||||
Through this example, you can integrate safe deletion practices in your application, removing outdated or incorrect records without compromising database integrity.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### 3. Bulk Data Processing and Importing via JSONL Streams
|
||||
|
||||
One of the powerful features of the @fin.cx/opendata module is its ability to process large datasets provided in the JSON Lines (JSONL) format. The JsonlDataProcessor class is designed to handle streaming data, processing each record concurrently, and efficiently updating the database.
|
||||
|
||||
This bulk data ingestion mechanism is particularly useful when dealing with large-scale datasets such as the German companies' open data that the module fetches from official data portals. The process involves decompressing, streaming, and parsing data by leveraging pipelines of smart streams and concurrent processors.
|
||||
|
||||
Below is an extended example demonstrating how to process a JSONL data file from a given URL:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * Demonstrates bulk data processing with the JSONL data processor.
 *
 * Start, successful completion, and failure are each logged; a failure is not rethrown.
 *
 * @param openData - Instance whose `jsonLDataProcessor` performs the import.
 * @param dataUrl - Optional source URL; when omitted, the processor falls back to
 *   the official open data URL.
 */
export async function demonstrateBulkDataProcessing(openData: OpenData, dataUrl?: string) {
  try {
    console.log('Starting bulk data processing...');
    const processor = openData.jsonLDataProcessor;
    await processor.processDataFromUrl(dataUrl);
    console.log('Bulk data processing completed successfully.');
  } catch (processingError) {
    console.error('Error during bulk data processing:', processingError);
  }
}
|
||||
```
|
||||
|
||||
In the processDataFromUrl implementation, the library uses a pipeline of smart streams. After downloading the compressed file, it decompresses it and splits the content into discrete JSON lines. The processor then concurrently applies a handler function to each JSON entry. This function extracts relevant company details, instantiates a new BusinessRecord, associates parsed data (for example, registration attributes from German registers), and saves the record to MongoDB.
|
||||
|
||||
A deeper dive into the processing mechanism:
|
||||
• The JSONL data is received as a binary (Buffer) stream.
|
||||
• The stream is piped into a duplex stream that splits the text by newline characters.
|
||||
• Each line is parsed into a JSON object and passed into an asynchronous processing function.
|
||||
• This function creates a new business record and sets properties such as the company name and its registration details, derived from the JSON entry.
|
||||
• As the processor moves through the stream, it logs progress every 10,000 records to give feedback on its bulk processing status.
|
||||
|
||||
By supporting concurrency (with a configurable concurrency limit, e.g., 1000 simultaneous operations), the library ensures that even gigabytes of data are processed efficiently without hitting memory bottlenecks.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### 4. Integrating with the Handelsregister: Detailed Demonstrations
|
||||
|
||||
In addition to CRUD operations and bulk processing, the module includes an integrated Handelsregister system. This sophisticated component leverages a headless browser (via the smartbrowser instance) to interact with the official Handelsregister website. Through this integration, you can search for companies, navigate to specific pages, trigger file downloads (such as PDF or XML data), and parse the downloaded content for further processing.
|
||||
|
||||
#### a) Starting the Handelsregister
|
||||
|
||||
Before executing any search or download operations, the Handelsregister system must be started. The start method initializes required resources including starting a headless browser, ensuring download directories are created, and preparing asynchronous stacks for exclusive execution.
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * Starts the Handelsregister service of the given OpenData instance.
 *
 * A failing start is logged via console.error and not rethrown.
 *
 * @param openData - Instance whose `handelsregister.start()` is awaited.
 */
export async function demonstrateHandelsregisterStart(openData: OpenData) {
  try {
    console.log('Starting Handelsregister services...');
    const registry = openData.handelsregister;
    await registry.start();
    console.log('Handelsregister ready.');
  } catch (startError) {
    console.error('Error starting Handelsregister service:', startError);
  }
}
|
||||
```
|
||||
|
||||
#### b) Searching for a Company Using the Handelsregister
|
||||
|
||||
A common use case is to search for a company by its name. The Handelsregister system creates a dedicated browser page, enters the search criteria into the input fields, selects the appropriate options (such as radio buttons for search type), and clicks the “Find” button. The following function demonstrates how to incorporate these actions:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * Searches the Handelsregister for companies by name and logs the outcome.
 *
 * @param openData - Instance whose `handelsregister.searchCompany` is called.
 * @param companyName - Name forwarded as the search term.
 * @param limit - Second argument forwarded to `searchCompany`; defaults to 100.
 * @returns The records returned by `searchCompany`, or `undefined` when the search
 *   threw (the error is logged, not rethrown).
 */
export async function searchCompanyExample(
  openData: OpenData,
  companyName: string,
  limit: number = 100,
) {
  try {
    console.log(`Searching for company with name "${companyName}"...`);
    const matches = await openData.handelsregister.searchCompany(companyName, limit);
    console.log(`Found ${matches.length} matching records for "${companyName}".`);
    console.log('Records:', matches);
    return matches;
  } catch (searchError) {
    console.error('Error searching for company:', searchError);
  }
  return undefined;
}
|
||||
```
|
||||
|
||||
In this scenario, the Handelsregister component uses internal helper functions:
|
||||
• getNewPage – to create a new browser page with file download behavior enabled.
|
||||
• navigateToPage – which navigates to the “Normal search” page.
|
||||
• Input events – to simulate typing in search criteria.
|
||||
• UI interactions – to select options and trigger the search.
|
||||
|
||||
#### c) Retrieving Detailed Data and Triggering Downloads
|
||||
|
||||
After obtaining general search results, you may wish to retrieve more detailed information about a specific company. Provided you have the parsed registration data (which typically includes the registration court, type, and number), you can instruct the system to navigate to a detailed view and trigger file downloads. These files might include the company’s official registry entry (as an XML file) and additional documents (such as a PDF summary).
|
||||
|
||||
The example below details how to use the Handelsregister functionality to focus on a specific company by leveraging its registration details, then download both the SI (XML) and AD (PDF) files:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * Fetches the detailed Handelsregister entry for one company and logs what came back.
 *
 * @param openData - Instance whose `handelsregister.getSpecificCompany` is called.
 * @param registrationData - Register court, register type, and register number of the company.
 * @returns The value resolved by `getSpecificCompany` (its `businessRecords` and `files`
 *   are logged), or `undefined` when the call threw (the error is logged, not rethrown).
 */
export async function getDetailedCompanyData(
  openData: OpenData,
  registrationData: {
    court?: string;
    type?: 'HRA' | 'HRB' | 'GnR' | 'PR' | 'VR' | 'GsR';
    number?: string;
  },
) {
  try {
    console.log('Retrieving detailed company data...');
    const companyDetail = await openData.handelsregister.getSpecificCompany(registrationData);
    console.log('Retrieved detailed company data.');
    console.log('Business Records:', companyDetail.businessRecords);
    console.log('Downloaded Files:', companyDetail.files);
    return companyDetail;
  } catch (retrievalError) {
    console.error('Error retrieving detailed company data:', retrievalError);
  }
  return undefined;
}
|
||||
```
|
||||
|
||||
In the above example:
|
||||
• The getSpecificCompany method triggers navigation through various UI elements:
|
||||
– Selecting the register type via a dropdown.
|
||||
– Inputting the register number.
|
||||
– Choosing the appropriate register court.
|
||||
• Then, after clicking the “Find” button, the system waits for the results, verifies the visual components on the page, and initiates file downloads.
|
||||
• The downloaded files are renamed according to their type (SI for XML and AD for PDF) and are returned for further processing or storage.
|
||||
|
||||
#### d) Downloading and Processing Files
|
||||
|
||||
The Handelsregister component not only triggers file downloads but also includes utility functions that wait for downloads to complete, clear temporary directories, and output the file objects. You may want to use these file objects to persist data locally, parse file content, or send the data downstream for further analysis.
|
||||
|
||||
Below is an example that covers downloading and saving the files into a custom directory for post-download analysis:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
import * as path from 'path';
|
||||
|
||||
/**
 * Downloads the Handelsregister files for one company and writes each of them into
 * `<current working directory>/downloaded_files`, one after another.
 *
 * Any error during lookup or writing is logged via console.error and not rethrown.
 *
 * @param openData - Instance whose `handelsregister.getSpecificCompany` is called.
 * @param registrationData - Register court, register type, and register number of the company.
 */
export async function downloadAndSaveFilesExample(
  openData: OpenData,
  registrationData: {
    court?: string;
    type?: 'HRA' | 'HRB' | 'GnR' | 'PR' | 'VR' | 'GsR';
    number?: string;
  },
) {
  try {
    console.log('Initiating specific company download...');
    const { files: downloadedFiles } = await openData.handelsregister.getSpecificCompany(
      registrationData,
    );
    // A relative segment resolves against process.cwd().
    const targetDir = path.resolve('downloaded_files');

    // Written sequentially so each "File saved" line follows its own write.
    for (const downloaded of downloadedFiles) {
      await downloaded.writeToDir(targetDir);
      console.log(`File saved: ${downloaded.path}`);
    }
  } catch (downloadError) {
    console.error('Error during file download and save process:', downloadError);
  }
}
|
||||
```
|
||||
|
||||
This function demonstrates a complete flow from launching the Handelsregister detailed company search to saving the downloaded files to disk. This example is particularly useful in scenarios where the downloaded documents need to be processed further, such as converting XML to JSON or extracting text from PDFs.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### 5. Advanced Examples: Combined Operations and Edge Cases
|
||||
|
||||
Given the numerous functionalities offered by the library, you can combine various operations to create more complex workflows. One such example is an end-to-end pipeline that:
|
||||
1. Initializes the open data instance.
|
||||
2. Processes an initial bulk data import.
|
||||
3. Searches for key business records that match specific criteria.
|
||||
4. Updates individual records based on additional data retrieved from the Handelsregister.
|
||||
5. Handles error conditions and retries processes where necessary.
|
||||
|
||||
The following advanced example integrates these steps:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * End-to-end example: start OpenData, run the bulk import, refresh every Munich
 * record that carries parsed registration data, then delete the first of them.
 *
 * A failure while updating one record is logged and the loop moves on to the next
 * record; any other failure is logged once by the outer catch. `openData.stop()`
 * runs in `finally` in both cases.
 */
const advancedWorkflowExample = async () => {
  const openData = new OpenData();

  try {
    console.log('Starting advanced workflow...');
    await openData.start();

    // Step 1: Bulk data import from external JSONL source
    console.log('Building initial database from bulk import...');
    await openData.buildInitialDb();

    // Step 2: Search for companies in a selected area (for instance, Munich).
    // `city` is stored inside the record's `data` object, so the filter has to be
    // nested. Querying through the record class yields BusinessRecord instances,
    // which is what makes `record.save()` in step 3 available.
    console.log('Retrieving companies located in Munich...');
    const munichRecords = await openData.CBusinessRecord.getInstances({
      data: { city: 'Munich' },
    });
    console.log(`Found ${munichRecords.length} companies in Munich.`);

    // Step 3: For each record, perform an update operation based on new file downloads
    for (const record of munichRecords) {
      try {
        console.log(`Updating record for company: ${record.data.name}`);
        // Assuming the record contains parsed registration info
        if (record.data.germanParsedRegistration) {
          // detailedData.businessRecords and detailedData.files carry the fetched details;
          // copy whatever you need from them onto record.data before saving.
          const detailedData = await openData.handelsregister.getSpecificCompany(
            record.data.germanParsedRegistration,
          );
          record.data.lastUpdate = new Date().toISOString();
          await record.save();
          console.log(`Updated record for ${record.data.name}.`);
        } else {
          console.log(`No registration data available for ${record.data.name}; skipping update.`);
        }
      } catch (innerError) {
        // Contained per record so one failing company does not abort the whole loop.
        console.error(`Error updating record for ${record.data.name}:`, innerError);
      }
    }

    // Step 4: Demonstrate retrieval and deletion
    const recordToDeleteId = munichRecords.length > 0 ? munichRecords[0].id : null;
    if (recordToDeleteId) {
      console.log(`Deleting record with id: ${recordToDeleteId}`);
      // getInstance takes a filter object, not a bare id string.
      const recordToDelete = await openData.CBusinessRecord.getInstance({ id: recordToDeleteId });
      if (recordToDelete) {
        await recordToDelete.delete();
        console.log(`Record ${recordToDeleteId} deleted successfully.`);
      }
    }
  } catch (error) {
    console.error('Advanced workflow encountered an error:', error);
  } finally {
    console.log('Ending advanced workflow and stopping OpenData instance.');
    await openData.stop();
  }
};
|
||||
|
||||
advancedWorkflowExample();
|
||||
```
|
||||
|
||||
This advanced workflow not only illustrates the coordinated use of bulk data import, search, update, and delete operations but also demonstrates the integration of browser automation for fetching detailed data. The error handling at each step ensures that even if a particular operation fails, the workflow continues in a controlled fashion.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### 6. Error Handling and Data Validation
|
||||
|
||||
Robust systems must gracefully handle errors and ensure data consistency. The @fin.cx/opendata library has built-in error handling for asynchronous operations, whether connecting to MongoDB, processing JSON streams, or interacting with web pages. In addition, each BusinessRecord instance provides a validate method that performs basic checks (for instance, ensuring that a company name is present) before a record is saved into the database.
|
||||
|
||||
The snippet below shows how to wrap operations in try/catch blocks and use the validate method:
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * Builds a sample business record, validates it, and saves it.
 *
 * A validation or save failure is logged via console.error and not rethrown.
 *
 * @param openData - Instance providing the `CBusinessRecord` constructor.
 */
export async function validateAndSaveRecord(openData: OpenData) {
  const candidate = new openData.CBusinessRecord();
  candidate.data = {
    name: "Validation Test Corp",
    address: "Teststraße 99",
    postalCode: "12345",
    city: "Teststadt",
    country: "Germany",
    phone: "+49 123 456789",
    email: "test@testcorp.de",
    businessType: "AG",
    registrationId: "District court Teststadt HRB 111111",
    legalForm: "AG",
    managingDirectors: ["Test Director"],
    foundingDate: new Date().toISOString(),
    capital: "1,000,000 EUR",
    purpose: "Testing for data validation",
    lastUpdate: new Date().toISOString(),
  };

  try {
    // save() is only reached when validate() did not throw.
    await candidate.validate();
    await candidate.save();
    console.log("Record validated and saved successfully.");
  } catch (validationOrSaveError) {
    console.error("Error validating or saving record:", validationOrSaveError);
  }
}
|
||||
```
|
||||
|
||||
Using proper error handling ensures that the entire system remains reliable, and any data validation issues are caught early during development or in production.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### 7. Testing and Automated Workflows
|
||||
|
||||
To support continuous integration and adherence to best practices, the @fin.cx/opendata module includes tests written with @push.rocks/tapbundle. You should consider incorporating these tests in your development workflow. The tests verify all main functionalities including instance initialization, bulk data import, Handelsregister operations, and CRUD operations for BusinessRecords.
|
||||
|
||||
Below is an example of a simple test written in TypeScript using ESM that makes use of the module:
|
||||
|
||||
```typescript
|
||||
import { expect, tap } from '@push.rocks/tapbundle';
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
// Shared by all tests below: assigned in 'Instance creation', stopped in 'Stop instance'.
let openDataUnderTest: OpenData;

// The instance is constructed without arguments and must be an OpenData.
tap.test('Instance creation', async () => {
  openDataUnderTest = new OpenData();
  expect(openDataUnderTest).toBeInstanceOf(OpenData);
});

// start() has to resolve before the import test runs.
tap.test('Start instance', async () => {
  await openDataUnderTest.start();
});

// The bulk import passes when buildInitialDb() resolves without throwing.
tap.test('Perform bulk import', async () => {
  await openDataUnderTest.buildInitialDb();
});

// Stops the shared instance again.
tap.test('Stop instance', async () => {
  await openDataUnderTest.stop();
});

tap.start();
|
||||
```
|
||||
|
||||
This test code is designed to verify that the OpenData instance is successfully created, started, performs the critical bulk import operation, and is properly shut down. Integration tests for the Handelsregister functionality follow a similar pattern and ensure that the browser automation routines and file download processes complete without errors.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### Comprehensive Example: Full Cycle from Initialization to Cleanup
|
||||
|
||||
To better illustrate how one might combine several aspects of the module in a production scenario, here's a comprehensive example that ties together initialization, CRUD operations, bulk processing, and Handelsregister interactions. This full-cycle example is written in TypeScript using ESM syntax and demonstrates how to build a production-grade data update and management pipeline.
|
||||
|
||||
```typescript
|
||||
import { OpenData } from '@fin.cx/opendata';
|
||||
|
||||
/**
 * Full-cycle example: start OpenData, bulk-import the open data, create a sample
 * record, query and update Munich records, look a company up in the Handelsregister,
 * save its downloaded files, and finally validate and save a test record.
 *
 * Any error aborts the remaining steps and is logged once; `openData.stop()` runs
 * in `finally` regardless of the outcome.
 */
const runFullCyclePipeline = async () => {
  const openData = new OpenData();

  try {
    // Initialize the module and connect to MongoDB
    console.log('Initializing the OpenData module...');
    await openData.start();

    // Step 1: Bulk Import - Build the initial database from downloaded open data
    console.log('Starting bulk data import from JSONL source...');
    await openData.buildInitialDb();

    // Step 2: Business Record Management - Create a sample business record
    console.log('Creating a new business record...');
    const sampleRecord = new openData.CBusinessRecord();
    sampleRecord.data = {
      name: "Sample Enterprise GmbH",
      address: "Innovation Avenue 101",
      postalCode: "80807",
      city: "Munich",
      country: "Germany",
      phone: "+49 89 111222",
      email: "info@sampleenterprise.de",
      website: "https://sampleenterprise.de",
      businessType: "GmbH",
      registrationId: "District court Munich HRB 555555",
      legalForm: "GmbH",
      managingDirectors: ["Director A", "Director B"],
      foundingDate: new Date("2015-06-15").toISOString(),
      capital: "500,000 EUR",
      purpose: "Holistic business solutions and data processing",
      lastUpdate: new Date().toISOString(),
    };

    await sampleRecord.save();
    console.log('Sample business record created with id:', sampleRecord.id);

    // Step 3: Retrieve business records for a specific location.
    // `city` is stored inside the record's `data` object (see step 2), so the
    // filter is nested accordingly.
    console.log('Retrieving business records for Munich...');
    const munichRecords = await openData.CBusinessRecord.getInstances({
      data: { city: 'Munich' },
    });
    console.log(`Found ${munichRecords.length} records for Munich.`);

    // Step 4: Update an existing record
    if (munichRecords.length > 0) {
      const recordToUpdateId = munichRecords[0].id;
      console.log(`Updating business record with id: ${recordToUpdateId}`);
      // getInstance takes a filter object, not a bare id string.
      const recordToUpdate = await openData.CBusinessRecord.getInstance({ id: recordToUpdateId });
      if (recordToUpdate) {
        recordToUpdate.data.phone = "+49 89 999888";
        recordToUpdate.data.lastUpdate = new Date().toISOString();
        await recordToUpdate.save();
        console.log('Business record updated:', recordToUpdate);
      }
    }

    // Step 5: Use Handelsregister to search for a specific company
    console.log('Using Handelsregister to search for a specific company...');
    const searchResults = await openData.handelsregister.searchCompany("Step Beyond GmbH", 20);
    if (searchResults && searchResults.length > 0) {
      const registrationData = searchResults[0].germanParsedRegistration;
      console.log('Retrieved registration data:', registrationData);

      // Step 6: Retrieve detailed info and download files for the specific company
      console.log('Fetching detailed data for the identified company...');
      const detailedInfo = await openData.handelsregister.getSpecificCompany(registrationData);
      console.log('Detailed company data received:', detailedInfo);

      // Optionally, save the downloaded files to a designated directory
      for (const downloadedFile of detailedInfo.files) {
        await downloadedFile.writeToDir('./output_files');
        console.log(`Downloaded file saved at: ${downloadedFile.path}`);
      }
    } else {
      console.log('No matching records found for detailed company data retrieval.');
    }

    // Step 7: Validate and save a record to demonstrate error handling and validation.
    // validateAndSaveRecord is the helper from "6. Error Handling and Data Validation";
    // it must be defined or imported in this module for the call to resolve.
    console.log('Validating and saving a new test record...');
    await validateAndSaveRecord(openData);
  } catch (error) {
    console.error('An error occurred during the full cycle pipeline operation:', error);
  } finally {
    // Final cleanup: Stop the OpenData module and release all resources
    console.log('Finalizing: stopping the OpenData module...');
    await openData.stop();
    console.log('Pipeline completed and all resources have been cleaned up.');
  }
};
|
||||
|
||||
runFullCyclePipeline();
|
||||
```
|
||||
|
||||
In this example, the entire processing cycle is constructed to mimic a realistic scenario. The pipeline:
|
||||
• Starts by connecting to your database.
|
||||
• Imports extensive JSONL open data.
|
||||
• Creates, retrieves, and updates business records.
|
||||
• Interacts with the Handelsregister for advanced company-specific operations.
|
||||
• Implements robust error handling and validation routines, ensuring that each step is verifiable.
|
||||
• Finally, ensures that resources such as MongoDB connections and headless browser sessions are responsibly closed.
|
||||
|
||||
────────────────────────────────────────────
|
||||
### Final Thoughts on Module Integration
|
||||
|
||||
The @fin.cx/opendata library is designed to cater to a wide range of business data management needs. Whether you are an enterprise looking to integrate updated open data for decision-making or a developer looking to build data-rich applications with a focus on German companies, this library provides the tools and abstractions necessary to build robust solutions.
|
||||
|
||||
Every component—from the smart data management for business records to the advanced streaming and concurrent processing of JSONL files—is built with scalability and ease of use in mind. Integration with the Handelsregister via browser automation further extends its reach, providing dynamic access to official data sources in real time.
|
||||
|
||||
As demonstrated in the examples above, each sub-component of the library is independent yet harmoniously integrated into a cohesive user experience. The use of ESM syntax throughout the module and the strict adherence to TypeScript definitions enhance reliability, maintainability, and the overall developer experience.
|
||||
|
||||
By following the usage scenarios provided in this documentation, you should now have a deep understanding of how to:
|
||||
• Set up your environment and initialize the OpenData instance.
|
||||
• Perform CRUD operations on business records.
|
||||
• Efficiently process thousands of records from external JSONL sources.
|
||||
• Integrate and automate Handelsregister interactions for detailed company data retrieval.
|
||||
• Combine all building blocks into advanced automated workflows that support large-scale enterprise applications.
|
||||
|
||||
Feel free to explore, extend, and customize these examples to suit your project’s unique requirements. The library is designed with extensibility in mind, and additional utility functions or integrations can be added based on your needs.
|
||||
|
||||
We encourage you to integrate these practices into your development processes, run the provided tests, and contribute to further enhancements that can benefit the entire community of developers working on open data management systems.
|
||||
|
||||
Happy coding and data integrating!
|
||||
|
||||
|
||||
|
||||
## License and Legal Information
|
||||
|
||||
This repository contains open-source code that is licensed under the MIT License. A copy of the MIT License can be found in the [license](license) file within this repository.
|
||||
|
||||
**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.
|
||||
|
||||
### Trademarks
|
||||
|
||||
This project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH and are not included within the scope of the MIT license granted herein. Use of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines, and any usage must be approved in writing by Task Venture Capital GmbH.
|
||||
|
||||
### Company Information
|
||||
|
||||
Task Venture Capital GmbH
|
||||
Registered at District court Bremen HRB 35230 HB, Germany
|
||||
|
||||
For any legal inquiries or if you require further information, please contact us via email at hello@task.vc.
|
||||
|
||||
By using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works.
|
||||
|
42
test/test.handelsregister.ts
Normal file
42
test/test.handelsregister.ts
Normal file
@ -0,0 +1,42 @@
|
||||
import { expect, expectAsync, tap } from '@push.rocks/tapbundle';
|
||||
import * as opendata from '../ts/index.js'
|
||||
|
||||
import { BusinessRecord } from '../ts/classes.businessrecord.js';
|
||||
|
||||
// Shared by all tests below: assigned in 'first test', stopped in the last test.
let testOpenDataInstance: opendata.OpenData;

tap.test('first test', async () => {
  testOpenDataInstance = new opendata.OpenData();
  expect(testOpenDataInstance).toBeInstanceOf(opendata.OpenData);
});

tap.test('should start the instance', async () => {
  await testOpenDataInstance.start();
});

// The handle is kept so the next test can await this test's return value
// through `testResultPromise`.
const resultsSearch = tap.test('should get the data for a company', async () => {
  const result = await testOpenDataInstance.handelsregister.searchCompany('LADR', 20);
  console.log(result);
  return result;
});

tap.test('should get the data for a specific company', async () => {
  const searchResults = await resultsSearch.testResultPromise;
  // Fail with an explicit message rather than a TypeError from indexing into
  // an empty or missing result.
  if (!searchResults?.length) {
    throw new Error('searchCompany returned no results; no company available for the detail lookup');
  }
  const testCompany: BusinessRecord['data']['germanParsedRegistration'] =
    searchResults[0]['germanParsedRegistration'];
  console.log(`trying to find specific company with:`);
  console.log(testCompany);
  const result = await testOpenDataInstance.handelsregister.getSpecificCompany(testCompany);
  console.log(result);

  // Await every write so the test does not end while files are still being written.
  await Promise.all(result.files.map(async (file) => {
    await file.writeToDir('./.nogit/testoutput');
  }));
});

tap.test('should stop the instance', async () => {
  await testOpenDataInstance.stop();
});

tap.start();
|
11
test/test.ts
11
test/test.ts
@ -1,6 +1,8 @@
|
||||
import { expect, expectAsync, tap } from '@push.rocks/tapbundle';
|
||||
import * as opendata from '../ts/index.js'
|
||||
|
||||
import { BusinessRecord } from '../ts/classes.businessrecord.js';
|
||||
|
||||
let testOpenDataInstance: opendata.OpenData;
|
||||
|
||||
tap.test('first test', async () => {
|
||||
@ -12,4 +14,13 @@ tap.test('should start the instance', async () => {
|
||||
await testOpenDataInstance.start();
|
||||
})
|
||||
|
||||
tap.test('should build initial data', async () => {
|
||||
await testOpenDataInstance.buildInitialDb();
|
||||
});
|
||||
|
||||
tap.test('should stop the instance', async () => {
|
||||
await testOpenDataInstance.stop();
|
||||
});
|
||||
|
||||
|
||||
tap.start()
|
||||
|
@ -3,6 +3,6 @@
|
||||
*/
|
||||
export const commitinfo = {
|
||||
name: '@fin.cx/opendata',
|
||||
version: '1.1.1',
|
||||
description: 'A TypeScript-based library for accessing and managing open business data, specifically for German companies.'
|
||||
version: '1.5.3',
|
||||
description: 'A comprehensive TypeScript library that manages open business data for German companies by integrating MongoDB, processing JSONL bulk data, and automating browser interactions for Handelsregister data retrieval.'
|
||||
}
|
||||
|
@ -1,32 +1,67 @@
|
||||
import * as plugins from './plugins.js';
|
||||
|
||||
/**
 * A business record document stored through smartdata.
 *
 * All business information lives in the `data` property; every field of it
 * is optional.
 */
@plugins.smartdata.Manager()
export class BusinessRecord extends plugins.smartdata.SmartDataDbDoc<
  BusinessRecord,
  BusinessRecord
> {
  // STATIC
  /**
   * Looks up a business record by its parsed German registration
   * (court / register type / register number).
   *
   * @param parsedGermanRegistrationArg the registration used as query filter
   * @returns whatever `BusinessRecord.getInstance()` resolves to for that filter
   */
  public static getByGermanParsedRegistration = async (
    parsedGermanRegistrationArg: BusinessRecord['data']['germanParsedRegistration']
  ) => {
    const businessRecord = await BusinessRecord.getInstance({
      data: {
        germanParsedRegistration: parsedGermanRegistrationArg,
      },
    });
    return businessRecord;
  };

  // INSTANCE
  @plugins.smartdata.unI()
  id: string;

  @plugins.smartdata.svDb()
  data: {
    name?: string;
    startDate?: string;
    endDate?: string;
    status?: 'active' | 'liquidating' | 'closed';
    address?: string;
    postalCode?: string;
    city?: string;
    country?: string;
    phone?: string;
    fax?: string;
    email?: string;
    website?: string;
    businessType?: string;
    // the registration text as one unparsed string
    registrationId?: string;
    // the registration split into its parts
    germanParsedRegistration?: {
      court?: string;
      type?: 'HRA' | 'HRB' | 'GnR' | 'PR' | 'VR' | 'GsR';
      number?: string;
    };
    legalForm?:
      | 'GmbH'
      | 'GmbH & Co. KG'
      | 'AG'
      | 'LLC'
      | 'LLP'
      | 'GmbH & Co. KGaA'
      | 'GmbH & Co. KGaA, LLC';
    managingDirectors?: string[];
    boardOfDirectors?: string[];
    supervisoryBoard?: string[];
    foundingDate?: string;
    capital?: string;
    purpose?: string;
    lastUpdate?: string;
  } = {};

  /**
   * Validates the record.
   *
   * Currently this only checks that `data.name` is set; no lookup against
   * the Handelsregister is performed here.
   *
   * @throws Error when `data.name` is missing or empty
   */
  public async validate() {
    if (!this.data.name) throw new Error('Name is required.');
  }
}
|
@ -1,84 +0,0 @@
|
||||
import * as plugins from './plugins.js';
|
||||
import * as paths from './paths.js';
|
||||
import type { OpenData } from './classes.main.opendata.js';
|
||||
|
||||
/**
 * Imports German company records from the offeneregister.de JSONL dump and
 * stores them as business records of the referenced OpenData instance.
 */
export class GermanBusinessData {
  public openDataRef: OpenData;

  constructor(openDataRefArg: OpenData) {
    this.openDataRef = openDataRefArg;
  }

  /** Starts the data source by triggering an update run. */
  public async start() {
    await this.update();
  }

  /** Nothing to tear down. */
  public async stop() {}

  /**
   * Streams the bz2-compressed JSONL dump and saves one business record
   * (name only) per line.
   *
   * NOTE: as before, this resolves once the stream pipeline is set up; it
   * does not wait for the import to finish.
   */
  public async update() {
    const dataUrl = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2';
    // ensureDir is a no-op when the directory already exists
    await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);

    const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrl);
    const jsonlDataStream = await smartarchive.exportToStreamOfStreamFiles();
    let totalRecordsCounter = 0;
    // carries the incomplete last line of a chunk over to the next chunk
    let nextRest: string = '';

    // parses a single JSONL line and stores it as a business record
    const storeLine = async (line: string) => {
      if (!line) return;
      let entry: any;
      try {
        entry = JSON.parse(line);
      } catch (err) {
        // log the unparsable line and pause before continuing
        // NOTE(review): the 10s pause looks like a debugging aid - confirm it is still wanted
        console.log(line);
        await plugins.smartdelay.delayFor(10000);
      }
      if (!entry) return;
      totalRecordsCounter++;
      if (totalRecordsCounter % 10000 === 0) console.log(`${totalRecordsCounter} total records.`);
      const businessRecord = new this.openDataRef.CBusinessRecord();
      businessRecord.data.name = entry.name;
      await businessRecord.save();
    };

    jsonlDataStream.pipe(
      new plugins.smartstream.SmartDuplex({
        objectMode: true,
        writeFunction: async (chunkArg: plugins.smartfile.StreamFile, streamToolsArg) => {
          const readStream = await chunkArg.createReadStream();
          readStream.pipe(
            new plugins.smartstream.SmartDuplex({
              objectMode: true,
              writeFunction: async (chunkArg: Buffer, streamToolsArg) => {
                const currentString = nextRest + chunkArg.toString();
                const lines = currentString.split('\n');
                nextRest = lines.pop();
                console.log(`Got another ${lines.length} records.`);
                for (const line of lines) {
                  await storeLine(line);
                }
              },
              finalFunction: async (streamToolsArg) => {
                // the last line may come without a trailing newline; it used
                // to be parsed and then dropped, now it is stored as well
                const trailingLine = nextRest;
                nextRest = '';
                await storeLine(trailingLine);
              },
            })
          );
        },
      })
    );
  }

  /**
   * Finds a business record whose name contains the given text
   * (case-insensitive).
   *
   * @param nameArg text to look for; it is matched literally, regular
   *   expression meta characters (e.g. in "GmbH & Co. KG") are escaped
   */
  public async getBusinessRecordByName(nameArg: string) {
    const escapedName = nameArg.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const businessRecord = await this.openDataRef.CBusinessRecord.getInstance({
      data: {
        name: { $regex: escapedName, $options: "i" } as any,
      },
    });
    return businessRecord;
  }
}
|
358
ts/classes.handelsregister.ts
Normal file
358
ts/classes.handelsregister.ts
Normal file
@ -0,0 +1,358 @@
|
||||
import type { BusinessRecord } from './classes.businessrecord.js';
|
||||
import type { OpenData } from './classes.main.opendata.js';
|
||||
import * as plugins from './plugins.js';
|
||||
import * as paths from './paths.js';
|
||||
|
||||
/**
 * The Handelsregister (https://www.handelsregister.de/) exposed as a class.
 *
 * All lookups drive the website through a headless browser page. Public
 * lookups run through an AsyncExecutionStack in exclusive slots.
 */
export class HandelsRegister {
  private openDataRef: OpenData;
  private asyncExecutionStack = new plugins.lik.AsyncExecutionStack();
  // per-instance folder the browser downloads into
  private uniqueDowloadFolder = plugins.path.join(paths.downloadDir, plugins.smartunique.uniSimple());

  // Puppeteer wrapper instance
  public smartbrowserInstance = new plugins.smartbrowser.SmartBrowser();

  constructor(openDataRef: OpenData) {
    this.openDataRef = openDataRef;
  }

  /** Creates the download folder and starts the browser. */
  public async start() {
    await plugins.smartfile.fs.ensureDir(this.uniqueDowloadFolder);
    await this.smartbrowserInstance.start();
  }

  /** Removes the download folder and stops the browser. */
  public async stop() {
    try {
      await plugins.smartfile.fs.remove(this.uniqueDowloadFolder);
    } finally {
      // stop the browser even if the folder could not be removed
      await this.smartbrowserInstance.stop();
    }
  }

  /**
   * Creates a new page, configures it to allow file downloads into the
   * instance's download folder and opens the Handelsregister start page.
   * The page is closed again if any of the setup steps fails.
   */
  public getNewPage = async () => {
    const page = await this.smartbrowserInstance.headlessBrowser.newPage();
    try {
      // 1) Create a DevTools session for this page
      const cdpSession = await page.target().createCDPSession();

      // 2) Allow file downloads and set the download path
      await cdpSession.send('Page.setDownloadBehavior', {
        behavior: 'allow',
        downloadPath: this.uniqueDowloadFolder,
      });

      // 3) Set the viewport and open the start page
      await page.setViewport({ width: 1920, height: 1080 });
      await page.goto('https://www.handelsregister.de/');
      return page;
    } catch (error) {
      // do not leak the page; a failing close must not hide the original error
      await page.close().catch(() => {});
      throw error;
    }
  };

  /**
   * Clicks the menu entry whose text equals `pageNameArg`.
   * Best effort: problems are logged, not thrown.
   */
  private navigateToPage = async (
    pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page,
    pageNameArg: string
  ) => {
    try {
      const clicked = await pageArg.evaluate((pageNameArg2) => {
        const elements = Array.from(document.querySelectorAll('.ui-menuitem-text > span'));
        const targetElement = elements.find((el) => el.textContent?.trim() === pageNameArg2);
        if (!targetElement) return false;
        (targetElement as HTMLElement).click();
        return true;
      }, pageNameArg);
      if (clicked) {
        console.log(`Navigated to the ${pageNameArg} page successfully.`);
      } else {
        // previously reported as success although nothing was clicked
        console.error(`Failed to navigate to the ${pageNameArg} page: menu entry not found.`);
      }
    } catch (error) {
      console.error(`Failed to navigate to the ${pageNameArg} page:`, error);
    }
  };

  /**
   * Waits (up to 30s) for the results table and reads one record per row.
   * On timeout a screenshot is written to the download dir and the error is rethrown.
   */
  private waitForResults = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page) => {
    await pageArg
      .waitForSelector('#ergebnissForm\\:selectedSuchErgebnisFormTable_data', {
        timeout: 30000,
      })
      .catch(async (err) => {
        // the screenshot is a debugging aid only; if it fails, still surface the original error
        await pageArg
          .screenshot({ path: plugins.path.join(paths.downloadDir, 'error.png') })
          .catch(() => {});
        throw err;
      });

    const businessRecords: BusinessRecord['data'][] = await pageArg.evaluate(() => {
      const rows = document.querySelectorAll(
        '#ergebnissForm\\:selectedSuchErgebnisFormTable_data > tr'
      );
      const records: BusinessRecord['data'][] = [];

      rows.forEach((row) => {
        const nameElement = row.querySelector('td.ui-panelgrid-cell span.marginLeft20');
        const cityElement = row.querySelector('td.ui-panelgrid-cell.sitzSuchErgebnisse span');
        const statusElement = row.querySelector('td.ui-panelgrid-cell span.verticalText');
        const registrationCourtElement = row.querySelector(
          'td.ui-panelgrid-cell.fontTableNameSize'
        );

        // Push parsed data into records array
        // NOTE(review): the status cell text is stored as businessType - confirm this is intended
        records.push({
          name: nameElement?.textContent?.trim(),
          city: cityElement?.textContent?.trim(),
          registrationId: registrationCourtElement?.textContent?.trim(),
          businessType: statusElement?.textContent?.trim(),
        });
      });

      return records;
    });
    return businessRecords;
  };

  /**
   * Sets the "results per page" select to `resultsLimitArg` and clicks the
   * button labelled "Find". Best effort: problems are logged, not thrown.
   */
  private clickFindButton = async (pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page, resultsLimitArg: number = 100) => {
    try {
      // Wait for the button texts to appear
      await pageArg.waitForSelector('span.ui-button-text.ui-c', { timeout: 5000 });

      // adjust the number of results per page
      await pageArg.select('#form\\:ergebnisseProSeite_input', `${resultsLimitArg}`);

      // Locate and click the button using its text
      const clicked = await pageArg.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('span.ui-button-text.ui-c'));
        const targetButton = buttons.find((button) => button.textContent?.trim() === 'Find');
        if (!targetButton) return false;
        const parentButton = targetButton.closest('button') || targetButton;
        (parentButton as HTMLElement).click();
        return true;
      });

      if (clicked) {
        console.log('Find button clicked successfully!');
      } else {
        // previously reported as success although nothing was clicked
        console.error('Failed to find or click the "Find" button: no button labelled "Find" on the page.');
      }
    } catch (error) {
      console.error('Failed to find or click the "Find" button:', error);
    }
  };

  /**
   * Clicks the AD or SI download link in the last cell of the first result
   * row and returns the downloaded file, renamed to `ad.pdf` / `si.xml`.
   *
   * @throws Error when the table, row, cell or requested link is missing,
   *   or when no file shows up in the download folder
   */
  private async downloadFile(
    pageArg: plugins.smartbrowser.smartpuppeteer.puppeteer.Page,
    typeArg: 'SI' | 'AD'
  ) {
    // Trigger the file download by clicking on the relevant link
    await pageArg.evaluate((typeArg2) => {
      // Locate the table body
      const tableBody = document.querySelector(
        '#ergebnissForm\\:selectedSuchErgebnisFormTable_data'
      );
      if (!tableBody) {
        throw new Error('Table body not found');
      }

      // Locate the first row
      const firstRow = tableBody.querySelector('tr:nth-child(1)');
      if (!firstRow) {
        throw new Error('First row not found');
      }

      // Locate the last cell in the first row
      const lastCell = firstRow.querySelector('td:last-child');
      if (!lastCell) {
        throw new Error('Last cell not found in the last row'.replace('last row', 'first row'));
      }

      // Pick the requested download link: AD is the first, SI the last <a>
      let link: Element | null;
      switch (typeArg2) {
        case 'AD':
          link = lastCell.querySelector('a:first-of-type');
          break;
        case 'SI':
          link = lastCell.querySelector('a:last-of-type');
          break;
        default:
          throw new Error('Invalid file type');
      }
      // check the link that is actually going to be clicked
      if (!link) {
        throw new Error(`${typeArg2} link not found in the last cell`);
      }
      (link as HTMLElement).click();
    }, typeArg);

    await plugins.smartfile.fs.waitForFileToBeReady(this.uniqueDowloadFolder);

    const files = await plugins.smartfile.fs.fileTreeToObject(this.uniqueDowloadFolder, '**/*');
    const file = files[0];

    // lets clear the folder for the next download
    await plugins.smartfile.fs.ensureEmptyDir(this.uniqueDowloadFolder);
    if (!file) {
      throw new Error(`No ${typeArg} file was downloaded.`);
    }
    switch (typeArg) {
      case 'AD':
        await file.rename(`ad.pdf`);
        break;
      case 'SI':
        await file.rename(`si.xml`);
        break;
    }
    return file;
  }

  /**
   * Helper method to parse the German registration string,
   * e.g. "District court Berlin (Charlottenburg) HRB 123456".
   *
   * @returns court, register type and number, or undefined when the input
   *   does not match
   */
  private async parseGermanRegistration(
    input: string
  ): Promise<BusinessRecord['data']['germanParsedRegistration']> {
    const regex =
      /District court (\p{L}[\p{L}\s-]*?(?:\s*\([\p{L}\s-]+\))?)\s+(HRA|HRB|GnR|VR|PR|GsR)\s+(\d+)/u;
    const match = input.match(regex);
    if (!match) return undefined;

    return {
      court: match[1],
      // the cast covers every register type the regex can capture
      type: match[2] as 'HRA' | 'HRB' | 'GnR' | 'PR' | 'VR' | 'GsR',
      number: match[3],
    };
  }

  /**
   * Search for a company by name and return basic info.
   *
   * @param companyNameArg text entered into the search field
   * @param resultsLimitArg value selected for "results per page"
   */
  public async searchCompany(companyNameArg: string, resultsLimitArg: number = 100) {
    return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => {
      const page = await this.getNewPage();
      try {
        await this.navigateToPage(page, 'Normal search');

        try {
          // Wait for the textarea to appear
          await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

          // Enter text into the textarea
          await page.evaluate((text) => {
            const textarea = document.querySelector<HTMLTextAreaElement>('#form\\:schlagwoerter');
            if (textarea) {
              textarea.value = text; // Set the value
              // Trigger the change event manually if required
              const event = new Event('change', { bubbles: true });
              textarea.dispatchEvent(event);
            }
          }, companyNameArg);

          console.log('Text entered successfully!');
        } catch (error) {
          console.error('Failed to find or enter text into the textarea:', error);
        }

        try {
          // Wait for the radio button's label to appear
          await page.waitForSelector('label[for="form:schlagwortOptionen:0"]', { timeout: 5000 });

          // Click the label to select the radio button
          await page.evaluate(() => {
            const label = document.querySelector<HTMLLabelElement>(
              'label[for="form:schlagwortOptionen:0"]'
            );
            if (label) {
              label.click();
            }
          });

          console.log('Radio button clicked successfully!');
        } catch (error) {
          console.error('Failed to find or click the radio button:', error);
        }

        await this.clickFindButton(page, resultsLimitArg);

        const businessRecords = await this.waitForResults(page);

        // Parse out the registration info
        for (const record of businessRecords) {
          if (record.registrationId) {
            record.germanParsedRegistration = await this.parseGermanRegistration(
              record.registrationId
            );
          }
        }

        return businessRecords;
      } finally {
        // close the page on success and on failure
        await page.close();
      }
    }, 60000);
  }

  /**
   * Search for a specific company (known register type/number/court),
   * then download its SI and AD files.
   *
   * @returns the result rows plus the downloaded files
   * @throws Error when no registration is given
   */
  public async getSpecificCompany(companyArg: BusinessRecord['data']['germanParsedRegistration']) {
    if (!companyArg) {
      throw new Error('A parsed German registration is required to look up a specific company.');
    }
    return this.asyncExecutionStack.getExclusiveExecutionSlot(async () => {
      const page = await this.getNewPage();
      try {
        await this.navigateToPage(page, 'Normal search');
        await page.waitForSelector('#form\\:schlagwoerter', { timeout: 5000 });

        // 1) Type of Register (e.g. HRB, HRA, etc.)
        await page.waitForSelector('#form\\:registerArt_label');
        await page.click('#form\\:registerArt_label');
        await page.waitForSelector('#form\\:registerArt_items');
        await page.evaluate((type) => {
          const options = Array.from(document.querySelectorAll('#form\\:registerArt_items li'));
          const targetOption = options.find((option) => option.textContent?.trim() === type);
          (targetOption as HTMLElement)?.click();
        }, companyArg.type);

        // 2) Register number
        await page.waitForSelector('#form\\:registerNummer');
        await page.type('#form\\:registerNummer', companyArg.number);

        // 3) Register court
        await page.waitForSelector('#form\\:registergericht_label');
        await page.click('#form\\:registergericht_label');
        await page.waitForSelector('#form\\:registergericht_items');
        await page.evaluate((court) => {
          const options = Array.from(document.querySelectorAll('#form\\:registergericht_items li'));
          const targetOption = options.find((option) => option.textContent?.trim() === court);
          (targetOption as HTMLElement)?.click();
        }, companyArg.court);

        // Click 'Find'
        await this.clickFindButton(page);

        // Grab the results; they are logged and returned
        const businessRecords = await this.waitForResults(page);
        console.log(businessRecords);

        // download files one after the other, the download folder is shared
        const files: plugins.smartfile.SmartFile[] = [];
        files.push(await this.downloadFile(page, 'SI'));
        files.push(await this.downloadFile(page, 'AD'));

        return {
          businessRecords,
          files,
        };
      } finally {
        // close the page on success and on failure
        await page.close();
      }
    }, 60000);
  }

  /**
   * Get a specific company by full name: searches by name and looks up the
   * first hit by its parsed registration.
   *
   * @throws Error when the search has no hit with a parsable registration
   */
  public async getSpecificCompanyByName(companyNameArg: string) {
    const businessRecords = await this.searchCompany(companyNameArg, 1);
    const registration = businessRecords[0]?.germanParsedRegistration;
    if (!registration) {
      throw new Error(`No company with a parsable registration found for "${companyNameArg}".`);
    }
    return this.getSpecificCompany(registration);
  }
}
|
111
ts/classes.jsonldata.ts
Normal file
111
ts/classes.jsonldata.ts
Normal file
@ -0,0 +1,111 @@
|
||||
import * as plugins from './plugins.js';
|
||||
import * as paths from './paths.js';
|
||||
import type { OpenData } from './classes.main.opendata.js';
|
||||
|
||||
// Boolean flags found under `all_attributes.additional_data`.
type SeedAdditionalData = {
  AD: boolean;
  CD: boolean;
  DK: boolean;
  HD: boolean;
  SI: boolean;
  UT: boolean;
  VÖ: boolean;
};

// The register related attributes of a seed entry.
type SeedAllAttributes = {
  _registerArt: string;
  _registerNummer: string;
  additional_data: SeedAdditionalData;
  federal_state: string;
  native_company_number: string;
  registered_office: string;
  registrar: string;
};

// One element of the `officers` list of a seed entry.
type SeedOfficer = {
  name: string;
  other_attributes: {
    city: string;
    firstname: string;
    flag: string;
    lastname: string;
  };
  position: string;
  start_date: string; // ISO 8601 date string
  type: string;
};

/**
 * Shape of a single entry (one JSONL line) of the seed data.
 */
export type SeedEntryType = {
  all_attributes: SeedAllAttributes;
  company_number: string;
  current_status: string;
  jurisdiction_code: string;
  name: string;
  officers: SeedOfficer[];
  registered_address: string;
  retrieved_at: string; // ISO 8601 date string
};
|
||||
|
||||
/**
 * Streams a compressed JSONL archive from a URL and hands every parsed
 * line to a caller supplied async function.
 */
export class JsonlDataProcessor<T> {
  // called once per successfully parsed entry
  public forEachFunction: (entryArg: T) => Promise<void>;

  constructor(forEachFunctionArg: typeof this.forEachFunction) {
    this.forEachFunction = forEachFunctionArg;
  }

  // TODO: define a mapper as argument instead of hard-coding it
  /**
   * Downloads the archive at `dataUrlArg`, splits its content into lines
   * and passes each parsed line to `forEachFunction`.
   *
   * Resolves when the final function of a contained file's stream has run;
   * rejects when one of the source streams emits an error.
   *
   * @param dataUrlArg URL of the archive, defaults to the offeneregister.de dump
   */
  public async processDataFromUrl(
    dataUrlArg = 'https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2'
  ) {
    const done = plugins.smartpromise.defer();
    // ensureDir is a no-op when the directory already exists
    await plugins.smartfile.fs.ensureDir(paths.germanBusinessDataDir);

    const smartarchive = await plugins.smartarchive.SmartArchive.fromArchiveUrl(dataUrlArg);
    const jsonlDataStream = await smartarchive.exportToStreamOfStreamFiles();
    let totalRecordsCounter = 0;
    // carries the incomplete last line of a chunk over to the next chunk
    let nextRest: string = '';

    // parses one JSONL line and forwards the entry to forEachFunction
    const processLine = async (line: string) => {
      if (!line) return;
      let entry: T;
      try {
        entry = JSON.parse(line);
      } catch (err) {
        // log the unparsable line and pause before continuing
        // NOTE(review): the 10s pause looks like a debugging aid - confirm it is still wanted
        console.log(line);
        await plugins.smartdelay.delayFor(10000);
      }
      if (!entry) return;
      totalRecordsCounter++;
      if (totalRecordsCounter % 10000 === 0)
        console.log(`${totalRecordsCounter} total records.`);
      await this.forEachFunction(entry);
    };

    // without these handlers a failing source stream left `done` pending forever
    jsonlDataStream.on('error', (err) => done.reject(err));
    jsonlDataStream.pipe(
      new plugins.smartstream.SmartDuplex({
        objectMode: true,
        writeFunction: async (chunkArg: plugins.smartfile.StreamFile, streamToolsArg) => {
          const readStream = await chunkArg.createReadStream();
          readStream.on('error', (err) => done.reject(err));
          readStream.pipe(
            new plugins.smartstream.SmartDuplex({
              objectMode: true,
              writeFunction: async (chunkArg: Buffer, streamToolsArg) => {
                const currentString = nextRest + chunkArg.toString();
                const lines = currentString.split('\n');
                nextRest = lines.pop();
                console.log(`Got another ${lines.length} records.`);
                // process the lines of this chunk with a concurrency of 1000
                const concurrentProcessor = new plugins.smartarray.ConcurrentProcessor<string>(
                  processLine,
                  1000
                );
                await concurrentProcessor.process(lines);
              },
              finalFunction: async (streamToolsArg) => {
                // the last line may come without a trailing newline; it used
                // to be parsed and then dropped, now it is processed as well
                const trailingLine = nextRest;
                nextRest = '';
                await processLine(trailingLine);
                console.log(`finished processing ${totalRecordsCounter} records.`);
                done.resolve();
              },
            })
          );
        },
      })
    );
    await done.promise;
  }
}
|
@ -1,25 +1,61 @@
|
||||
import { BusinessRecord } from './classes.businessrecord.js';
|
||||
import { GermanBusinessData } from './classes.germanbusinessdata.js';
|
||||
import { HandelsRegister } from './classes.handelsregister.js';
|
||||
import { JsonlDataProcessor, type SeedEntryType } from './classes.jsonldata.js';
|
||||
import * as paths from './paths.js';
|
||||
import * as plugins from './plugins.js';
|
||||
|
||||
/**
 * Main entry point: wires up the database connection, the JSONL seed data
 * processor and the Handelsregister browser automation.
 */
export class OpenData {
  public db: plugins.smartdata.SmartdataDb;
  // resolves the MONGODB_* settings on demand
  private serviceQenv = new plugins.qenv.Qenv(paths.packageDir, paths.nogitDir);

  public jsonLDataProcessor: JsonlDataProcessor<SeedEntryType>;
  public handelsregister: HandelsRegister;

  // the BusinessRecord document class with this instance set as its default manager
  public CBusinessRecord = plugins.smartdata.setDefaultManagerForDoc(this, BusinessRecord);

  /**
   * Connects to the database, prepares the seed data processor and starts
   * the Handelsregister integration.
   */
  public async start() {
    this.db = new plugins.smartdata.SmartdataDb({
      mongoDbUrl: await this.serviceQenv.getEnvVarOnDemand('MONGODB_URL'),
      mongoDbName: await this.serviceQenv.getEnvVarOnDemand('MONGODB_NAME'),
      mongoDbUser: await this.serviceQenv.getEnvVarOnDemand('MONGODB_USER'),
      mongoDbPass: await this.serviceQenv.getEnvVarOnDemand('MONGODB_PASS'),
    });
    await this.db.init();

    // every seed entry becomes one BusinessRecord with a fresh id
    this.jsonLDataProcessor = new JsonlDataProcessor(async (entryArg) => {
      const businessRecord = new this.CBusinessRecord();
      businessRecord.id = await this.CBusinessRecord.getNewId();
      businessRecord.data.name = entryArg.name;
      businessRecord.data.germanParsedRegistration = {
        // NOTE(review): the court is taken from `registered_office`; the seed
        // data also has a `registrar` field - confirm which one is the court
        court: entryArg.all_attributes.registered_office,
        number: entryArg.all_attributes._registerNummer,
        type: entryArg.all_attributes._registerArt as 'HRA' | 'HRB' | 'GnR' | 'PR' | 'VR' | 'GsR',
      };
      await businessRecord.save();
    });

    this.handelsregister = new HandelsRegister(this);
    await this.handelsregister.start();
  }

  /** Fills the database from the default JSONL seed data URL. */
  public async buildInitialDb() {
    await this.jsonLDataProcessor.processDataFromUrl();
  }

  /** Not implemented yet. */
  public async slowValidateDb() {}

  /** Not implemented yet. */
  public async validateSearchByName() {}

  /** Not implemented yet. */
  public async searchDbByBusinessNameAndPostalCode(businessNameArg: string, postalCodeArg: string) {}

  /** Closes the database connection and stops the Handelsregister integration. */
  public async stop() {
    try {
      await this.db.close();
    } finally {
      // stop the browser even if closing the database failed
      await this.handelsregister.stop();
    }
  }
}
|
@ -8,4 +8,8 @@ export const packageDir = plugins.path.join(
|
||||
// Directories below the package dir that hold local, non-versioned data.
export const nogitDir = plugins.path.join(packageDir, './.nogit/');
export const downloadDir = plugins.path.join(nogitDir, 'downloads');
export const germanBusinessDataDir = plugins.path.join(nogitDir, 'germanbusinessdata');

// Create the .nogit dir first, then the downloads dir inside it.
// germanBusinessDataDir is not created here.
for (const requiredDir of [nogitDir, downloadDir]) {
  plugins.smartfile.fs.ensureDirSync(requiredDir);
}
|
@ -6,8 +6,11 @@ export {
|
||||
}
|
||||
|
||||
// @push.rocks scope
|
||||
import * as lik from '@push.rocks/lik';
|
||||
import * as qenv from '@push.rocks/qenv';
|
||||
import * as smartarchive from '@push.rocks/smartarchive';
|
||||
import * as smartarray from '@push.rocks/smartarray';
|
||||
import * as smartbrowser from '@push.rocks/smartbrowser';
|
||||
import * as smartdata from '@push.rocks/smartdata';
|
||||
import * as smartdelay from '@push.rocks/smartdelay';
|
||||
import * as smartfile from '@push.rocks/smartfile';
|
||||
@ -15,10 +18,15 @@ import * as smartpath from '@push.rocks/smartpath';
|
||||
import * as smartpromise from '@push.rocks/smartpromise';
|
||||
import * as smartrequest from '@push.rocks/smartrequest';
|
||||
import * as smartstream from '@push.rocks/smartstream';
|
||||
import * as smartunique from '@push.rocks/smartunique';
|
||||
import * as smartxml from '@push.rocks/smartxml';
|
||||
|
||||
export {
|
||||
lik,
|
||||
qenv,
|
||||
smartarchive,
|
||||
smartarray,
|
||||
smartbrowser,
|
||||
smartdata,
|
||||
smartdelay,
|
||||
smartfile,
|
||||
@ -26,4 +34,13 @@ export {
|
||||
smartpromise,
|
||||
smartrequest,
|
||||
smartstream,
|
||||
smartunique,
|
||||
smartxml,
|
||||
}
|
||||
|
||||
// @tsclass scope
|
||||
import * as tsclass from '@tsclass/tsclass';
|
||||
|
||||
export {
|
||||
tsclass,
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user