243 lines
7.8 KiB
JavaScript
243 lines
7.8 KiB
JavaScript
import { spawnSync } from 'node:child_process';
|
|
import fs from 'node:fs';
|
|
import path from 'node:path';
|
|
import { fileURLToPath } from 'node:url';
|
|
|
|
// Directory layout, resolved relative to the parent of this script's folder.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const packageRoot = path.resolve(scriptDir, '..');
const sourceRoot = path.join(packageRoot, '.nogit', 'fee-schedules-sources');
const onlyGitDir = path.join(packageRoot, '.onlygit');
const manifestPath = path.join(onlyGitDir, 'fee-schedules.sources.json');
const outPath = path.join(onlyGitDir, 'fee-schedules.json');

// Calendar date (YYYY-MM-DD) taken from the UTC ISO timestamp of this run.
const generatedAt = new Date().toISOString().slice(0, 10);

// The manifest lists the federal and external sources to generate from.
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
const { federal: federalSources, external: externalSources } = manifest.generationSources;

// Ensure the download cache and the output directory exist.
for (const directory of [sourceRoot, onlyGitDir]) {
  fs.mkdirSync(directory, { recursive: true });
}
|
|
|
|
// Named character references decoded by decodeEntities. A non-breaking space
// becomes a regular space.
const namedEntityReplacements = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['sect', '§'],
  ['Auml', 'Ä'],
  ['Ouml', 'Ö'],
  ['Uuml', 'Ü'],
  ['auml', 'ä'],
  ['ouml', 'ö'],
  ['uuml', 'ü'],
  ['szlig', 'ß'],
]);

/**
 * Decodes character references in a string.
 *
 * Handles the named entities listed in `namedEntityReplacements` plus decimal
 * (`&#160;`) and hexadecimal (`&#xA0;`) numeric references. Unknown names and
 * invalid code points are left untouched.
 *
 * All references are replaced in a single pass, so text produced by one
 * replacement is never decoded a second time (`&amp;lt;` yields `&lt;`, not `<`).
 *
 * @param {string} input - Text that may contain character references.
 * @returns {string} The text with recognised references decoded.
 */
const decodeEntities = (input) => input.replace(
  /&(#[xX][0-9a-fA-F]+|#\d+|[A-Za-z][A-Za-z0-9]*);/g,
  (reference, body) => {
    if (!body.startsWith('#')) {
      return namedEntityReplacements.get(body) ?? reference;
    }

    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);

    // Keep numeric non-breaking spaces consistent with `&nbsp;`.
    if (codePoint === 0xa0) {
      return ' ';
    }

    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
    return isValid ? String.fromCodePoint(codePoint) : reference;
  },
);
|
|
|
|
/**
 * Reduces an XML fragment to plain, single-spaced text.
 *
 * Markup is removed BEFORE entities are decoded. Decoding first would turn
 * escaped text such as `&lt;` / `&gt;` into real angle brackets, which the tag
 * stripper would then delete as if they were markup.
 *
 * @param {string} input - Raw XML fragment.
 * @returns {string} Text with tags removed, entities decoded, whitespace runs
 *   collapsed to one space, and leading/trailing whitespace trimmed.
 */
const cleanXmlText = (input) => {
  const withoutMarkup = input
    .replace(/<BR\s*\/?\s*>/gi, ' ')
    .replace(/<[^>]+>/g, ' ');

  return decodeEntities(withoutMarkup)
    .replace(/\s+/g, ' ')
    .trim();
};
|
|
|
|
/**
 * Finds the first `<tag …>…</tag>` element in an XML string and returns its
 * cleaned text content.
 *
 * @param {string} xml - XML to search.
 * @param {string} tag - Element name; it is interpolated into the pattern as-is.
 * @returns {string|undefined} Cleaned inner text, or `undefined` when no such
 *   element is found.
 */
const extractFirst = (xml, tag) => {
  const elementPattern = new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${tag}>`);
  const found = xml.match(elementPattern);

  if (!found) {
    return undefined;
  }

  const [, innerXml] = found;
  return cleanXmlText(innerXml);
};
|
|
|
|
/**
 * Tests whether a cell value has the shape of a fee code: up to six digits with
 * an optional lowercase letter suffix, or one to three uppercase letters
 * followed (optionally after one whitespace character) by one to five digits.
 *
 * @param {string} value - Cell text to test.
 * @returns {boolean} `true` when the whole value matches one of those shapes.
 */
const isCode = (value) => {
  const codePattern = /^(?:\d{1,6}[a-z]?|[A-Z]\d{1,5}|[A-Z]{1,3}\s?\d{1,5})$/;
  return codePattern.test(value);
};
|
|
/**
 * Tests whether a cell value is purely numeric-looking: it must contain at
 * least one digit and consist only of digits, whitespace, `.`, `,`, `€`,
 * en dash and hyphen.
 *
 * @param {string} value - Cell text to test.
 * @returns {boolean} `true` when both conditions hold.
 */
const hasNumericValue = (value) => {
  if (!/\d/.test(value)) {
    return false;
  }

  return /^[\d\s.,€–\-]+$/.test(value);
};
|
|
|
|
// A number with optional `.`-separated three-digit groups and an optional
// `,`-separated fraction, e.g. `1.234,56`.
const amountPattern = /\d+(?:\.\d{3})*(?:,\d+)?/g;

/**
 * Parses the last number found in a string, reading `.` as a thousands
 * separator and `,` as the decimal separator.
 *
 * When the text contains several numbers (for example a range), the last one
 * wins.
 *
 * @param {string} value - Text containing an amount.
 * @returns {number|undefined} The parsed number, or `undefined` when the text
 *   contains no digits.
 */
const parseAmount = (value) => {
  const candidates = value.match(amountPattern);
  if (!candidates) {
    return undefined;
  }

  const numeric = candidates.at(-1);
  return Number(numeric.replaceAll('.', '').replace(',', '.'));
};
|
|
|
|
/**
 * Classifies a table row according to the parsing mode.
 *
 * @param {string[]} cells - Cleaned cell texts of the row.
 * @param {string} mode - `'code'` or `'table'`; any other value never matches.
 * @returns {'fee-entry'|'fee-table-row'|undefined} `'fee-entry'` in code mode
 *   when the first cell looks like a code, `'fee-table-row'` in table mode when
 *   at least two cells are numeric, otherwise `undefined`.
 */
const getRowType = (cells, mode) => {
  switch (mode) {
    case 'code':
      return isCode(cells[0]) ? 'fee-entry' : undefined;

    case 'table': {
      if (cells.length < 2) {
        return undefined;
      }

      const numericCells = cells.filter(hasNumericValue);
      return numericCells.length >= 2 ? 'fee-table-row' : undefined;
    }

    default:
      return undefined;
  }
};
|
|
|
|
/**
 * Downloads the `xml.zip` archive of a federal source from
 * gesetze-im-internet.de, extracts it below `sourceRoot`, and returns the path
 * of the single XML file in the extraction directory.
 *
 * @param {{ sourcePath: string }} source - Manifest entry; `sourcePath` is used
 *   in the download URL and in the local file names.
 * @returns {Promise<string>} Path of the extracted XML file.
 * @throws {Error} When the download fails, `unzip` cannot be run or exits
 *   unsuccessfully, or the extraction directory does not contain exactly one
 *   `.xml` file.
 */
const downloadFederalXml = async (source) => {
  const zipUrl = `https://www.gesetze-im-internet.de/${source.sourcePath}/xml.zip`;
  const zipPath = path.join(sourceRoot, `${source.sourcePath}.zip`);
  const extractDir = path.join(sourceRoot, source.sourcePath);

  fs.mkdirSync(extractDir, { recursive: true });

  const response = await fetch(zipUrl);
  if (!response.ok) {
    throw new Error(`Failed to download ${zipUrl}: ${response.status} ${response.statusText}`);
  }

  fs.writeFileSync(zipPath, Buffer.from(await response.arrayBuffer()));

  // -o overwrites existing files without prompting, -q keeps the output quiet.
  const unzip = spawnSync('/usr/bin/unzip', ['-o', '-q', zipPath, '-d', extractDir], {
    stdio: 'pipe',
    encoding: 'utf8',
  });

  // When the process could not be spawned at all (e.g. the binary is missing),
  // spawnSync reports it through `error`; surface that instead of an empty
  // stderr/stdout.
  if (unzip.error) {
    throw new Error(`Failed to unzip ${zipPath}: ${unzip.error.message}`, { cause: unzip.error });
  }

  if (unzip.status !== 0) {
    const fallbackDetail = unzip.signal
      ? `terminated by signal ${unzip.signal}`
      : `exit status ${unzip.status}`;
    throw new Error(`Failed to unzip ${zipPath}: ${unzip.stderr || unzip.stdout || fallbackDetail}`);
  }

  const xmlFiles = fs.readdirSync(extractDir).filter((entry) => entry.endsWith('.xml'));
  if (xmlFiles.length !== 1) {
    throw new Error(`Expected one XML file for ${source.sourcePath}, found ${xmlFiles.length}`);
  }

  return path.join(extractDir, xmlFiles[0]);
};
|
|
|
|
/**
 * Collects metadata fields from the XML of a `<norm>` element.
 *
 * @param {string} firstNorm - XML of the norm to read (may be empty).
 * @returns {{
 *   officialAbbreviation: (string|undefined),
 *   legalAbbreviation: (string|undefined),
 *   title: (string|undefined),
 *   issuedAt: (string|undefined),
 *   statusNotes: string[],
 * }} Cleaned text of the `amtabk`, `jurabk`, `langue` and `ausfertigung-datum`
 *   elements, plus the text of every `standkommentar` element.
 */
const getMetadata = (firstNorm) => {
  const statusCommentPattern = /<standkommentar>([\s\S]*?)<\/standkommentar>/g;
  const statusNotes = Array.from(
    firstNorm.matchAll(statusCommentPattern),
    ([, rawNote]) => cleanXmlText(rawNote),
  );

  const readField = (tag) => extractFirst(firstNorm, tag);

  return {
    officialAbbreviation: readField('amtabk'),
    legalAbbreviation: readField('jurabk'),
    title: readField('langue'),
    issuedAt: readField('ausfertigung-datum'),
    statusNotes,
  };
};
|
|
|
|
/**
 * Downloads one federal source and converts its XML into schedule data.
 *
 * Every `<norm>` with a `doknr` attribute and non-empty `<Content>` text
 * becomes a rule section. Every `<row>` that `getRowType` classifies for the
 * source's `rowMode` becomes a fee row, with its non-empty cells kept in order.
 *
 * @param {{ scheduleId: string, sourcePath: string, rowMode: string }} source -
 *   Manifest entry of the federal source.
 * @returns {Promise<object>} Schedule data containing `feeRows`,
 *   `ruleSections`, source metadata and notes.
 */
const parseFederalSource = async (source) => {
  const xmlPath = await downloadFederalXml(source);
  const xml = fs.readFileSync(xmlPath, 'utf8');
  const sourceFileName = path.basename(xmlPath);

  // Document-level metadata is read from the first norm in the file.
  const firstNormMatch = xml.match(/<norm[\s\S]*?<\/norm>/);
  const metadata = getMetadata(firstNormMatch?.[0] || '');
  const {
    officialAbbreviation,
    legalAbbreviation,
    title: documentTitle,
    issuedAt,
    statusNotes,
  } = metadata;

  const ruleSections = [];
  let normNumber = 0;

  for (const [, sourceNormId, normXml] of xml.matchAll(/<norm\b[^>]*doknr="([^"]+)"[^>]*>([\s\S]*?)<\/norm>/g)) {
    // Section ids count every matched norm, including ones dropped below.
    normNumber += 1;

    const reference = extractFirst(normXml, 'enbez') || officialAbbreviation || source.scheduleId;
    const title = extractFirst(normXml, 'titel');
    const contentMatch = normXml.match(/<Content>([\s\S]*?)<\/Content>/);
    const text = contentMatch ? cleanXmlText(contentMatch[1]) : '';

    if (!text) {
      continue;
    }

    ruleSections.push({
      id: `${source.scheduleId}-section-${normNumber}`,
      scheduleId: source.scheduleId,
      sourceNormId,
      reference,
      title,
      text,
    });
  }

  const feeRows = [];

  for (const [rowXml] of xml.matchAll(/<row[\s\S]*?<\/row>/g)) {
    const cells = Array.from(
      rowXml.matchAll(/<entry[^>]*>([\s\S]*?)<\/entry>/g),
      ([, cellXml]) => cleanXmlText(cellXml),
    ).filter(Boolean);

    const rowType = cells.length >= 2 ? getRowType(cells, source.rowMode) : undefined;
    if (!rowType) {
      continue;
    }

    // Row numbers count accepted rows only, starting at 1.
    const rowIndex = feeRows.length + 1;
    const isFeeEntry = rowType === 'fee-entry';
    const [firstCell, secondCell, thirdCell] = cells;
    const lastCell = cells.at(-1) || '';

    // Points are taken from the third cell of a fee entry when it is all digits.
    let points;
    if (isFeeEntry && thirdCell && /^\d+$/.test(thirdCell)) {
      points = Number(thirdCell);
    }

    // An amount is parsed from the last cell when it carries a euro sign, or
    // for fee entries with at least four cells.
    const hasAmountCell = lastCell.includes('€') || (isFeeEntry && cells.length >= 4);

    feeRows.push({
      id: `${source.scheduleId}-row-${rowIndex}`,
      scheduleId: source.scheduleId,
      rowType,
      rowIndex,
      code: isFeeEntry ? firstCell : undefined,
      description: isFeeEntry ? secondCell : undefined,
      cells,
      points,
      amountEur: hasAmountCell ? parseAmount(lastCell) : undefined,
      sourceNormId: undefined,
    });
  }

  const hasFeeRows = feeRows.length > 0;
  const pageUrl = `https://www.gesetze-im-internet.de/${source.sourcePath}/`;

  return {
    scheduleId: source.scheduleId,
    dataStatus: hasFeeRows ? 'federal-law-fee-data' : 'federal-law-rules-only',
    edition: statusNotes.join(' ') || `Retrieved ${generatedAt}`,
    source: {
      name: 'Gesetze im Internet',
      url: `${pageUrl}xml.zip`,
      pageUrl,
      retrievedAt: generatedAt,
      sourceFileName,
      officialAbbreviation,
      legalAbbreviation,
      title: documentTitle,
      issuedAt,
      statusNotes,
    },
    feeRows,
    ruleSections,
    notes: hasFeeRows
      ? ['Generated from official federal XML table rows. Original row cells are preserved in order.']
      : ['No separate fee table rows were present in the federal XML. Fee rules are represented as rule sections.'],
  };
};
|
|
|
|
/**
 * Builds the placeholder schedule data for an external source: status
 * `'external-source-pending'`, no fee rows, no rule sections, and the manifest
 * note as the only note.
 *
 * @param {{ scheduleId: string, sourceName: string, sourceUrl: string, note: string }} source -
 *   Manifest entry of the external source.
 * @returns {object} Schedule data for that source.
 */
const getExternalSourceData = (source) => {
  const { scheduleId, sourceName, sourceUrl, note } = source;

  const sourceInfo = {
    name: sourceName,
    url: sourceUrl,
    retrievedAt: generatedAt,
  };

  return {
    scheduleId,
    dataStatus: 'external-source-pending',
    edition: `Source identified ${generatedAt}`,
    source: sourceInfo,
    feeRows: [],
    ruleSections: [],
    notes: [note],
  };
};
|
|
|
|
// Collected schedule data: federal sources first, then external sources.
const scheduleData = [];

// Federal sources are processed one after another; each is downloaded and parsed.
for (const federalSource of federalSources) {
  const parsed = await parseFederalSource(federalSource);
  const { dataStatus, feeRows, ruleSections } = parsed;

  scheduleData.push(parsed);
  console.log(`${federalSource.fileName}: ${dataStatus}, rows=${feeRows.length}, sections=${ruleSections.length}`);
}

// External sources only get a placeholder entry.
for (const externalSource of externalSources) {
  const placeholder = getExternalSourceData(externalSource);

  scheduleData.push(placeholder);
  console.log(`${externalSource.fileName}: ${placeholder.dataStatus}, rows=0, sections=0`);
}

const payload = {
  schemaVersion: 1,
  generatedAt,
  catalogs: manifest.catalogs,
  scheduleData,
};

// Pretty-printed JSON with a trailing newline.
const serializedPayload = `${JSON.stringify(payload, null, 2)}\n`;
fs.writeFileSync(outPath, serializedPayload);
console.log(`wrote ${path.relative(packageRoot, outPath)}`);
|