Files
fee-schedules/scripts/generate-germany.mjs
T

243 lines
7.8 KiB
JavaScript

import { spawnSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// Filesystem layout. Everything is resolved against the package root, which is
// the parent of the directory containing this script.
const packageRoot = path.resolve(
  path.dirname(fileURLToPath(import.meta.url)),
  '..',
);
// Scratch directory for downloaded archives and their extracted contents.
const sourceRoot = path.join(packageRoot, '.nogit', 'fee-schedules-sources');
// Directory holding the source manifest (input) and the generated JSON (output).
const onlyGitDir = path.join(packageRoot, '.onlygit');
const manifestPath = path.join(onlyGitDir, 'fee-schedules.sources.json');
const outPath = path.join(onlyGitDir, 'fee-schedules.json');

// Date portion (YYYY-MM-DD) of the current UTC timestamp.
const generatedAt = new Date().toISOString().slice(0, 10);

// The manifest lists the sources to generate from, split into two groups.
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
const { federal: federalSources, external: externalSources } = manifest.generationSources;

// Make sure both working directories exist before anything is written.
for (const directory of [sourceRoot, onlyGitDir]) {
  fs.mkdirSync(directory, { recursive: true });
}
/**
 * Decodes character references in a text fragment.
 *
 * Recognises the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`
 * and `&nbsp;`, plus any decimal (`&#167;`) or hexadecimal (`&#xA7;`) numeric
 * reference. Unknown names and numbers outside the Unicode range are left
 * untouched. Non-breaking spaces are normalised to regular spaces.
 *
 * All references are resolved in a single pass so that already-decoded output
 * is never decoded a second time (e.g. `&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {string} input - Text that may contain character references.
 * @returns {string} The decoded text.
 */
const decodeEntities = (input) => {
  const namedEntities = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', '\u00a0'],
  ]);

  const decoded = input.replace(
    /&(#[xX][0-9a-fA-F]+|#\d+|[A-Za-z]+);/g,
    (reference, body) => {
      if (body[0] !== '#') {
        return namedEntities.get(body) ?? reference;
      }
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = isHex
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);
      // String.fromCodePoint throws on values above U+10FFFF, so keep those verbatim.
      return codePoint > 0 && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : reference;
    },
  );

  // Covers literal U+00A0 characters as well as decoded &nbsp; / &#160;.
  return decoded.replace(/\u00a0/g, ' ');
};
/**
 * Reduces an XML/HTML fragment to plain text.
 *
 * Markup is removed BEFORE entities are decoded. Decoding first would turn
 * escaped text such as `a &lt; b &gt; c` into `a < b > c`, and the tag-stripping
 * pattern would then delete `< b >` as if it were an element.
 *
 * @param {string} input - Fragment that may contain tags and character references.
 * @returns {string} Text with tags replaced by spaces, entities decoded,
 *   whitespace runs collapsed to one space, and both ends trimmed.
 */
const cleanXmlText = (input) => {
  const withoutMarkup = input
    .replace(/<BR\s*\/?\s*>/gi, ' ')
    .replace(/<[^>]+>/g, ' ');
  return decodeEntities(withoutMarkup)
    .replace(/\s+/g, ' ')
    .trim();
};
/**
 * Returns the cleaned text content of the first `<tag>…</tag>` element in `xml`.
 *
 * The opening tag may carry attributes. `tag` is inserted into the pattern
 * verbatim, so it must not contain regular-expression metacharacters.
 *
 * @param {string} xml - XML text to search.
 * @param {string} tag - Element name to look for.
 * @returns {string|undefined} Cleaned inner text, or `undefined` if no such element exists.
 */
const extractFirst = (xml, tag) => {
  const elementPattern = new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${tag}>`);
  const found = xml.match(elementPattern);
  if (!found) {
    return undefined;
  }
  const [, innerXml] = found;
  return cleanXmlText(innerXml);
};
/**
 * Tells whether a cell value has the shape of a fee code.
 *
 * Accepted shapes: 1–6 digits with an optional lowercase letter suffix, one
 * uppercase letter followed by 1–5 digits, or 1–3 uppercase letters followed by
 * an optional whitespace character and 1–5 digits.
 *
 * @param {string} value - Cell text to test.
 * @returns {boolean} `true` when the whole value matches one of the shapes.
 */
const isCode = (value) => {
  const codePattern = /^(?:\d{1,6}[a-z]?|[A-Z]\d{1,5}|[A-Z]{1,3}\s?\d{1,5})$/;
  return codePattern.test(value);
};
/**
 * Tells whether a cell value is purely numeric-looking.
 *
 * The value must contain at least one digit and consist only of digits,
 * whitespace, `.`, `,`, `€`, en dash and hyphen.
 *
 * @param {string} value - Cell text to test.
 * @returns {boolean} `true` when both conditions hold.
 */
const hasNumericValue = (value) => {
  if (!/\d/.test(value)) {
    return false;
  }
  return /^[\d\s.,€–\-]+$/.test(value);
};
/**
 * Parses the last number found in a string written with `.` as thousands
 * separator and `,` as decimal separator (e.g. `1.234,56`).
 *
 * When the string contains several numbers (such as a range), only the last
 * one is used.
 *
 * @param {string} value - Text that may contain one or more numbers.
 * @returns {number|undefined} The parsed number, or `undefined` if no digits are present.
 */
const parseAmount = (value) => {
  const candidates = value.match(/\d+(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?/g);
  if (candidates === null) {
    return undefined;
  }
  const [lastNumber] = candidates.slice(-1);
  // Drop thousands separators, then switch the decimal comma to a point.
  const normalized = lastNumber.split('.').join('').replace(',', '.');
  return Number(normalized);
};
/**
 * Classifies a table row according to the parsing mode of its source.
 *
 * - mode `'code'`: the row is a `'fee-entry'` when its first cell is a fee code.
 * - mode `'table'`: the row is a `'fee-table-row'` when at least two of its
 *   cells are numeric-looking.
 *
 * @param {string[]} cells - Cleaned cell texts of the row.
 * @param {string} mode - Row mode configured for the source.
 * @returns {'fee-entry'|'fee-table-row'|undefined} The row type, or
 *   `undefined` when the row should be ignored.
 */
const getRowType = (cells, mode) => {
  switch (mode) {
    case 'code':
      return isCode(cells[0]) ? 'fee-entry' : undefined;
    case 'table': {
      const isValueRow = cells.length >= 2 && cells.filter(hasNumericValue).length >= 2;
      return isValueRow ? 'fee-table-row' : undefined;
    }
    default:
      return undefined;
  }
};
/**
 * Downloads the XML archive of a federal source and extracts it.
 *
 * The archive is fetched from
 * `https://www.gesetze-im-internet.de/<sourcePath>/xml.zip`, stored as
 * `<sourceRoot>/<sourcePath>.zip` and unpacked into `<sourceRoot>/<sourcePath>/`
 * with `/usr/bin/unzip` (existing files are overwritten).
 *
 * @param {{ sourcePath: string }} source - Manifest entry of the federal source.
 * @returns {Promise<string>} Absolute path of the single extracted `.xml` file.
 * @throws {Error} If the HTTP response is not OK, if `unzip` cannot be started,
 *   is terminated or exits with a non-zero status, or if the extraction
 *   directory does not contain exactly one `.xml` file.
 */
const downloadFederalXml = async (source) => {
  const zipUrl = `https://www.gesetze-im-internet.de/${source.sourcePath}/xml.zip`;
  const zipPath = path.join(sourceRoot, `${source.sourcePath}.zip`);
  const extractDir = path.join(sourceRoot, source.sourcePath);
  fs.mkdirSync(extractDir, { recursive: true });

  const response = await fetch(zipUrl);
  if (!response.ok) {
    throw new Error(`Failed to download ${zipUrl}: ${response.status} ${response.statusText}`);
  }
  fs.writeFileSync(zipPath, Buffer.from(await response.arrayBuffer()));

  const unzip = spawnSync('/usr/bin/unzip', ['-o', '-q', zipPath, '-d', extractDir], {
    stdio: 'pipe',
    encoding: 'utf8',
  });
  // spawnSync reports a failure to start the process (e.g. missing binary) via
  // `error`; `status` is null and both output streams are empty in that case.
  if (unzip.error) {
    throw new Error(`Failed to unzip ${zipPath}: ${unzip.error.message}`, { cause: unzip.error });
  }
  if (unzip.status !== 0) {
    // A process killed by a signal has a null status and possibly no output.
    const detail = unzip.stderr || unzip.stdout || `terminated by signal ${unzip.signal}`;
    throw new Error(`Failed to unzip ${zipPath}: ${detail}`);
  }

  const xmlFiles = fs.readdirSync(extractDir).filter((entry) => entry.endsWith('.xml'));
  if (xmlFiles.length !== 1) {
    throw new Error(`Expected one XML file for ${source.sourcePath}, found ${xmlFiles.length}`);
  }
  return path.join(extractDir, xmlFiles[0]);
};
/**
 * Reads document-level metadata from the XML of the first `<norm>` element.
 *
 * @param {string} firstNorm - XML text of the first norm (may be empty).
 * @returns {{
 *   officialAbbreviation: (string|undefined),
 *   legalAbbreviation: (string|undefined),
 *   title: (string|undefined),
 *   issuedAt: (string|undefined),
 *   statusNotes: string[],
 * }} Text of the `amtabk`, `jurabk`, `langue` and `ausfertigung-datum`
 *   elements, plus the cleaned text of every `standkommentar` element.
 */
const getMetadata = (firstNorm) => {
  const statusNotes = [];
  const commentPattern = /<standkommentar>([\s\S]*?)<\/standkommentar>/g;
  for (const [, comment] of firstNorm.matchAll(commentPattern)) {
    statusNotes.push(cleanXmlText(comment));
  }

  const officialAbbreviation = extractFirst(firstNorm, 'amtabk');
  const legalAbbreviation = extractFirst(firstNorm, 'jurabk');
  const title = extractFirst(firstNorm, 'langue');
  const issuedAt = extractFirst(firstNorm, 'ausfertigung-datum');

  return {
    officialAbbreviation,
    legalAbbreviation,
    title,
    issuedAt,
    statusNotes,
  };
};
/**
 * Builds one rule section per `<norm>` that has non-empty `<Content>` text.
 *
 * Section ids are numbered by the norm's position among all matched norms, so
 * numbers of norms without text are skipped rather than reused.
 *
 * @param {RegExpMatchArray[]} norms - Norm matches; group 1 is the `doknr`, group 2 the inner XML.
 * @param {{ scheduleId: string }} source - Manifest entry of the source.
 * @param {string|undefined} officialAbbreviation - Fallback reference when a norm has no `enbez`.
 * @returns {object[]} Rule sections in document order.
 */
const collectRuleSections = (norms, source, officialAbbreviation) => norms
  .map((match, index) => {
    const sourceNormId = match[1];
    const normXml = match[2];
    const reference = extractFirst(normXml, 'enbez') || officialAbbreviation || source.scheduleId;
    const title = extractFirst(normXml, 'titel');
    const contentMatch = normXml.match(/<Content>([\s\S]*?)<\/Content>/);
    const text = contentMatch ? cleanXmlText(contentMatch[1]) : '';
    return {
      id: `${source.scheduleId}-section-${index + 1}`,
      scheduleId: source.scheduleId,
      sourceNormId,
      reference,
      title,
      text,
    };
  })
  .filter((section) => section.text);

/**
 * Collects the fee rows from every `<row>` element of the document.
 *
 * Empty cells are dropped before a row is classified, rows with fewer than two
 * remaining cells are ignored, and each kept row is tagged with the `doknr` of
 * the norm whose span contains it (or `undefined` if it lies outside all norms).
 *
 * @param {string} xml - Full XML text of the document.
 * @param {RegExpMatchArray[]} norms - Norm matches in document order (with `index`).
 * @param {{ scheduleId: string, rowMode: string }} source - Manifest entry of the source.
 * @returns {object[]} Fee rows in document order, numbered from 1.
 */
const collectFeeRows = (xml, norms, source) => {
  const feeRows = [];
  // Rows and norms are both in document order, so a forward-only cursor suffices.
  let normCursor = 0;
  for (const rowMatch of xml.matchAll(/<row[\s\S]*?<\/row>/g)) {
    const cells = [...rowMatch[0].matchAll(/<entry[^>]*>([\s\S]*?)<\/entry>/g)]
      .map((entryMatch) => cleanXmlText(entryMatch[1]))
      .filter(Boolean);
    if (cells.length < 2) {
      continue;
    }
    const rowType = getRowType(cells, source.rowMode);
    if (!rowType) {
      continue;
    }

    // Skip norms that end before this row starts.
    while (
      normCursor < norms.length
      && norms[normCursor].index + norms[normCursor][0].length <= rowMatch.index
    ) {
      normCursor += 1;
    }
    const candidateNorm = norms[normCursor];
    const sourceNormId = candidateNorm && candidateNorm.index <= rowMatch.index
      ? candidateNorm[1]
      : undefined;

    const isFeeEntry = rowType === 'fee-entry';
    const rowIndex = feeRows.length + 1;
    const thirdCell = cells[2];
    const lastCell = cells.at(-1) || '';
    // Points are only taken from a third cell that consists solely of digits.
    const points = isFeeEntry && thirdCell && /^\d+$/.test(thirdCell)
      ? Number(thirdCell)
      : undefined;
    // An amount is read from the last cell when it carries a euro sign, or when
    // a fee entry has at least four cells.
    const amountEur = lastCell.includes('€') || (isFeeEntry && cells.length >= 4)
      ? parseAmount(lastCell)
      : undefined;

    feeRows.push({
      id: `${source.scheduleId}-row-${rowIndex}`,
      scheduleId: source.scheduleId,
      rowType,
      rowIndex,
      code: isFeeEntry ? cells[0] : undefined,
      description: isFeeEntry ? cells[1] : undefined,
      cells,
      points,
      amountEur,
      sourceNormId,
    });
  }
  return feeRows;
};

/**
 * Downloads a federal source and converts its XML into schedule data.
 *
 * @param {{ scheduleId: string, sourcePath: string, rowMode: string }} source -
 *   Manifest entry of the federal source.
 * @returns {Promise<object>} Schedule data with `scheduleId`, `dataStatus`,
 *   `edition`, `source` (provenance and document metadata), `feeRows`,
 *   `ruleSections` and `notes`. `dataStatus` and `notes` depend on whether any
 *   fee rows were found.
 * @throws {Error} Whatever `downloadFederalXml` or reading the XML file throws.
 */
const parseFederalSource = async (source) => {
  const xmlPath = await downloadFederalXml(source);
  const xml = fs.readFileSync(xmlPath, 'utf8');
  const sourceFileName = path.basename(xmlPath);

  // Document-level metadata is read from the first norm of the file.
  const firstNorm = xml.match(/<norm[\s\S]*?<\/norm>/)?.[0] || '';
  const metadata = getMetadata(firstNorm);

  const norms = [...xml.matchAll(/<norm\b[^>]*doknr="([^"]+)"[^>]*>([\s\S]*?)<\/norm>/g)];
  const ruleSections = collectRuleSections(norms, source, metadata.officialAbbreviation);
  const feeRows = collectFeeRows(xml, norms, source);

  return {
    scheduleId: source.scheduleId,
    dataStatus: feeRows.length ? 'federal-law-fee-data' : 'federal-law-rules-only',
    edition: metadata.statusNotes.join(' ') || `Retrieved ${generatedAt}`,
    source: {
      name: 'Gesetze im Internet',
      url: `https://www.gesetze-im-internet.de/${source.sourcePath}/xml.zip`,
      pageUrl: `https://www.gesetze-im-internet.de/${source.sourcePath}/`,
      retrievedAt: generatedAt,
      sourceFileName,
      officialAbbreviation: metadata.officialAbbreviation,
      legalAbbreviation: metadata.legalAbbreviation,
      title: metadata.title,
      issuedAt: metadata.issuedAt,
      statusNotes: metadata.statusNotes,
    },
    feeRows,
    ruleSections,
    notes: feeRows.length
      ? ['Generated from official federal XML table rows. Original row cells are preserved in order.']
      : ['No separate fee table rows were present in the federal XML. Fee rules are represented as rule sections.'],
  };
};
/**
 * Builds the placeholder schedule data for a source that is not generated
 * from federal XML.
 *
 * No rows or sections are produced; the entry only records where the source
 * lives and carries the manifest's note.
 *
 * @param {{ scheduleId: string, sourceName: string, sourceUrl: string, note: string }} source -
 *   Manifest entry of the external source.
 * @returns {object} Schedule data with status `'external-source-pending'`.
 */
const getExternalSourceData = (source) => {
  const { scheduleId, sourceName, sourceUrl, note } = source;
  const provenance = {
    name: sourceName,
    url: sourceUrl,
    retrievedAt: generatedAt,
  };
  return {
    scheduleId,
    dataStatus: 'external-source-pending',
    edition: `Source identified ${generatedAt}`,
    source: provenance,
    feeRows: [],
    ruleSections: [],
    notes: [note],
  };
};
// Collect schedule data for every source listed in the manifest: federal
// sources first (downloaded and parsed one after another), then external ones.
const scheduleData = [];
for (const federalSource of federalSources) {
  const parsed = await parseFederalSource(federalSource);
  scheduleData.push(parsed);
  const { dataStatus, feeRows, ruleSections } = parsed;
  console.log(`${federalSource.fileName}: ${dataStatus}, rows=${feeRows.length}, sections=${ruleSections.length}`);
}
for (const externalSource of externalSources) {
  const pending = getExternalSourceData(externalSource);
  scheduleData.push(pending);
  console.log(`${externalSource.fileName}: ${pending.dataStatus}, rows=0, sections=0`);
}

// Write the combined payload as pretty-printed JSON with a trailing newline.
const payload = {
  schemaVersion: 1,
  generatedAt,
  catalogs: manifest.catalogs,
  scheduleData,
};
const serializedPayload = JSON.stringify(payload, null, 2);
fs.writeFileSync(outPath, `${serializedPayload}\n`);
console.log(`wrote ${path.relative(packageRoot, outPath)}`);