import { spawnSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

/*
 * Fee schedule generator.
 *
 * Reads `.onlygit/fee-schedules.sources.json`, downloads the XML archive of
 * every federal source listed there from gesetze-im-internet.de, extracts rule
 * sections and table rows with regular expressions, and writes the combined
 * result to `.onlygit/fee-schedules.json`. External sources from the manifest
 * are emitted as placeholders without any parsed data.
 *
 * NOTE(review): the element names matched below (`norm`, `Content`, `row`,
 * `entry`, `standkommentar`, ...) are assumed to follow the XML layout used by
 * gesetze-im-internet.de — confirm against a downloaded file.
 */

const packageRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
const sourceRoot = path.join(packageRoot, '.nogit', 'fee-schedules-sources');
const onlyGitDir = path.join(packageRoot, '.onlygit');
const manifestPath = path.join(onlyGitDir, 'fee-schedules.sources.json');
const outPath = path.join(onlyGitDir, 'fee-schedules.json');

// Date-only stamp (YYYY-MM-DD, UTC) shared by every record of this run.
const generatedAt = new Date().toISOString().slice(0, 10);

const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
const federalSources = manifest.generationSources.federal;
const externalSources = manifest.generationSources.external;

fs.mkdirSync(sourceRoot, { recursive: true });
fs.mkdirSync(onlyGitDir, { recursive: true });

// Named character references that are decoded; any other name is left as-is.
const NAMED_ENTITIES = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['sect', '§'],
  ['Auml', 'Ä'],
  ['Ouml', 'Ö'],
  ['Uuml', 'Ü'],
  ['auml', 'ä'],
  ['ouml', 'ö'],
  ['uuml', 'ü'],
  ['szlig', 'ß'],
]);

// Matches decimal (`&#160;`), hexadecimal (`&#xA0;`) and named (`&amp;`) references.
const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/g;

/**
 * Decodes character references in a single pass.
 *
 * A single pass matters: decoding `&amp;` in an earlier step than `&lt;`
 * would turn the escaped text `&amp;lt;` into `<` instead of `&lt;`.
 *
 * @param {string} input - Text that may contain character references.
 * @returns {string} The text with known references replaced; unknown names and
 *   out-of-range numeric references are returned unchanged.
 */
const decodeEntities = (input) =>
  input.replace(ENTITY_PATTERN, (entity, decimal, hex, name) => {
    if (name !== undefined) {
      return NAMED_ENTITIES.get(name) ?? entity;
    }
    const codePoint = decimal !== undefined
      ? Number.parseInt(decimal, 10)
      : Number.parseInt(hex, 16);
    if (codePoint === 0xa0) {
      // Non-breaking spaces are normalised to plain spaces, like `&nbsp;`.
      return ' ';
    }
    if (!Number.isSafeInteger(codePoint) || codePoint > 0x10ffff) {
      // String.fromCodePoint would throw a RangeError here.
      return entity;
    }
    return String.fromCodePoint(codePoint);
  });

/**
 * Reduces an XML fragment to plain text.
 *
 * Tags are removed BEFORE entities are decoded so that an escaped `&lt;...&gt;`
 * pair in the text is not mistaken for markup and deleted. Every tag
 * (including line-break elements) becomes a space; whitespace runs are then
 * collapsed and the result is trimmed.
 *
 * @param {string} input - XML fragment.
 * @returns {string} Plain text with single spaces between words.
 */
const cleanXmlText = (input) =>
  decodeEntities(input.replace(/<[^>]+>/g, ' '))
    .replace(/\s+/g, ' ')
    .trim();

/**
 * Returns the cleaned text of the first `<tag>...</tag>` element in `xml`.
 *
 * @param {string} xml - XML text to search.
 * @param {string} tag - Element name; interpolated into a regular expression,
 *   so it must not contain regex metacharacters.
 * @returns {string|undefined} Cleaned element text, or `undefined` when the
 *   element does not occur.
 */
const extractFirst = (xml, tag) => {
  const match = xml.match(new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${tag}>`));
  return match ? cleanXmlText(match[1]) : undefined;
};

/**
 * Tests whether a cell value has the shape of a fee code: up to six digits
 * with an optional lowercase letter, one uppercase letter plus digits, or one
 * to three uppercase letters, an optional space, and digits.
 *
 * @param {string} value - Cleaned cell text.
 * @returns {boolean}
 */
const isCode = (value) =>
  /^(?:\d{1,6}[a-z]?|[A-Z]\d{1,5}|[A-Z]{1,3}\s?\d{1,5})$/.test(value);

/**
 * Tests whether a cell consists only of digits, whitespace, `.`, `,`, `€`,
 * en dash or hyphen, and contains at least one digit.
 *
 * @param {string} value - Cleaned cell text.
 * @returns {boolean}
 */
const hasNumericValue = (value) => /\d/.test(value) && /^[\d\s.,€–\-]+$/.test(value);

/**
 * Parses the LAST number found in `value`, reading `.` as thousands separator
 * and `,` as decimal separator (e.g. `"1.234,56 €"` yields `1234.56`).
 *
 * @param {string} value - Cell text.
 * @returns {number|undefined} The parsed number, or `undefined` when the text
 *   contains no digits.
 */
const parseAmount = (value) => {
  const numeric = value.match(/\d+(?:\.\d{3})*(?:,\d+)?/g)?.at(-1);
  if (!numeric) {
    return undefined;
  }
  return Number(numeric.replace(/\./g, '').replace(',', '.'));
};

/**
 * Classifies a table row according to the parsing mode of its source.
 *
 * @param {string[]} cells - Non-empty cleaned cell texts of the row.
 * @param {string} mode - `'code'`: the row counts when its first cell looks
 *   like a fee code. `'table'`: the row counts when at least two cells are
 *   numeric.
 * @returns {'fee-entry'|'fee-table-row'|undefined} Row type, or `undefined`
 *   when the row should be skipped.
 */
const getRowType = (cells, mode) => {
  if (mode === 'code' && isCode(cells[0])) {
    return 'fee-entry';
  }
  if (mode === 'table' && cells.length >= 2 && cells.filter(hasNumericValue).length >= 2) {
    return 'fee-table-row';
  }
  return undefined;
};

/**
 * Downloads and unpacks the XML archive of one federal source.
 *
 * The archive is stored as `<sourceRoot>/<sourcePath>.zip` and extracted with
 * `/usr/bin/unzip` into `<sourceRoot>/<sourcePath>/`.
 *
 * @param {{ sourcePath: string }} source - Manifest entry of the source.
 * @returns {Promise<string>} Absolute path of the single extracted XML file.
 * @throws {Error} When the download fails, `unzip` cannot be started or exits
 *   non-zero, or the extraction directory does not hold exactly one `.xml` file.
 */
const downloadFederalXml = async (source) => {
  const zipUrl = `https://www.gesetze-im-internet.de/${source.sourcePath}/xml.zip`;
  const zipPath = path.join(sourceRoot, `${source.sourcePath}.zip`);
  const extractDir = path.join(sourceRoot, source.sourcePath);
  fs.mkdirSync(extractDir, { recursive: true });

  const response = await fetch(zipUrl);
  if (!response.ok) {
    throw new Error(`Failed to download ${zipUrl}: ${response.status} ${response.statusText}`);
  }
  fs.writeFileSync(zipPath, Buffer.from(await response.arrayBuffer()));

  const unzip = spawnSync('/usr/bin/unzip', ['-o', '-q', zipPath, '-d', extractDir], {
    stdio: 'pipe',
    encoding: 'utf8',
  });
  if (unzip.error) {
    // The process could not be spawned at all (e.g. binary missing); in that
    // case status is null and stderr/stdout carry no useful message.
    throw new Error(`Failed to unzip ${zipPath}: ${unzip.error.message}`, { cause: unzip.error });
  }
  if (unzip.status !== 0) {
    throw new Error(`Failed to unzip ${zipPath}: ${unzip.stderr || unzip.stdout}`);
  }

  const xmlFiles = fs.readdirSync(extractDir).filter((entry) => entry.endsWith('.xml'));
  if (xmlFiles.length !== 1) {
    throw new Error(`Expected one XML file for ${source.sourcePath}, found ${xmlFiles.length}`);
  }
  return path.join(extractDir, xmlFiles[0]);
};

/**
 * Reads document-level metadata from the first `<norm>` element.
 *
 * @param {string} firstNorm - XML of the first norm (may be empty).
 * @returns {{
 *   officialAbbreviation: (string|undefined),
 *   legalAbbreviation: (string|undefined),
 *   title: (string|undefined),
 *   issuedAt: (string|undefined),
 *   statusNotes: string[],
 * }} Texts of `amtabk`, `jurabk`, `langue`, `ausfertigung-datum` and of every
 *   `standkommentar` element.
 */
const getMetadata = (firstNorm) => {
  const statusNotes = [
    ...firstNorm.matchAll(/<standkommentar(?:\s[^>]*)?>([\s\S]*?)<\/standkommentar>/g),
  ].map((match) => cleanXmlText(match[1]));

  return {
    officialAbbreviation: extractFirst(firstNorm, 'amtabk'),
    legalAbbreviation: extractFirst(firstNorm, 'jurabk'),
    title: extractFirst(firstNorm, 'langue'),
    issuedAt: extractFirst(firstNorm, 'ausfertigung-datum'),
    statusNotes,
  };
};

/**
 * Builds one rule section per `<norm doknr="...">` element that has text.
 *
 * @param {string} xml - Full XML document.
 * @param {{ scheduleId: string }} source - Manifest entry of the source.
 * @param {{ officialAbbreviation: (string|undefined) }} metadata - Document metadata.
 * @returns {object[]} Sections whose `<Content>` text is non-empty.
 */
const parseRuleSections = (xml, source, metadata) => {
  const norms = [...xml.matchAll(/<norm\s[^>]*?doknr="([^"]+)"[^>]*>([\s\S]*?)<\/norm>/g)];

  return norms
    .map(([, sourceNormId, normXml], index) => {
      const contentMatch = normXml.match(/<Content(?:\s[^>]*)?>([\s\S]*?)<\/Content>/);
      return {
        // Numbered by position among ALL norms, so ids are stable even though
        // text-less norms are filtered out afterwards.
        id: `${source.scheduleId}-section-${index + 1}`,
        scheduleId: source.scheduleId,
        sourceNormId,
        reference: extractFirst(normXml, 'enbez') || metadata.officialAbbreviation || source.scheduleId,
        title: extractFirst(normXml, 'titel'),
        text: contentMatch ? cleanXmlText(contentMatch[1]) : '',
      };
    })
    .filter((section) => section.text);
};

/**
 * Collects the table rows that `getRowType` recognises for the source's mode.
 *
 * Empty cells are dropped before classification, so cell positions refer to
 * the remaining non-empty cells.
 *
 * @param {string} xml - Full XML document.
 * @param {{ scheduleId: string, rowMode: string }} source - Manifest entry.
 * @returns {object[]} Fee rows in document order, numbered from 1.
 */
const parseFeeRows = (xml, source) => {
  const feeRows = [];

  for (const rowMatch of xml.matchAll(/<row(?:\s[^>]*)?>[\s\S]*?<\/row>/g)) {
    const cells = [...rowMatch[0].matchAll(/<entry(?:\s[^>]*)?>([\s\S]*?)<\/entry>/g)]
      .map((entryMatch) => cleanXmlText(entryMatch[1]))
      .filter(Boolean);
    if (cells.length < 2) {
      continue;
    }

    const rowType = getRowType(cells, source.rowMode);
    if (!rowType) {
      continue;
    }

    const isFeeEntry = rowType === 'fee-entry';
    const thirdCell = cells[2];
    const lastCell = cells.at(-1) || '';
    // An amount is read when the last cell carries a euro sign, or when a
    // code row has at least four cells.
    const hasAmount = lastCell.includes('€') || (isFeeEntry && cells.length >= 4);
    const rowIndex = feeRows.length + 1;

    feeRows.push({
      id: `${source.scheduleId}-row-${rowIndex}`,
      scheduleId: source.scheduleId,
      rowType,
      rowIndex,
      code: isFeeEntry ? cells[0] : undefined,
      description: isFeeEntry ? cells[1] : undefined,
      cells,
      points: isFeeEntry && thirdCell && /^\d+$/.test(thirdCell) ? Number(thirdCell) : undefined,
      amountEur: hasAmount ? parseAmount(lastCell) : undefined,
      // Rows are scanned document-wide, so the enclosing norm is not resolved.
      sourceNormId: undefined,
    });
  }

  return feeRows;
};

/**
 * Downloads one federal source and converts it into a schedule data record.
 *
 * @param {{ scheduleId: string, sourcePath: string, rowMode: string }} source -
 *   Manifest entry of the source.
 * @returns {Promise<object>} Record with source metadata, fee rows and rule
 *   sections; `dataStatus` and `notes` depend on whether any fee row was found.
 * @throws {Error} Propagates download and extraction failures.
 */
const parseFederalSource = async (source) => {
  const xmlPath = await downloadFederalXml(source);
  const xml = fs.readFileSync(xmlPath, 'utf8');
  const sourceFileName = path.basename(xmlPath);

  const firstNorm = xml.match(/<norm(?:\s[^>]*)?>[\s\S]*?<\/norm>/)?.[0] || '';
  const metadata = getMetadata(firstNorm);
  const ruleSections = parseRuleSections(xml, source, metadata);
  const feeRows = parseFeeRows(xml, source);
  const hasFeeRows = feeRows.length > 0;

  return {
    scheduleId: source.scheduleId,
    dataStatus: hasFeeRows ? 'federal-law-fee-data' : 'federal-law-rules-only',
    edition: metadata.statusNotes.join(' ') || `Retrieved ${generatedAt}`,
    source: {
      name: 'Gesetze im Internet',
      url: `https://www.gesetze-im-internet.de/${source.sourcePath}/xml.zip`,
      pageUrl: `https://www.gesetze-im-internet.de/${source.sourcePath}/`,
      retrievedAt: generatedAt,
      sourceFileName,
      officialAbbreviation: metadata.officialAbbreviation,
      legalAbbreviation: metadata.legalAbbreviation,
      title: metadata.title,
      issuedAt: metadata.issuedAt,
      statusNotes: metadata.statusNotes,
    },
    feeRows,
    ruleSections,
    notes: hasFeeRows
      ? ['Generated from official federal XML table rows. Original row cells are preserved in order.']
      : ['No separate fee table rows were present in the federal XML. Fee rules are represented as rule sections.'],
  };
};

/**
 * Builds the placeholder record for a source that is not parsed by this script.
 *
 * @param {{ scheduleId: string, sourceName: string, sourceUrl: string, note: string }} source -
 *   Manifest entry of the external source.
 * @returns {object} Record with empty `feeRows` and `ruleSections`.
 */
const getExternalSourceData = (source) => ({
  scheduleId: source.scheduleId,
  dataStatus: 'external-source-pending',
  edition: `Source identified ${generatedAt}`,
  source: {
    name: source.sourceName,
    url: source.sourceUrl,
    retrievedAt: generatedAt,
  },
  feeRows: [],
  ruleSections: [],
  notes: [source.note],
});

const scheduleData = [];

// Federal sources are fetched one after another so the log follows manifest order.
for (const source of federalSources) {
  const data = await parseFederalSource(source);
  scheduleData.push(data);
  console.log(`${source.fileName}: ${data.dataStatus}, rows=${data.feeRows.length}, sections=${data.ruleSections.length}`);
}

for (const source of externalSources) {
  const data = getExternalSourceData(source);
  scheduleData.push(data);
  console.log(`${source.fileName}: ${data.dataStatus}, rows=0, sections=0`);
}

const payload = {
  schemaVersion: 1,
  generatedAt,
  catalogs: manifest.catalogs,
  scheduleData,
};

fs.writeFileSync(outPath, `${JSON.stringify(payload, null, 2)}\n`);
console.log(`wrote ${path.relative(packageRoot, outPath)}`);