// fee-schedules/scripts/generate-germany.mjs

import { spawnSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

// All paths are anchored at the package root (the parent of this script's directory),
// so the script does not depend on the current working directory.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const packageRoot = path.resolve(scriptDir, '..');

// Scratch area for downloaded and extracted source archives.
const sourceRoot = path.join(packageRoot, '.nogit', 'fee-schedules-sources');

// Directory holding both the input manifest and the generated output file.
const onlyGitDir = path.join(packageRoot, '.onlygit');
const manifestPath = path.join(onlyGitDir, 'fee-schedules.sources.json');
const outPath = path.join(onlyGitDir, 'fee-schedules.json');

// First ten characters of the ISO timestamp, i.e. the YYYY-MM-DD date part.
const generatedAt = new Date().toISOString().slice(0, 10);

// The manifest is read before any directory is created, so a missing manifest
// fails the run without touching the file system.
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
const { federal: federalSources, external: externalSources } = manifest.generationSources;

for (const directory of [sourceRoot, onlyGitDir]) {
  fs.mkdirSync(directory, { recursive: true });
}

// Named entities understood by decodeEntities: the five predefined XML entities
// plus `&nbsp;`, which is deliberately mapped to a plain space.
const namedEntities = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

/**
 * Decodes character and entity references in a string.
 *
 * Every reference is decoded exactly once in a single pass, so an escaped
 * ampersand such as `&amp;lt;` yields the literal text `&lt;` rather than `<`.
 * Decimal (`&#228;`) and hexadecimal (`&#xE4;`) references are decoded for any
 * valid code point; a no-break space (`&nbsp;`, `&#160;`, `&#xA0;`) becomes a
 * regular space. Unknown named entities and invalid code points are left
 * untouched.
 *
 * @param {string} input - Text that may contain entity references.
 * @returns {string} The text with recognised references replaced.
 */
const decodeEntities = (input) => input.replace(
  /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/g,
  (entity, decimal, hex, name) => {
    if (name !== undefined) {
      return namedEntities.get(name) ?? entity;
    }

    const codePoint = decimal !== undefined
      ? Number.parseInt(decimal, 10)
      : Number.parseInt(hex, 16);

    if (codePoint === 0xa0) {
      return ' ';
    }

    // String.fromCodePoint throws above U+10FFFF; NUL and lone surrogates are
    // not meaningful text either, so such references are kept verbatim.
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
      return entity;
    }

    return String.fromCodePoint(codePoint);
  },
);

/**
 * Reduces an XML fragment to plain text: markup is replaced by spaces, entity
 * references are decoded, and runs of whitespace are collapsed and trimmed.
 *
 * Tags are stripped BEFORE entities are decoded. Decoding first would turn
 * escaped text such as `a &lt; b &gt; c` into `a < b > c`, and the tag
 * stripper would then delete `< b >` as if it were an element.
 *
 * @param {string} input - Raw XML fragment.
 * @returns {string} Single-line plain text.
 */
const cleanXmlText = (input) => decodeEntities(input.replace(/<[^>]+>/g, ' '))
  .replace(/\s+/g, ' ')
  .trim();

/**
 * Returns the cleaned text content of the first `<tag>…</tag>` element in `xml`.
 *
 * @param {string} xml - XML text to search.
 * @param {string} tag - Element name; interpolated into the pattern as-is.
 * @returns {string|undefined} Plain text of the element, or undefined when the
 *   element does not occur.
 */
const extractFirst = (xml, tag) => {
  // The opening tag may carry attributes; the content match is non-greedy so
  // it stops at the first closing tag.
  const elementPattern = new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${tag}>`);
  const found = xml.match(elementPattern);

  if (!found) {
    return undefined;
  }

  return cleanXmlText(found[1]);
};

/**
 * Tells whether a cell looks like a fee code: up to six digits with an
 * optional lowercase suffix letter, or one to three uppercase letters followed
 * (optionally after one whitespace character) by up to five digits.
 *
 * @param {string} value - Cell text.
 * @returns {boolean} True when the whole value matches one of the code shapes.
 */
const isCode = (value) => {
  const codePattern = /^(?:\d{1,6}[a-z]?|[A-Z]\d{1,5}|[A-Z]{1,3}\s?\d{1,5})$/;

  return codePattern.test(value);
};
/**
 * Tells whether a cell consists solely of digits, whitespace, `.`, `,`, `€`,
 * en dash or hyphen, and contains at least one digit.
 *
 * @param {string} value - Cell text.
 * @returns {boolean} True for purely numeric-looking cells.
 */
const hasNumericValue = (value) => {
  if (!/\d/.test(value)) {
    return false;
  }

  return /^[\d\s.,€–\-]+$/.test(value);
};

/**
 * Parses the last number found in `value`, reading `.` as a thousands
 * separator and `,` as the decimal separator.
 *
 * @param {string} value - Cell text, e.g. `'1.234,56 €'`.
 * @returns {number|undefined} The parsed number, or undefined when the text
 *   contains no digits.
 */
const parseAmount = (value) => {
  const candidates = value.match(/\d+(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?/g);
  const lastNumber = candidates?.at(-1);

  if (lastNumber === undefined) {
    return undefined;
  }

  // Drop the grouping dots, then switch the decimal comma to a dot.
  const normalised = lastNumber.split('.').join('').replace(',', '.');

  return Number(normalised);
};

/**
 * Classifies a table row according to the parsing mode of its source.
 *
 * @param {string[]} cells - Cell texts of the row.
 * @param {string} mode - `'code'` or `'table'`; any other value never matches.
 * @returns {'fee-entry'|'fee-table-row'|undefined} The row type, or undefined
 *   when the row should be skipped.
 */
const getRowType = (cells, mode) => {
  switch (mode) {
    case 'code':
      // In code mode a row counts when its first cell is a fee code.
      return isCode(cells[0]) ? 'fee-entry' : undefined;

    case 'table': {
      if (cells.length < 2) {
        return undefined;
      }

      // In table mode a row counts when at least two cells are purely numeric.
      const numericCells = cells.filter(hasNumericValue);
      return numericCells.length >= 2 ? 'fee-table-row' : undefined;
    }

    default:
      return undefined;
  }
};

/**
 * Downloads the `xml.zip` archive of one federal source from
 * gesetze-im-internet.de, stores it under `sourceRoot`, extracts it and returns
 * the path of the single XML file found in the extraction directory.
 *
 * @param {{ sourcePath: string }} source - Manifest entry; `sourcePath` is used
 *   both in the download URL and as the local file/directory name.
 * @returns {Promise<string>} Absolute path of the extracted XML file.
 * @throws {Error} When the download fails, `unzip` cannot be started or exits
 *   unsuccessfully, or the extraction directory does not hold exactly one
 *   `.xml` file.
 */
const downloadFederalXml = async (source) => {
  const zipUrl = `https://www.gesetze-im-internet.de/${source.sourcePath}/xml.zip`;
  const zipPath = path.join(sourceRoot, `${source.sourcePath}.zip`);
  const extractDir = path.join(sourceRoot, source.sourcePath);

  fs.mkdirSync(extractDir, { recursive: true });

  const response = await fetch(zipUrl);
  if (!response.ok) {
    throw new Error(`Failed to download ${zipUrl}: ${response.status} ${response.statusText}`);
  }

  fs.writeFileSync(zipPath, Buffer.from(await response.arrayBuffer()));

  // -o overwrites files from an earlier run without prompting; -q keeps unzip quiet.
  const unzip = spawnSync('/usr/bin/unzip', ['-o', '-q', zipPath, '-d', extractDir], {
    stdio: 'pipe',
    encoding: 'utf8',
  });

  // A process that could not be started (e.g. the binary is missing) is
  // reported through `error` with a null status and null output streams, so it
  // has to be handled before the exit status is inspected.
  if (unzip.error) {
    throw new Error(`Failed to unzip ${zipPath}: ${unzip.error.message}`, { cause: unzip.error });
  }

  if (unzip.status !== 0) {
    // With -q the output may be empty; fall back to the signal or exit status
    // so the message always states a reason.
    const reason = unzip.stderr
      || unzip.stdout
      || (unzip.signal ? `terminated by signal ${unzip.signal}` : `exit status ${unzip.status}`);
    throw new Error(`Failed to unzip ${zipPath}: ${reason}`);
  }

  // The directory listing is not limited to files from this extraction: XML
  // files left over from an earlier run are counted as well.
  const xmlFiles = fs.readdirSync(extractDir).filter((entry) => entry.endsWith('.xml'));
  if (xmlFiles.length !== 1) {
    throw new Error(`Expected one XML file for ${source.sourcePath}, found ${xmlFiles.length}`);
  }

  return path.join(extractDir, xmlFiles[0]);
};

/**
 * Collects document-level metadata from the XML of a single `<norm>` element.
 *
 * @param {string} firstNorm - XML text of the norm to read from.
 * @returns {{ officialAbbreviation: (string|undefined),
 *   legalAbbreviation: (string|undefined), title: (string|undefined),
 *   issuedAt: (string|undefined), statusNotes: string[] }} Cleaned text of the
 *   `amtabk`, `jurabk`, `langue` and `ausfertigung-datum` elements plus every
 *   `standkommentar` element, in document order.
 */
const getMetadata = (firstNorm) => {
  const statusNotes = Array.from(
    firstNorm.matchAll(/<standkommentar>([\s\S]*?)<\/standkommentar>/g),
    ([, commentXml]) => cleanXmlText(commentXml),
  );
  const readElement = (tag) => extractFirst(firstNorm, tag);

  const officialAbbreviation = readElement('amtabk');
  const legalAbbreviation = readElement('jurabk');
  const title = readElement('langue');
  const issuedAt = readElement('ausfertigung-datum');

  return {
    officialAbbreviation,
    legalAbbreviation,
    title,
    issuedAt,
    statusNotes,
  };
};

/**
 * Downloads one federal source and converts its XML into a schedule data
 * record consisting of source metadata, rule sections and fee rows.
 *
 * @param {{ scheduleId: string, sourcePath: string, rowMode: string }} source -
 *   Manifest entry. `rowMode` is handed to getRowType to decide which table
 *   rows count as fee rows.
 * @returns {Promise<object>} Record with `scheduleId`, `dataStatus`, `edition`,
 *   `source`, `feeRows`, `ruleSections` and `notes`.
 */
const parseFederalSource = async (source) => {
  const xmlPath = await downloadFederalXml(source);
  const xml = fs.readFileSync(xmlPath, 'utf8');
  const sourceFileName = path.basename(xmlPath);
  // Document-level metadata is read from the first <norm> element only.
  const firstNorm = xml.match(/<norm[\s\S]*?<\/norm>/)?.[0] || '';
  const metadata = getMetadata(firstNorm);
  // Only <norm> elements that carry a doknr attribute are considered.
  const norms = [...xml.matchAll(/<norm\b[^>]*doknr="([^"]+)"[^>]*>([\s\S]*?)<\/norm>/g)];

  const ruleSections = norms.map((match, index) => {
    const sourceNormId = match[1];
    const normXml = match[2];
    // Fall back to the document abbreviation, then the schedule id, when the norm has no <enbez>.
    const reference = extractFirst(normXml, 'enbez') || metadata.officialAbbreviation || source.scheduleId;
    const title = extractFirst(normXml, 'titel');
    const contentMatch = normXml.match(/<Content>([\s\S]*?)<\/Content>/);
    const text = contentMatch ? cleanXmlText(contentMatch[1]) : '';

    return {
      // The id uses the norm's position before filtering, so numbering can have gaps.
      id: `${source.scheduleId}-section-${index + 1}`,
      scheduleId: source.scheduleId,
      sourceNormId,
      reference,
      title,
      text,
    };
  // Norms without any text content are dropped.
  }).filter((section) => section.text);

  const feeRows = [];
  let rowIndex = 0;

  // Rows are collected from the whole document, independent of the norm they appear in.
  for (const rowMatch of xml.matchAll(/<row[\s\S]*?<\/row>/g)) {
    // Empty cells are removed, so cell positions below refer to non-empty cells only.
    const cells = [...rowMatch[0].matchAll(/<entry[^>]*>([\s\S]*?)<\/entry>/g)]
      .map((entryMatch) => cleanXmlText(entryMatch[1]))
      .filter(Boolean);

    if (cells.length < 2) {
      continue;
    }

    const rowType = getRowType(cells, source.rowMode);
    if (!rowType) {
      continue;
    }

    // rowIndex counts accepted rows only and starts at 1.
    rowIndex += 1;
    const thirdCell = cells[2];
    const lastCell = cells.at(-1) || '';
    // Points are taken from the third cell of a fee entry, and only when it is all digits.
    const points = rowType === 'fee-entry' && thirdCell && /^\d+$/.test(thirdCell)
      ? Number(thirdCell)
      : undefined;
    // An amount is parsed from the last cell when it contains a euro sign, or
    // when a fee entry has at least four cells.
    const amountEur = lastCell.includes('€') || (rowType === 'fee-entry' && cells.length >= 4)
      ? parseAmount(lastCell)
      : undefined;

    feeRows.push({
      id: `${source.scheduleId}-row-${rowIndex}`,
      scheduleId: source.scheduleId,
      rowType,
      rowIndex,
      code: rowType === 'fee-entry' ? cells[0] : undefined,
      description: rowType === 'fee-entry' ? cells[1] : undefined,
      cells,
      points,
      amountEur,
      // Always undefined here; JSON.stringify omits undefined properties from the output.
      sourceNormId: undefined,
    });
  }

  return {
    scheduleId: source.scheduleId,
    dataStatus: feeRows.length ? 'federal-law-fee-data' : 'federal-law-rules-only',
    // Without any status notes the retrieval date stands in for the edition.
    edition: metadata.statusNotes.join(' ') || `Retrieved ${generatedAt}`,
    source: {
      name: 'Gesetze im Internet',
      url: `https://www.gesetze-im-internet.de/${source.sourcePath}/xml.zip`,
      pageUrl: `https://www.gesetze-im-internet.de/${source.sourcePath}/`,
      retrievedAt: generatedAt,
      sourceFileName,
      officialAbbreviation: metadata.officialAbbreviation,
      legalAbbreviation: metadata.legalAbbreviation,
      title: metadata.title,
      issuedAt: metadata.issuedAt,
      statusNotes: metadata.statusNotes,
    },
    feeRows,
    ruleSections,
    notes: feeRows.length
      ? ['Generated from official federal XML table rows. Original row cells are preserved in order.']
      : ['No separate fee table rows were present in the federal XML. Fee rules are represented as rule sections.'],
  };
};

/**
 * Builds the placeholder record for a source that is not parsed by this
 * script: it carries the source reference and note from the manifest and has
 * no fee rows or rule sections.
 *
 * @param {{ scheduleId: string, sourceName: string, sourceUrl: string, note: string }} source -
 *   Manifest entry of the external source.
 * @returns {object} Schedule data record with status `external-source-pending`.
 */
const getExternalSourceData = (source) => {
  const { scheduleId, sourceName, sourceUrl, note } = source;

  return {
    scheduleId,
    dataStatus: 'external-source-pending',
    edition: `Source identified ${generatedAt}`,
    source: {
      name: sourceName,
      url: sourceUrl,
      retrievedAt: generatedAt,
    },
    feeRows: [],
    ruleSections: [],
    notes: [note],
  };
};

// Records are collected in manifest order: federal sources first, then external ones.
const scheduleData = [];

// Federal sources are processed one after another, so the progress lines
// appear in manifest order.
for (const federalSource of federalSources) {
  const parsed = await parseFederalSource(federalSource);
  scheduleData.push(parsed);

  const { dataStatus, feeRows, ruleSections } = parsed;
  console.log(`${federalSource.fileName}: ${dataStatus}, rows=${feeRows.length}, sections=${ruleSections.length}`);
}

// External sources only contribute a placeholder record.
for (const externalSource of externalSources) {
  const pending = getExternalSourceData(externalSource);
  scheduleData.push(pending);

  console.log(`${externalSource.fileName}: ${pending.dataStatus}, rows=0, sections=0`);
}

const payload = {
  schemaVersion: 1,
  generatedAt,
  catalogs: manifest.catalogs,
  scheduleData,
};

// Pretty-printed JSON followed by a trailing newline.
const serializedPayload = `${JSON.stringify(payload, null, 2)}\n`;
fs.writeFileSync(outPath, serializedPayload);

console.log(`wrote ${path.relative(packageRoot, outPath)}`);