einvoice/test/suite/einvoice_parsing/test.parse-11.processing-instructions.ts
2025-05-25 19:45:37 +00:00

518 lines
16 KiB
TypeScript

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-11: Processing Instructions - Handle XML processing instructions', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-11');
// Sub-test: feeds small documents that contain processing instructions (PIs)
// in different positions to EInvoice.fromXmlString and logs the outcome.
// Nothing is asserted; a parse failure is only reported on the console.
await t.test('Basic processing instructions', async () => {
  performanceTracker.startOperation('basic-pi');

  // `target` and `data` are optional and are only echoed to the console.
  const piTests = [
    {
      name: 'XML declaration',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<invoice>
<id>TEST-001</id>
</invoice>`,
      target: 'xml',
      data: 'version="1.0" encoding="UTF-8"',
      description: 'Standard XML declaration'
    },
    {
      name: 'Stylesheet processing instruction',
      xml: `<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="invoice.xsl"?>
<invoice>
<id>TEST-002</id>
</invoice>`,
      target: 'xml-stylesheet',
      data: 'type="text/xsl" href="invoice.xsl"',
      description: 'XSLT stylesheet reference'
    },
    {
      name: 'Multiple processing instructions',
      xml: `<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="invoice.xsl"?>
<?xml-model href="invoice.rnc" type="application/relax-ng-compact-syntax"?>
<?custom-pi data="value"?>
<invoice>
<id>TEST-003</id>
</invoice>`,
      description: 'Multiple PIs before root element'
    },
    {
      name: 'PI within document',
      xml: `<?xml version="1.0"?>
<invoice>
<header>
<?page-break?>
<id>TEST-004</id>
</header>
<?custom-instruction param="value"?>
<body>
<amount>100.00</amount>
</body>
</invoice>`,
      description: 'PIs inside document structure'
    },
    {
      name: 'PI with no data',
      xml: `<?xml version="1.0"?>
<invoice>
<?break?>
<id>TEST-005</id>
<?end?>
</invoice>`,
      description: 'Processing instructions without parameters'
    }
  ];

  for (const test of piTests) {
    const startTime = performance.now();
    console.log(`${test.name}:`);
    if (test.target) {
      console.log(` Target: ${test.target}`);
    }
    if (test.data) {
      console.log(` Data: ${test.data}`);
    }
    console.log(` Description: ${test.description}`);
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(test.xml);
        console.log(' ✓ Parsed with processing instructions');
      } else {
        console.log(' ⚠️ Cannot test without fromXmlString');
      }
    } catch (error: unknown) {
      // A thrown value is not guaranteed to be an Error; narrow before
      // reading `.message` so non-Error throws are still reported usefully.
      const message = error instanceof Error ? error.message : String(error);
      console.log(` ✗ Error: ${message}`);
    }
    // The metric is recorded for every case, including those that failed to parse.
    performanceTracker.recordMetric('pi-parsing', performance.now() - startTime);
  }

  performanceTracker.endOperation('basic-pi');
});
// Sub-test: prints a reference table of processing-instruction syntax rules.
// The snippets are only logged; none of them is parsed or validated here.
await t.test('Processing instruction syntax rules', async () => {
  performanceTracker.startOperation('pi-syntax');

  // A group carries either `valid`/`invalid` example lists or a `tests` list.
  const syntaxTests = [
    {
      name: 'Valid PI names',
      valid: [
        '<?valid-name data?>',
        '<?name123 data?>',
        '<?my-processor data?>',
        '<?_underscore data?>'
      ],
      invalid: [
        '<?123name data?>', // Cannot start with number
        '<?my name data?>', // No spaces in target
        '<?xml data?>', // 'xml' is reserved
        '<? data?>' // Must have target name
      ]
    },
    {
      name: 'Reserved target names',
      tests: [
        { pi: '<?xml version="1.0"?>', valid: true, note: 'XML declaration allowed' },
        { pi: '<?XML data?>', valid: false, note: 'Case variations of xml reserved' },
        { pi: '<?XmL data?>', valid: false, note: 'Any case of xml reserved' }
      ]
    },
    {
      name: 'PI data requirements',
      tests: [
        { pi: '<?target?>', valid: true, note: 'Empty data is valid' },
        { pi: '<?target ?>', valid: true, note: 'Whitespace only is valid' },
        { pi: '<?target cannot contain ??>', valid: false, note: 'Cannot contain ?>' },
        { pi: '<?target data with ? and > separately?>', valid: true, note: 'Can contain ? and > separately' }
      ]
    }
  ];

  for (const group of syntaxTests) {
    console.log(`\n${group.name}:`);

    // Example lists are printed only when both halves are present.
    const validExamples = group.valid;
    const invalidExamples = group.invalid;
    if (validExamples && invalidExamples) {
      console.log(' Valid examples:');
      validExamples.forEach((example) => {
        console.log(`${example}`);
      });
      console.log(' Invalid examples:');
      invalidExamples.forEach((example) => {
        console.log(`${example}`);
      });
    }

    // Groups without a `tests` list contribute nothing to this loop.
    for (const rule of group.tests ?? []) {
      const mark = rule.valid ? '✓' : '✗';
      console.log(` ${rule.pi}`);
      console.log(` ${mark} ${rule.note}`);
    }
  }

  performanceTracker.endOperation('pi-syntax');
});
// Sub-test: extracts processing instructions (PIs) from sample e-invoice
// documents with a regular expression and prints them. Informational only;
// nothing is asserted and the documents are not parsed by the library.
await t.test('Common processing instructions in e-invoices', async () => {
  performanceTracker.startOperation('einvoice-pi');

  const einvoicePIs = [
    {
      name: 'XSLT transformation',
      xml: `<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="https://example.com/invoice-transform.xsl"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<ID>UBL-001</ID>
</Invoice>`,
      purpose: 'Browser-based invoice rendering',
      common: true
    },
    {
      name: 'Schema validation hint',
      xml: `<?xml version="1.0"?>
<?xml-model href="http://docs.oasis-open.org/ubl/os-UBL-2.1/xsd/maindoc/UBL-Invoice-2.1.xsd"
schematypens="http://www.w3.org/2001/XMLSchema"?>
<Invoice>
<ID>TEST-001</ID>
</Invoice>`,
      purpose: 'Schema location for validation',
      common: false
    },
    {
      name: 'PDF generation instructions',
      xml: `<?xml version="1.0"?>
<?pdf-generator version="2.0" profile="ZUGFeRD"?>
<?pdf-attachment filename="invoice.xml" relationship="Data"?>
<Invoice>
<ID>PDF-001</ID>
</Invoice>`,
      purpose: 'PDF/A-3 generation hints',
      common: false
    },
    {
      name: 'Digital signature instructions',
      xml: `<?xml version="1.0"?>
<?signature-method algorithm="RSA-SHA256"?>
<?signature-transform algorithm="http://www.w3.org/2001/10/xml-exc-c14n#"?>
<Invoice>
<ID>SIGNED-001</ID>
</Invoice>`,
      purpose: 'Signing process configuration',
      common: false
    },
    {
      name: 'Format-specific processing',
      xml: `<?xml version="1.0"?>
<?facturx-version 1.0?>
<?zugferd-profile EXTENDED?>
<rsm:CrossIndustryInvoice>
<rsm:ExchangedDocument>
<ram:ID>CII-001</ram:ID>
</rsm:ExchangedDocument>
</rsm:CrossIndustryInvoice>`,
      purpose: 'Format-specific metadata',
      common: false
    }
  ];

  // Group 1 is the target: a run of characters other than whitespace and '?'.
  // Group 2 is the data, matched lazily up to the first '?>' so that a lone
  // '?' inside the data (for example a URL query string) does not cause the
  // whole instruction to be skipped.
  const piPattern = /<\?([^\s?]+)([\s\S]*?)\?>/g;

  for (const pi of einvoicePIs) {
    console.log(`\n${pi.name}:`);
    console.log(` Purpose: ${pi.purpose}`);
    console.log(` Common in e-invoices: ${pi.common ? 'Yes' : 'No'}`);
    const startTime = performance.now();
    try {
      // Drop the XML declaration before counting so the reported number
      // matches the instructions listed below it.
      const pis = Array.from(pi.xml.matchAll(piPattern)).filter((m) => m[1] !== 'xml');
      console.log(` Found ${pis.length} processing instructions:`);
      for (const [, target, data] of pis) {
        console.log(` <?${target}${data}?>`);
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.log(` Error analyzing PIs: ${message}`);
    }
    performanceTracker.recordMetric('einvoice-pi', performance.now() - startTime);
  }

  performanceTracker.endOperation('einvoice-pi');
});
await t.test('Processing instruction handling strategies', async () => {
performanceTracker.startOperation('pi-handling');
/**
 * Minimal dispatcher for XML processing instructions (PIs).
 *
 * Callbacks are registered per PI target; `process` scans a document with a
 * regular expression and invokes the callback registered for each target it
 * finds. This is a text scan, not an XML parse.
 */
class PIHandler {
  private readonly handlers = new Map<string, (data: string) => void>();

  /**
   * Registers the callback for a PI target. A later registration for the
   * same target replaces the earlier one.
   *
   * @param target - PI target name, e.g. `xml-stylesheet`.
   * @param handler - Receives the PI data with surrounding whitespace trimmed.
   */
  register(target: string, handler: (data: string) => void): void {
    this.handlers.set(target, handler);
  }

  /**
   * Finds every `<?target data?>` in `xml` in document order and dispatches
   * it. The XML declaration (target `xml`) is skipped; targets without a
   * registered callback are logged and ignored.
   *
   * @param xml - Document text to scan.
   */
  process(xml: string): void {
    // Data is matched lazily up to the first '?>', so a lone '?' inside the
    // data (for example a URL query string) no longer hides the instruction.
    const piPattern = /<\?([^\s?]+)([\s\S]*?)\?>/g;

    for (const [, target, data] of xml.matchAll(piPattern)) {
      if (target === 'xml') continue; // Skip XML declaration

      const handler = this.handlers.get(target);
      if (handler) {
        console.log(` Processing <?${target}...?>`);
        handler(data.trim());
      } else {
        console.log(` Ignoring unhandled PI: <?${target}...?>`);
      }
    }
  }
}
// Build a PIHandler with callbacks for three known targets, then run it over
// a sample document that also contains one unregistered target.
const dispatcher = new PIHandler();

// Register handlers for common PIs
dispatcher.register('xml-stylesheet', (data) => {
  const href = /href="([^"]+)"/.exec(data)?.[1];
  if (href !== undefined) {
    console.log(` Stylesheet URL: ${href}`);
  }
});

dispatcher.register('pdf-generator', (data) => {
  const version = /version="([^"]+)"/.exec(data)?.[1];
  if (version !== undefined) {
    console.log(` PDF generator version: ${version}`);
  }
});

// This callback ignores the PI data.
dispatcher.register('page-break', () => {
  console.log(' Page break instruction found');
});

// Test document
const sampleDocument = `<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="invoice.xsl"?>
<?pdf-generator version="2.0" profile="ZUGFeRD"?>
<invoice>
<?page-break?>
<content>Test</content>
<?custom-pi unknown="true"?>
</invoice>`;

console.log('Processing instructions found:');
dispatcher.process(sampleDocument);
performanceTracker.endOperation('pi-handling');
});
// Sub-test: prints a catalogue of security risks that processing
// instructions can carry, followed by a list of best practices.
// Informational only; nothing is parsed or asserted.
await t.test('PI security considerations', async () => {
  performanceTracker.startOperation('pi-security');

  const securityTests = [
    {
      name: 'External resource reference',
      pi: '<?xml-stylesheet href="http://malicious.com/steal-data.xsl"?>',
      risk: 'SSRF, data exfiltration',
      mitigation: 'Validate URLs, use allowlist'
    },
    {
      name: 'Code execution hint',
      pi: '<?execute-script language="javascript" code="alert(1)"?>',
      risk: 'Arbitrary code execution',
      mitigation: 'Never execute PI content as code'
    },
    {
      name: 'File system access',
      pi: '<?include-file path="/etc/passwd"?>',
      risk: 'Local file disclosure',
      mitigation: 'Ignore file system PIs'
    },
    {
      name: 'Parser-specific instructions',
      pi: '<?parser-config disable-security-checks="true"?>',
      risk: 'Security bypass',
      mitigation: 'Ignore parser configuration PIs'
    }
  ];

  console.log('Security considerations for processing instructions:');
  securityTests.forEach(({ name, pi, risk, mitigation }) => {
    console.log(`\n${name}:`);
    console.log(` PI: ${pi}`);
    console.log(` Risk: ${risk}`);
    console.log(` Mitigation: ${mitigation}`);
  });

  const bestPractices = [
    ' 1. Whitelist allowed PI targets',
    ' 2. Validate all external references',
    ' 3. Never execute PI content as code',
    ' 4. Log suspicious PIs for monitoring',
    ' 5. Consider removing PIs in production'
  ];
  console.log('\nBest practices:');
  for (const practice of bestPractices) {
    console.log(practice);
  }

  performanceTracker.endOperation('pi-security');
});
// Sub-test: scans up to 100 corpus files for processing instructions (PIs)
// with a regular expression and prints aggregate statistics.
// Informational only; nothing is asserted.
await t.test('Corpus PI analysis', async () => {
  performanceTracker.startOperation('corpus-pi');

  const corpusLoader = new CorpusLoader();
  const xmlFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);
  console.log(`\nAnalyzing processing instructions in ${xmlFiles.length} corpus files...`);

  const piStats = {
    total: 0,
    filesWithPIs: 0,
    piByTarget: new Map<string, number>(),
    totalPIs: 0,
    stylesheetRefs: 0,
    otherExternalRefs: 0
  };
  // Files whose read or scan threw; they are skipped but no longer invisible.
  let skippedFiles = 0;

  // Data is matched lazily up to the first '?>', so a lone '?' inside the
  // data (for example a URL query string) does not hide the instruction.
  const piPattern = /<\?([^\s?]+)([\s\S]*?)\?>/g;

  const sampleSize = Math.min(100, xmlFiles.length);
  const sampledFiles = xmlFiles.slice(0, sampleSize);

  for (const file of sampledFiles) {
    piStats.total++;
    try {
      const content = await plugins.fs.readFile(file.path, 'utf8');
      // Find all PIs except XML declaration
      const pis = Array.from(content.matchAll(piPattern)).filter((m) => m[1] !== 'xml');
      if (pis.length > 0) {
        piStats.filesWithPIs++;
        piStats.totalPIs += pis.length;
        for (const [, target, data] of pis) {
          piStats.piByTarget.set(target, (piStats.piByTarget.get(target) ?? 0) + 1);
          // Check for external references
          if (target === 'xml-stylesheet') {
            piStats.stylesheetRefs++;
          } else if (data.includes('href=') || data.includes('src=')) {
            piStats.otherExternalRefs++;
          }
        }
      }
    } catch {
      // Best effort: skip files that can't be read, but keep count of them.
      skippedFiles++;
    }
  }

  // Guard the ratio: with an empty corpus the division would print "NaN%".
  const sharePercent =
    piStats.total > 0 ? (piStats.filesWithPIs / piStats.total * 100).toFixed(1) : '0.0';

  console.log('\nProcessing Instruction Statistics:');
  console.log(`Files analyzed: ${piStats.total}`);
  console.log(`Files with PIs: ${piStats.filesWithPIs} (${sharePercent}%)`);
  console.log(`Total PIs found: ${piStats.totalPIs}`);
  console.log(`Stylesheet references: ${piStats.stylesheetRefs}`);
  console.log(`Other external references: ${piStats.otherExternalRefs}`);
  if (skippedFiles > 0) {
    console.log(`Files skipped due to errors: ${skippedFiles}`);
  }

  if (piStats.piByTarget.size > 0) {
    console.log('\nPI targets found:');
    // Ten most frequent targets, most frequent first.
    const sortedTargets = Array.from(piStats.piByTarget.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10);
    for (const [target, count] of sortedTargets) {
      console.log(` <?${target}...?>: ${count} occurrences`);
    }
  }

  performanceTracker.endOperation('corpus-pi');
});
await t.test('PI performance impact', async () => {
performanceTracker.startOperation('pi-performance');
// Generate documents with varying PI counts
/**
 * Builds a synthetic document for the timing runs: an XML declaration,
 * `piCount` prolog PIs, then an `<invoice>` element holding one internal PI
 * plus one field element for every index below `piCount / 2`.
 *
 * @param piCount - Number of prolog processing instructions to emit.
 * @returns The assembled XML text.
 */
const generateXmlWithPIs = (piCount: number): string => {
  const parts: string[] = ['<?xml version="1.0"?>\n'];

  // Add various PIs
  for (let index = 0; index < piCount; index += 1) {
    parts.push(`<?pi-${index} data="value${index}" param="test"?>\n`);
  }

  parts.push('<invoice>\n');

  // Add some PIs within document
  const internalLimit = piCount / 2;
  for (let index = 0; index < internalLimit; index += 1) {
    parts.push(
      ` <?internal-pi-${index}?>\n`,
      ` <field${index}>Value ${index}</field${index}>\n`
    );
  }

  parts.push('</invoice>');
  return parts.join('');
};
// Time EInvoice.fromXmlString on generated documents of increasing PI count
// and print the elapsed time per document. A metric is recorded only for
// runs that did not throw.
console.log('Performance impact of processing instructions:');
const testCounts = [0, 10, 50, 100];
for (const count of testCounts) {
  const xml = generateXmlWithPIs(count);
  const xmlSize = Buffer.byteLength(xml, 'utf8');
  const startTime = performance.now();
  try {
    const invoice = new einvoice.EInvoice();
    if (invoice.fromXmlString) {
      await invoice.fromXmlString(xml);
    }
    const parseTime = performance.now() - startTime;
    console.log(` ${count} PIs (${(xmlSize / 1024).toFixed(1)}KB): ${parseTime.toFixed(2)}ms`);
    // Skip the per-PI figure for the zero-PI baseline to avoid dividing by zero.
    if (count > 0) {
      console.log(` Time per PI: ${(parseTime / count).toFixed(3)}ms`);
    }
    performanceTracker.recordMetric(`pi-count-${count}`, parseTime);
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error; narrow before
    // reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.log(` Error with ${count} PIs: ${message}`);
  }
}
performanceTracker.endOperation('pi-performance');
});
// Performance summary
console.log('\n' + performanceTracker.getSummary());

// PI best practices
const piBestPractices = [
  '1. Preserve PIs during document processing',
  '2. Validate external references for security',
  '3. Support common PIs (xml-stylesheet)',
  '4. Allow custom PI handlers for extensibility',
  '5. Ignore unknown PIs gracefully',
  '6. Never execute PI content as code',
  '7. Consider PI impact on performance',
  '8. Document which PIs are supported'
];
console.log('\nProcessing Instruction Best Practices:');
piBestPractices.forEach((practice) => {
  console.log(practice);
});
});
// Called once, after the PARSE-11 test above has been registered via tap.test.
// NOTE(review): presumably this starts the tap runner — confirm against the tapbundle docs.
tap.start();