516 lines
17 KiB
TypeScript
516 lines
17 KiB
TypeScript
|
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
||
|
import * as einvoice from '../../../ts/index.js';
|
||
|
import * as plugins from '../../plugins.js';
|
||
|
import { CorpusLoader } from '../../helpers/corpus.loader.js';
|
||
|
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
|
||
|
|
||
|
tap.test('PARSE-10: CDATA Section Handling - Process CDATA sections correctly', async (t) => {
|
||
|
const performanceTracker = new PerformanceTracker('PARSE-10');
|
||
|
|
||
|
await t.test('Basic CDATA sections', async () => {
  performanceTracker.startOperation('basic-cdata');

  // Assembles a fixture document from its individual lines.
  const xmlOf = (...rows: string[]): string => rows.join('\n');

  // Each fixture pairs a small document with the text held by its CDATA section.
  // NOTE(review): expectedContent is only printed below, never compared with
  // what the parser produced - confirm whether an assertion was intended.
  const fixtures = [
    {
      name: 'Simple CDATA content',
      description: 'Basic CDATA section',
      expectedContent: 'This is plain text content',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<notes><![CDATA[This is plain text content]]></notes>',
        '</invoice>'
      )
    },
    {
      name: 'CDATA with special characters',
      description: 'Special characters preserved',
      expectedContent: 'Price < 100 & quantity > 5',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<description><![CDATA[Price < 100 & quantity > 5]]></description>',
        '</invoice>'
      )
    },
    {
      name: 'CDATA with XML-like content',
      description: 'XML markup as text',
      expectedContent: '<p>This is <b>HTML</b> content</p>',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<htmlContent><![CDATA[<p>This is <b>HTML</b> content</p>]]></htmlContent>',
        '</invoice>'
      )
    },
    {
      name: 'Empty CDATA section',
      description: 'Empty CDATA is valid',
      expectedContent: '',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<empty><![CDATA[]]></empty>',
        '</invoice>'
      )
    },
    {
      name: 'CDATA with line breaks',
      description: 'Preserves formatting',
      expectedContent: 'Line 1\nLine 2\nLine 3',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<address><![CDATA[Line 1',
        'Line 2',
        'Line 3]]></address>',
        '</invoice>'
      )
    }
  ];

  for (const { name, description, expectedContent, xml } of fixtures) {
    // The clock starts before the log lines, so the metric includes them.
    const startedAt = performance.now();

    console.log(`${name}:`);
    console.log(` Description: ${description}`);
    console.log(` Expected content: "${expectedContent}"`);

    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(xml);
        console.log(' ✓ CDATA parsed successfully');
      } else {
        console.log(' ⚠️ Cannot test without fromXmlString');
      }
    } catch (error) {
      // A parse failure is reported but does not fail the subtest.
      console.log(` ✗ Error: ${error.message}`);
    }

    const elapsed = performance.now() - startedAt;
    performanceTracker.recordMetric('cdata-parsing', elapsed);
  }

  performanceTracker.endOperation('basic-cdata');
});
|
||
|
|
||
|
await t.test('CDATA edge cases', async () => {
  performanceTracker.startOperation('cdata-edge-cases');

  // Assembles a fixture document from its individual lines.
  const xmlOf = (...rows: string[]): string => rows.join('\n');

  // Documents that probe the boundaries of CDATA syntax; some are deliberately
  // not well-formed. The outcome of each parse is only logged.
  const scenarios = [
    {
      name: 'Nested CDATA-like content',
      note: 'CDATA end sequence in content needs escaping',
      challenge: 'Cannot nest CDATA sections',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<code><![CDATA[if (text.includes("<![CDATA[")) { /* handle nested */ }]]></code>',
        '</invoice>'
      )
    },
    {
      name: 'CDATA end sequence in content',
      note: 'End sequence must be escaped',
      challenge: 'Split ]]> into ]] and >',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<script><![CDATA[',
        '// This would end CDATA: ]]>',
        '// Must be split: ]]]]><![CDATA[>',
        ']]></script>',
        '</invoice>'
      )
    },
    {
      name: 'Multiple CDATA sections',
      note: 'Multiple CDATA in same element',
      challenge: 'Proper content concatenation',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<content>',
        '<![CDATA[Part 1]]>',
        'Normal text',
        '<![CDATA[Part 2]]>',
        '</content>',
        '</invoice>'
      )
    },
    {
      name: 'CDATA in attributes (invalid)',
      note: 'CDATA not allowed in attributes',
      challenge: 'Should cause parse error',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<item description="<![CDATA[Not allowed]]>">Content</item>',
        '</invoice>'
      )
    },
    {
      name: 'Whitespace around CDATA',
      note: 'Whitespace outside CDATA preserved',
      challenge: 'Handle mixed content correctly',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<invoice>',
        '<padded> <![CDATA[Content]]> </padded>',
        '</invoice>'
      )
    }
  ];

  for (const { name, note, challenge, xml } of scenarios) {
    const startedAt = performance.now();

    console.log(`\n${name}:`);
    console.log(` Note: ${note}`);
    console.log(` Challenge: ${challenge}`);

    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(xml);
        console.log(' Result: Parsed successfully');
      }
    } catch (error) {
      // Both outcomes are acceptable here; the message is the result.
      console.log(` Result: ${error.message}`);
    }

    const elapsed = performance.now() - startedAt;
    performanceTracker.recordMetric('edge-case', elapsed);
  }

  performanceTracker.endOperation('cdata-edge-cases');
});
|
||
|
|
||
|
await t.test('CDATA vs escaped content comparison', async () => {
  performanceTracker.startOperation('cdata-vs-escaped');

  // Each entry carries the same logical text twice: wrapped in a CDATA section
  // and written with XML entity references. The `escaped` variant must really
  // be escaped (&lt; &gt; &amp; &quot; &apos;), otherwise the byte-size
  // comparison below compares raw text against raw text plus CDATA delimiters.
  const comparisonTests = [
    {
      name: 'Special characters',
      cdata: '<note><![CDATA[Price < 100 & quantity > 5]]></note>',
      escaped: '<note>Price &lt; 100 &amp; quantity &gt; 5</note>',
      content: 'Price < 100 & quantity > 5'
    },
    {
      name: 'HTML snippet',
      cdata: '<html><![CDATA[<div class="invoice">Content</div>]]></html>',
      escaped: '<html>&lt;div class=&quot;invoice&quot;&gt;Content&lt;/div&gt;</html>',
      content: '<div class="invoice">Content</div>'
    },
    {
      name: 'Code snippet',
      cdata: '<code><![CDATA[if (a && b) { return "result"; }]]></code>',
      escaped: '<code>if (a &amp;&amp; b) { return &quot;result&quot;; }</code>',
      content: 'if (a && b) { return "result"; }'
    },
    {
      name: 'Quote marks',
      cdata: '<quote><![CDATA[He said "Hello" and she said \'Hi\']]></quote>',
      escaped: '<quote>He said &quot;Hello&quot; and she said &apos;Hi&apos;</quote>',
      content: 'He said "Hello" and she said \'Hi\''
    }
  ];

  console.log('CDATA vs Escaped Content:');

  for (const test of comparisonTests) {
    console.log(`\n${test.name}:`);
    console.log(` Expected content: "${test.content}"`);
    console.log(` CDATA approach: More readable, preserves content as-is`);
    console.log(` Escaped approach: Standard XML, but less readable`);

    // Compare the encoded sizes in bytes rather than in UTF-16 code units.
    const cdataSize = Buffer.byteLength(test.cdata, 'utf8');
    const escapedSize = Buffer.byteLength(test.escaped, 'utf8');

    console.log(` Size comparison: CDATA=${cdataSize}B, Escaped=${escapedSize}B`);
    if (cdataSize < escapedSize) {
      console.log(` CDATA is ${escapedSize - cdataSize} bytes smaller`);
    } else if (cdataSize > escapedSize) {
      console.log(` Escaped is ${cdataSize - escapedSize} bytes smaller`);
    } else {
      // Avoid claiming one side is "0 bytes smaller" on a tie.
      console.log(' Both encodings are the same size');
    }
  }

  performanceTracker.endOperation('cdata-vs-escaped');
});
|
||
|
|
||
|
await t.test('CDATA in e-invoice contexts', async () => {
  performanceTracker.startOperation('einvoice-cdata');

  // Assembles a fixture document from its individual lines.
  const xmlOf = (...rows: string[]): string => rows.join('\n');

  // Invoice-shaped documents that carry free text, markup, base64 data and
  // embedded XML inside CDATA sections.
  const scenarios = [
    {
      name: 'Terms and conditions',
      useCase: 'Legal text with special characters',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<Invoice>',
        '<PaymentTerms>',
        '<Note><![CDATA[',
        'Payment Terms & Conditions:',
        '1. Payment due within 30 days',
        '2. Late payment fee: 2% per month',
        '3. Disputes must be raised within 7 days',
        '',
        'For more info visit: https://example.com/terms',
        ']]></Note>',
        '</PaymentTerms>',
        '</Invoice>'
      )
    },
    {
      name: 'Product description with HTML',
      useCase: 'Rich text product descriptions',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<Invoice>',
        '<InvoiceLine>',
        '<Item>',
        '<Description><![CDATA[',
        '<h3>Premium Widget</h3>',
        '<ul>',
        '<li>Dimension: 10cm x 5cm x 3cm</li>',
        '<li>Weight: < 500g</li>',
        '<li>Price: €99.99</li>',
        '</ul>',
        ']]></Description>',
        '</Item>',
        '</InvoiceLine>',
        '</Invoice>'
      )
    },
    {
      name: 'Base64 encoded attachment',
      useCase: 'Binary data encoding',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<Invoice>',
        '<AdditionalDocumentReference>',
        '<Attachment>',
        '<EmbeddedDocumentBinaryObject mimeCode="application/pdf">',
        '<![CDATA[JVBERi0xLjQKJeLjz9MKCjEgMCBvYmoKPDwKL1R5cGUgL0NhdGFsb2cKL1BhZ2VzIDIgMCBSCj4+CmVuZG9iag==]]>',
        '</EmbeddedDocumentBinaryObject>',
        '</Attachment>',
        '</AdditionalDocumentReference>',
        '</Invoice>'
      )
    },
    {
      name: 'Custom XML extensions',
      useCase: 'Embedded XML without namespace conflicts',
      xml: xmlOf(
        '<?xml version="1.0"?>',
        '<Invoice>',
        '<UBLExtensions>',
        '<UBLExtension>',
        '<ExtensionContent><![CDATA[',
        '<CustomData xmlns="http://example.com/custom">',
        '<Field1>Value with < and > chars</Field1>',
        '<Field2>Complex & data</Field2>',
        '</CustomData>',
        ']]></ExtensionContent>',
        '</UBLExtension>',
        '</UBLExtensions>',
        '</Invoice>'
      )
    }
  ];

  for (const scenario of scenarios) {
    console.log(`\n${scenario.name}:`);
    console.log(` Use case: ${scenario.useCase}`);

    // Unlike the header lines above, only the parse attempt is timed.
    const startedAt = performance.now();

    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(scenario.xml);
        console.log(' ✓ Valid e-invoice usage of CDATA');
      }
    } catch (error) {
      // Failures are reported as information; they do not fail the subtest.
      console.log(` ⚠️ Parse result: ${error.message}`);
    }

    const elapsed = performance.now() - startedAt;
    performanceTracker.recordMetric('einvoice-usecase', elapsed);
  }

  performanceTracker.endOperation('einvoice-cdata');
});
|
||
|
|
||
|
await t.test('CDATA performance impact', async () => {
|
||
|
performanceTracker.startOperation('cdata-performance');
|
||
|
|
||
|
// Generate test documents with varying CDATA usage
|
||
|
/**
 * Builds a synthetic document in which every field value sits in a CDATA section.
 *
 * @param cdataCount - number of `<fieldN>` elements to emit (N counts from 0)
 * @param cdataSize - number of `X` characters placed inside each CDATA section
 * @returns the complete XML document as a string
 */
const generateInvoiceWithCDATA = (cdataCount: number, cdataSize: number): string => {
  // Every field carries the same payload, so build it once instead of per iteration.
  const payload = 'X'.repeat(cdataSize);

  let xml = '<?xml version="1.0"?>\n<invoice>\n';
  for (let i = 0; i < cdataCount; i++) {
    xml += ` <field${i}><![CDATA[${payload}]]></field${i}>\n`;
  }

  return `${xml}</invoice>`;
};
|
||
|
|
||
|
/**
 * Builds a synthetic document whose field values contain `&`, `<` and `>`
 * written as XML entity references instead of being wrapped in CDATA.
 *
 * @param fieldCount - number of `<fieldN>` elements to emit (N counts from 0)
 * @param contentSize - approximate raw length of each value; the 5-character
 *   seed `X&<>X` is repeated `contentSize / 5` times (fraction dropped by `repeat`)
 * @returns the complete XML document as a string
 */
const generateInvoiceEscaped = (fieldCount: number, contentSize: number): string => {
  // The value is identical for every field, so escape it once up front.
  const raw = 'X&<>X'.repeat(contentSize / 5);

  // Ampersands go first so the '&' introduced by the other entities is not
  // escaped a second time.
  const escaped = raw
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  let xml = '<?xml version="1.0"?>\n<invoice>\n';
  for (let i = 0; i < fieldCount; i++) {
    xml += ` <field${i}>${escaped}</field${i}>\n`;
  }

  return `${xml}</invoice>`;
};
|
||
|
|
||
|
console.log('Performance comparison:');

const testConfigs = [
  { fields: 10, contentSize: 100 },
  { fields: 50, contentSize: 500 },
  { fields: 100, contentSize: 1000 }
];

// Parses one document and returns how long the attempt took. A parse error is
// captured rather than thrown so it can be reported after the clock has
// stopped; previously such errors were discarded silently, which hid the fact
// that a timing might describe a failure path rather than a full parse.
const timeParse = async (xml: string): Promise<{ elapsed: number; failure: string | null }> => {
  let failure: string | null = null;
  const startedAt = performance.now();
  try {
    const invoice = new einvoice.EInvoice();
    if (invoice.fromXmlString) {
      await invoice.fromXmlString(xml);
    }
  } catch (error) {
    failure = error instanceof Error ? error.message : String(error);
  }
  return { elapsed: performance.now() - startedAt, failure };
};

for (const config of testConfigs) {
  console.log(`\n${config.fields} fields, ${config.contentSize} chars each:`);

  // Test CDATA version
  const cdataXml = generateInvoiceWithCDATA(config.fields, config.contentSize);
  const cdataSize = Buffer.byteLength(cdataXml, 'utf8');
  const { elapsed: cdataTime, failure: cdataFailure } = await timeParse(cdataXml);

  // Test escaped version
  const escapedXml = generateInvoiceEscaped(config.fields, config.contentSize);
  const escapedSize = Buffer.byteLength(escapedXml, 'utf8');
  const { elapsed: escapedTime, failure: escapedFailure } = await timeParse(escapedXml);

  console.log(` CDATA: ${cdataTime.toFixed(2)}ms (${(cdataSize/1024).toFixed(1)}KB)`);
  if (cdataFailure !== null) {
    console.log(` CDATA parse failed: ${cdataFailure}`);
  }
  console.log(` Escaped: ${escapedTime.toFixed(2)}ms (${(escapedSize/1024).toFixed(1)}KB)`);
  if (escapedFailure !== null) {
    console.log(` Escaped parse failed: ${escapedFailure}`);
  }

  // A zero baseline would turn the percentage into NaN or Infinity.
  const difference = cdataTime > 0
    ? `${((escapedTime - cdataTime) / cdataTime * 100).toFixed(1)}%`
    : 'n/a';
  console.log(` Difference: ${difference}`);

  performanceTracker.recordMetric(`perf-${config.fields}fields`, cdataTime);
}

performanceTracker.endOperation('cdata-performance');
|
||
|
});
|
||
|
|
||
|
await t.test('Corpus CDATA usage analysis', async () => {
  performanceTracker.startOperation('corpus-cdata');

  const corpusLoader = new CorpusLoader();
  const xmlFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);

  console.log(`\nAnalyzing CDATA usage in ${xmlFiles.length} corpus files...`);

  const cdataStats = {
    total: 0,
    filesWithCDATA: 0,
    totalCDATASections: 0,
    cdataByElement: new Map<string, number>(),
    largestCDATA: 0,
    commonPatterns: new Map<string, number>()
  };

  // Increments the tally stored under `key`, starting from zero.
  const bump = (tally: Map<string, number>, key: string): void => {
    tally.set(key, (tally.get(key) ?? 0) + 1);
  };

  // Start tag that sits directly in front of a CDATA section. The captured name
  // may be namespace-qualified (prefix:local); matching only \w+ would record
  // just the prefix for such elements and lump unrelated elements together.
  const openingTagAtEnd = /<([\w.-]+(?::[\w.-]+)?)[^>]*>\s*$/;

  // Only the first 100 files are inspected to keep the run short.
  const sampleSize = Math.min(100, xmlFiles.length);
  const sampledFiles = xmlFiles.slice(0, sampleSize);

  for (const file of sampledFiles) {
    cdataStats.total++;

    try {
      const content = await plugins.fs.readFile(file.path, 'utf8');

      // Find all CDATA sections; the lazy body stops at the first "]]>".
      const cdataSections = Array.from(content.matchAll(/<!\[CDATA\[([\s\S]*?)\]\]>/g));
      if (cdataSections.length === 0) {
        continue;
      }

      cdataStats.filesWithCDATA++;
      cdataStats.totalCDATASections += cdataSections.length;

      for (const match of cdataSections) {
        const cdataContent = match[1];
        cdataStats.largestCDATA = Math.max(cdataStats.largestCDATA, cdataContent.length);

        // Look at up to 100 characters before the section for its parent start tag.
        const beforeCDATA = content.substring(Math.max(0, match.index! - 100), match.index);
        const elementMatch = beforeCDATA.match(openingTagAtEnd);
        if (elementMatch) {
          bump(cdataStats.cdataByElement, elementMatch[1]);
        }

        // Content categories are not exclusive; one section can count in several.
        if (cdataContent.includes('<') && cdataContent.includes('>')) {
          bump(cdataStats.commonPatterns, 'XML/HTML content');
        }
        if (cdataContent.includes('&')) {
          bump(cdataStats.commonPatterns, 'Special characters');
        }
        if (/^[A-Za-z0-9+/=\s]+$/.test(cdataContent.trim())) {
          bump(cdataStats.commonPatterns, 'Base64 data');
        }
      }
    } catch (error) {
      // Deliberately best-effort: skip files that can't be read.
    }
  }

  // With an empty sample the ratio would be 0/0 and print as "NaN%".
  const cdataShare = cdataStats.total > 0
    ? (cdataStats.filesWithCDATA / cdataStats.total * 100).toFixed(1)
    : '0.0';

  console.log('\nCDATA Usage Statistics:');
  console.log(`Files analyzed: ${cdataStats.total}`);
  console.log(`Files with CDATA: ${cdataStats.filesWithCDATA} (${cdataShare}%)`);
  console.log(`Total CDATA sections: ${cdataStats.totalCDATASections}`);
  console.log(`Largest CDATA section: ${cdataStats.largestCDATA} characters`);

  if (cdataStats.cdataByElement.size > 0) {
    console.log('\nCDATA usage by element:');

    // Five most frequent parent elements, highest count first.
    const sortedElements = Array.from(cdataStats.cdataByElement.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 5);

    for (const [element, count] of sortedElements) {
      console.log(` <${element}>: ${count} occurrences`);
    }
  }

  if (cdataStats.commonPatterns.size > 0) {
    console.log('\nCommon CDATA content patterns:');
    for (const [pattern, count] of cdataStats.commonPatterns.entries()) {
      console.log(` ${pattern}: ${count} occurrences`);
    }
  }

  performanceTracker.endOperation('corpus-cdata');
});
|
||
|
|
||
|
// Print the tracker's summary for the operations and metrics recorded above.
const summaryText = performanceTracker.getSummary();
console.log('\n' + summaryText);

// Closing guidance, emitted one line per console.log call.
const bestPracticeLines = [
  '\nCDATA Section Handling Best Practices:',
  '1. Use CDATA for content with many special characters',
  '2. Prefer CDATA for embedded HTML/XML snippets',
  '3. Be aware that CDATA cannot be nested',
  '4. Handle ]]> sequence in content by splitting sections',
  '5. Remember CDATA is not allowed in attributes',
  '6. Consider performance impact for large documents',
  '7. Use for base64 data and complex text content',
  '8. Preserve CDATA sections in round-trip operations'
];
for (const line of bestPracticeLines) {
  console.log(line);
}
|
||
|
});
|
||
|
|
||
|
// Start the tap runner; presumably this is what executes the PARSE-10 test
// registered above (confirm against the tapbundle documentation).
tap.start();
|