import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-10: CDATA Section Handling - Process CDATA sections correctly', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-10');
// Sub-test: feeds small documents containing a single CDATA section to the
// parser and logs whether each one is accepted. Parse errors are logged rather
// than rethrown, so this sub-test reports results without failing the run.
await t.test('Basic CDATA sections', async () => {
  performanceTracker.startOperation('basic-cdata');

  // `expectedContent` is the character data a conforming XML parser yields for
  // the element wrapping the CDATA section; it is only logged, not asserted.
  // NOTE(review): element names in these fixtures are illustrative.
  const cdataTests = [
    {
      name: 'Simple CDATA content',
      xml: `<?xml version="1.0"?>
<invoice>
  <notes><![CDATA[This is plain text content]]></notes>
</invoice>`,
      expectedContent: 'This is plain text content',
      description: 'Basic CDATA section'
    },
    {
      name: 'CDATA with special characters',
      xml: `<?xml version="1.0"?>
<invoice>
  <description><![CDATA[Price < 100 & quantity > 5]]></description>
</invoice>`,
      expectedContent: 'Price < 100 & quantity > 5',
      description: 'Special characters preserved'
    },
    {
      name: 'CDATA with XML-like content',
      xml: `<?xml version="1.0"?>
<invoice>
  <htmlContent><![CDATA[<html><body><p>This is HTML content</p></body></html>]]></htmlContent>
</invoice>`,
      expectedContent: '<html><body><p>This is HTML content</p></body></html>',
      description: 'XML markup as text'
    },
    {
      name: 'Empty CDATA section',
      xml: `<?xml version="1.0"?>
<invoice>
  <empty><![CDATA[]]></empty>
</invoice>`,
      expectedContent: '',
      description: 'Empty CDATA is valid'
    },
    {
      name: 'CDATA with line breaks',
      xml: `<?xml version="1.0"?>
<invoice>
  <address><![CDATA[Line 1
Line 2
Line 3]]></address>
</invoice>`,
      expectedContent: 'Line 1\nLine 2\nLine 3',
      description: 'Preserves formatting'
    }
  ];

  for (const test of cdataTests) {
    const startTime = performance.now();
    console.log(`${test.name}:`);
    console.log(` Description: ${test.description}`);
    console.log(` Expected content: "${test.expectedContent}"`);
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(test.xml);
        console.log(' ✓ CDATA parsed successfully');
      } else {
        console.log(' ⚠️ Cannot test without fromXmlString');
      }
    } catch (error) {
      console.log(` ✗ Error: ${error.message}`);
    }
    // The metric covers the logging above as well as the parse attempt.
    performanceTracker.recordMetric('cdata-parsing', performance.now() - startTime);
  }

  performanceTracker.endOperation('basic-cdata');
});
// Sub-test: runs awkward or deliberately malformed CDATA constructs through the
// parser and logs the outcome of each. Errors are logged, never rethrown, since
// some of these inputs are expected to be rejected.
await t.test('CDATA edge cases', async () => {
  performanceTracker.startOperation('cdata-edge-cases');

  // NOTE(review): element names in these fixtures are illustrative; each
  // document is built to exhibit the situation described by `note`/`challenge`.
  const edgeCases = [
    {
      name: 'Nested CDATA-like content',
      // The inner "<![CDATA[" is plain text; the first "]]>" ends the section.
      xml: `<?xml version="1.0"?>
<invoice>
  <code><![CDATA[Outer text with <![CDATA[ start marker inside]]></code>
</invoice>`,
      note: 'CDATA end sequence in content needs escaping',
      challenge: 'Cannot nest CDATA sections'
    },
    {
      name: 'CDATA end sequence in content',
      // "]]>" is carried as "]]" + ">" across two adjacent sections.
      xml: `<?xml version="1.0"?>
<invoice>
  <script><![CDATA[Data containing ]]]]><![CDATA[> end sequence]]></script>
</invoice>`,
      note: 'End sequence must be escaped',
      challenge: 'Split ]]> into ]] and >'
    },
    {
      name: 'Multiple CDATA sections',
      xml: `<?xml version="1.0"?>
<invoice>
  <content>
    <![CDATA[Part 1]]>
    Normal text
    <![CDATA[Part 2]]>
  </content>
</invoice>`,
      note: 'Multiple CDATA in same element',
      challenge: 'Proper content concatenation'
    },
    {
      name: 'CDATA in attributes (invalid)',
      // A literal "<" inside an attribute value is not well-formed XML.
      xml: `<?xml version="1.0"?>
<invoice>
  <item description="<![CDATA[Not allowed]]>">Content</item>
</invoice>`,
      note: 'CDATA not allowed in attributes',
      challenge: 'Should cause parse error'
    },
    {
      name: 'Whitespace around CDATA',
      xml: `<?xml version="1.0"?>
<invoice>
  <padded>   <![CDATA[Content]]>   </padded>
</invoice>`,
      note: 'Whitespace outside CDATA preserved',
      challenge: 'Handle mixed content correctly'
    }
  ];

  for (const test of edgeCases) {
    const startTime = performance.now();
    console.log(`\n${test.name}:`);
    console.log(` Note: ${test.note}`);
    console.log(` Challenge: ${test.challenge}`);
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(test.xml);
        console.log(' Result: Parsed successfully');
      }
    } catch (error) {
      console.log(` Result: ${error.message}`);
    }
    performanceTracker.recordMetric('edge-case', performance.now() - startTime);
  }

  performanceTracker.endOperation('cdata-edge-cases');
});
// Sub-test: for several text samples, compares the UTF-8 byte size of the CDATA
// representation against the entity-escaped representation and logs the result.
// No parsing happens here; the comparison is purely on the literal strings.
await t.test('CDATA vs escaped content comparison', async () => {
  performanceTracker.startOperation('cdata-vs-escaped');

  // For every entry, `cdata` and `escaped` are two XML encodings of `content`.
  const comparisonTests = [
    {
      name: 'Special characters',
      cdata: '<![CDATA[Price < 100 & quantity > 5]]>',
      escaped: 'Price &lt; 100 &amp; quantity &gt; 5',
      content: 'Price < 100 & quantity > 5'
    },
    {
      name: 'HTML snippet',
      cdata: '<![CDATA[<div class="invoice">Content</div>]]>',
      escaped: '&lt;div class="invoice"&gt;Content&lt;/div&gt;',
      content: '<div class="invoice">Content</div>'
    },
    {
      name: 'Code snippet',
      cdata: '<![CDATA[if (a && b) { return "result"; }]]>',
      escaped: 'if (a &amp;&amp; b) { return &quot;result&quot;; }',
      content: 'if (a && b) { return "result"; }'
    },
    {
      name: 'Quote marks',
      cdata: '<![CDATA[He said "Hello" and she said \'Hi\']]>',
      escaped: 'He said &quot;Hello&quot; and she said &apos;Hi&apos;',
      content: 'He said "Hello" and she said \'Hi\''
    }
  ];

  console.log('CDATA vs Escaped Content:');
  for (const test of comparisonTests) {
    console.log(`\n${test.name}:`);
    console.log(` Expected content: "${test.content}"`);
    console.log(` CDATA approach: More readable, preserves content as-is`);
    console.log(` Escaped approach: Standard XML, but less readable`);

    // Compare sizes
    const cdataSize = Buffer.byteLength(test.cdata, 'utf8');
    const escapedSize = Buffer.byteLength(test.escaped, 'utf8');
    console.log(` Size comparison: CDATA=${cdataSize}B, Escaped=${escapedSize}B`);
    if (cdataSize < escapedSize) {
      console.log(` CDATA is ${escapedSize - cdataSize} bytes smaller`);
    } else if (cdataSize > escapedSize) {
      console.log(` Escaped is ${cdataSize - escapedSize} bytes smaller`);
    } else {
      // Avoid reporting "0 bytes smaller" when neither form wins.
      console.log(' Both representations are the same size');
    }
  }

  performanceTracker.endOperation('cdata-vs-escaped');
});
// Sub-test: parses invoice-shaped documents that use CDATA the way real
// e-invoices might (legal text, rich descriptions, attachments, extensions) and
// logs whether each is accepted. Parse errors are logged, not rethrown.
await t.test('CDATA in e-invoice contexts', async () => {
  performanceTracker.startOperation('einvoice-cdata');

  // NOTE(review): these are simplified, UBL-like skeletons rather than complete
  // schema-valid invoices; the surrounding element names are illustrative.
  const einvoiceUseCases = [
    {
      name: 'Terms and conditions',
      xml: `<?xml version="1.0"?>
<Invoice>
  <PaymentTerms>
    <Note><![CDATA[
Payment Terms & Conditions:
1. Payment due within 30 days
2. Late payment fee: 2% per month
3. Disputes must be raised within 7 days
    ]]></Note>
  </PaymentTerms>
</Invoice>`,
      useCase: 'Legal text with special characters'
    },
    {
      name: 'Product description with HTML',
      xml: `<?xml version="1.0"?>
<Invoice>
  <InvoiceLine>
    <Item>
      <Description><![CDATA[
        <h3>Premium Widget</h3>
        <ul>
          <li>Dimension: 10cm x 5cm x 3cm</li>
          <li>Weight: < 500g</li>
          <li>Price: €99.99</li>
        </ul>
      ]]></Description>
    </Item>
  </InvoiceLine>
</Invoice>`,
      useCase: 'Rich text product descriptions'
    },
    {
      name: 'Base64 encoded attachment',
      xml: `<?xml version="1.0"?>
<Invoice>
  <AdditionalDocumentReference>
    <Attachment>
      <EmbeddedDocumentBinaryObject mimeCode="application/pdf"><![CDATA[JVBERi0xLjQKJeLjz9MK]]></EmbeddedDocumentBinaryObject>
    </Attachment>
  </AdditionalDocumentReference>
</Invoice>`,
      useCase: 'Binary data encoding'
    },
    {
      name: 'Custom XML extensions',
      xml: `<?xml version="1.0"?>
<Invoice>
  <UBLExtensions>
    <UBLExtension>
      <ExtensionContent><![CDATA[
        <CustomData xmlns="http://example.com/custom">
          <Field1>Value with < and > chars</Field1>
          <Field2>Complex & data</Field2>
        </CustomData>
      ]]></ExtensionContent>
    </UBLExtension>
  </UBLExtensions>
</Invoice>`,
      useCase: 'Embedded XML without namespace conflicts'
    }
  ];

  for (const useCase of einvoiceUseCases) {
    console.log(`\n${useCase.name}:`);
    console.log(` Use case: ${useCase.useCase}`);
    const startTime = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(useCase.xml);
        console.log(' ✓ Valid e-invoice usage of CDATA');
      }
    } catch (error) {
      console.log(` ⚠️ Parse result: ${error.message}`);
    }
    performanceTracker.recordMetric('einvoice-usecase', performance.now() - startTime);
  }

  performanceTracker.endOperation('einvoice-cdata');
});
// Sub-test: generates documents of increasing size in two flavours (CDATA
// fields vs. entity-escaped fields), times a parse attempt of each, and logs the
// elapsed time, document size and relative difference.
await t.test('CDATA performance impact', async () => {
  performanceTracker.startOperation('cdata-performance');

  // Builds a document with `cdataCount` fields, each holding `cdataSize`
  // characters inside a CDATA section.
  const generateInvoiceWithCDATA = (cdataCount: number, cdataSize: number): string => {
    const content = 'X'.repeat(cdataSize);
    let xml = '<?xml version="1.0"?>\n<invoice>\n';
    for (let i = 0; i < cdataCount; i++) {
      xml += `  <field${i}><![CDATA[${content}]]></field${i}>\n`;
    }
    xml += '</invoice>';
    return xml;
  };

  // Builds a document with `fieldCount` fields whose text needs entity
  // escaping; each field carries roughly `contentSize` unescaped characters.
  const generateInvoiceEscaped = (fieldCount: number, contentSize: number): string => {
    // Content with characters that need escaping; the 5-character unit is
    // repeated a whole number of times.
    const content = 'X&<>X'.repeat(Math.floor(contentSize / 5));
    // '&' must be replaced first so the entities produced below stay intact.
    const escaped = content
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
    let xml = '<?xml version="1.0"?>\n<invoice>\n';
    for (let i = 0; i < fieldCount; i++) {
      xml += `  <field${i}>${escaped}</field${i}>\n`;
    }
    xml += '</invoice>';
    return xml;
  };

  // Returns the milliseconds spent on one parse attempt of `xml`.
  const timeParse = async (xml: string): Promise<number> => {
    const start = performance.now();
    try {
      const invoice = new einvoice.EInvoice();
      if (invoice.fromXmlString) {
        await invoice.fromXmlString(xml);
      }
    } catch (error) {
      // Intentionally ignored: only the elapsed time is of interest here, and
      // a rejected document still counts as a completed parse attempt.
    }
    return performance.now() - start;
  };

  console.log('Performance comparison:');
  const testConfigs = [
    { fields: 10, contentSize: 100 },
    { fields: 50, contentSize: 500 },
    { fields: 100, contentSize: 1000 }
  ];

  for (const config of testConfigs) {
    console.log(`\n${config.fields} fields, ${config.contentSize} chars each:`);

    // Test CDATA version
    const cdataXml = generateInvoiceWithCDATA(config.fields, config.contentSize);
    const cdataSize = Buffer.byteLength(cdataXml, 'utf8');
    const cdataTime = await timeParse(cdataXml);

    // Test escaped version
    const escapedXml = generateInvoiceEscaped(config.fields, config.contentSize);
    const escapedSize = Buffer.byteLength(escapedXml, 'utf8');
    const escapedTime = await timeParse(escapedXml);

    // A zero baseline would turn the percentage into Infinity/NaN.
    const difference = cdataTime > 0
      ? `${((escapedTime - cdataTime) / cdataTime * 100).toFixed(1)}%`
      : 'n/a';

    console.log(` CDATA: ${cdataTime.toFixed(2)}ms (${(cdataSize/1024).toFixed(1)}KB)`);
    console.log(` Escaped: ${escapedTime.toFixed(2)}ms (${(escapedSize/1024).toFixed(1)}KB)`);
    console.log(` Difference: ${difference}`);

    performanceTracker.recordMetric(`perf-${config.fields}fields`, cdataTime);
  }

  performanceTracker.endOperation('cdata-performance');
});
// Sub-test: scans up to 100 corpus files for CDATA sections and logs how many
// files use them, which elements contain them, the largest section seen and a
// rough classification of their content. Purely informational; asserts nothing.
await t.test('Corpus CDATA usage analysis', async () => {
  performanceTracker.startOperation('corpus-cdata');

  const corpusLoader = new CorpusLoader();
  const xmlFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);
  console.log(`\nAnalyzing CDATA usage in ${xmlFiles.length} corpus files...`);

  const cdataStats = {
    total: 0,
    filesWithCDATA: 0,
    totalCDATASections: 0,
    cdataByElement: new Map<string, number>(),
    largestCDATA: 0,
    commonPatterns: new Map<string, number>()
  };

  // Adds one to the counter stored under `key`, creating it when missing.
  const increment = (counts: Map<string, number>, key: string): void => {
    counts.set(key, (counts.get(key) || 0) + 1);
  };

  // Group 1 captures the section body; the lazy quantifier stops at the first
  // "]]>", which is where a CDATA section ends.
  const cdataPattern = /<!\[CDATA\[([\s\S]*?)\]\]>/g;
  // Start tag that immediately precedes the section; the name may carry a
  // namespace prefix (e.g. "cbc:Note"), so ':' '.' and '-' are accepted.
  const parentTagPattern = /<([\w:.-]+)[^>]*>\s*$/;

  const sampleSize = Math.min(100, xmlFiles.length);
  const sampledFiles = xmlFiles.slice(0, sampleSize);

  for (const file of sampledFiles) {
    cdataStats.total++;
    try {
      const content = await plugins.fs.readFile(file.path, 'utf8');

      // Find all CDATA sections
      const cdataSections = Array.from(content.matchAll(cdataPattern));
      if (cdataSections.length === 0) {
        continue;
      }

      cdataStats.filesWithCDATA++;
      cdataStats.totalCDATASections += cdataSections.length;

      // Analyze each CDATA section
      for (const match of cdataSections) {
        const cdataContent = match[1];
        cdataStats.largestCDATA = Math.max(cdataStats.largestCDATA, cdataContent.length);

        // Try to find the parent element in the 100 characters before the section
        const beforeCDATA = content.substring(Math.max(0, match.index! - 100), match.index);
        const elementMatch = beforeCDATA.match(parentTagPattern);
        if (elementMatch) {
          increment(cdataStats.cdataByElement, elementMatch[1]);
        }

        // Detect common patterns (one section may fall into several)
        if (cdataContent.includes('<') && cdataContent.includes('>')) {
          increment(cdataStats.commonPatterns, 'XML/HTML content');
        }
        if (cdataContent.includes('&')) {
          increment(cdataStats.commonPatterns, 'Special characters');
        }
        if (/^[A-Za-z0-9+/=\s]+$/.test(cdataContent.trim())) {
          increment(cdataStats.commonPatterns, 'Base64 data');
        }
      }
    } catch (error) {
      // Skip files that can't be read
    }
  }

  // Guard against an empty corpus, which would otherwise print "NaN%".
  const cdataFileShare = cdataStats.total > 0
    ? (cdataStats.filesWithCDATA / cdataStats.total * 100).toFixed(1)
    : '0.0';

  console.log('\nCDATA Usage Statistics:');
  console.log(`Files analyzed: ${cdataStats.total}`);
  console.log(`Files with CDATA: ${cdataStats.filesWithCDATA} (${cdataFileShare}%)`);
  console.log(`Total CDATA sections: ${cdataStats.totalCDATASections}`);
  console.log(`Largest CDATA section: ${cdataStats.largestCDATA} characters`);

  if (cdataStats.cdataByElement.size > 0) {
    console.log('\nCDATA usage by element:');
    // Five most frequent parent elements, highest count first.
    const sortedElements = Array.from(cdataStats.cdataByElement.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 5);
    for (const [element, count] of sortedElements) {
      console.log(` <${element}>: ${count} occurrences`);
    }
  }

  if (cdataStats.commonPatterns.size > 0) {
    console.log('\nCommon CDATA content patterns:');
    for (const [pattern, count] of cdataStats.commonPatterns.entries()) {
      console.log(` ${pattern}: ${count} occurrences`);
    }
  }

  performanceTracker.endOperation('corpus-cdata');
});
// Report the metrics gathered by the performance tracker.
const trackerSummary = performanceTracker.getSummary();
console.log('\n' + trackerSummary);

// Closing checklist of CDATA recommendations, printed one line at a time
// beneath its heading.
const bestPracticeLines = [
  '\nCDATA Section Handling Best Practices:',
  '1. Use CDATA for content with many special characters',
  '2. Prefer CDATA for embedded HTML/XML snippets',
  '3. Be aware that CDATA cannot be nested',
  '4. Handle ]]> sequence in content by splitting sections',
  '5. Remember CDATA is not allowed in attributes',
  '6. Consider performance impact for large documents',
  '7. Use for base64 data and complex text content',
  '8. Preserve CDATA sections in round-trip operations'
];
for (const line of bestPracticeLines) {
  console.log(line);
}
});
tap.start();