import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';

/**
 * PARSE-10: CDATA section handling.
 *
 * Feeds a series of XML fixtures containing CDATA sections to
 * `EInvoice.fromXmlString` and logs the outcome of every attempt. Outcomes are
 * logged rather than asserted: a parse failure is reported on the console and
 * does not fail the suite.
 *
 * NOTE(review): the XML markup of the fixtures below was reconstructed from the
 * test names, the expected-content strings and the surviving text fragments —
 * confirm against the upstream version of this file before relying on the
 * exact fixture contents.
 */

/** Outcome category of a single parse attempt. */
type TParseStatus = 'parsed' | 'failed' | 'unsupported';

/** Result of {@link attemptParse}. */
interface IParseAttempt {
  /** `unsupported` means the EInvoice instance exposes no `fromXmlString`. */
  status: TParseStatus;
  /** Error message when `status` is `failed`, otherwise an empty string. */
  message: string;
  /** Wall-clock duration of the attempt in milliseconds. */
  elapsedMs: number;
}

/** Turns any thrown value into a printable message. */
const describeError = (error: unknown): string =>
  error instanceof Error ? error.message : String(error);

/**
 * Parses `xml` with a fresh `EInvoice` and reports what happened.
 * Never throws; errors raised by the parser are captured in the result.
 */
const attemptParse = async (xml: string): Promise<IParseAttempt> => {
  const startTime = performance.now();
  try {
    const invoice = new einvoice.EInvoice();
    if (!invoice.fromXmlString) {
      return { status: 'unsupported', message: '', elapsedMs: performance.now() - startTime };
    }
    await invoice.fromXmlString(xml);
    return { status: 'parsed', message: '', elapsedMs: performance.now() - startTime };
  } catch (error) {
    return {
      status: 'failed',
      message: describeError(error),
      elapsedMs: performance.now() - startTime,
    };
  }
};

/** Formats `part / whole` as a percentage, avoiding NaN when `whole` is 0. */
const formatPercent = (part: number, whole: number): string =>
  whole === 0 ? 'n/a' : `${((part / whole) * 100).toFixed(1)}%`;

/** Adds one to the counter stored under `key`. */
const increment = (counter: Map<string, number>, key: string): void => {
  counter.set(key, (counter.get(key) ?? 0) + 1);
};

tap.test('PARSE-10: CDATA Section Handling - Process CDATA sections correctly', async (t) => {
  const performanceTracker = new PerformanceTracker('PARSE-10');

  await t.test('Basic CDATA sections', async () => {
    performanceTracker.startOperation('basic-cdata');

    const cdataTests = [
      {
        name: 'Simple CDATA content',
        xml: `<?xml version="1.0"?>
<invoice>
  <notes><![CDATA[This is plain text content]]></notes>
</invoice>`,
        expectedContent: 'This is plain text content',
        description: 'Basic CDATA section',
      },
      {
        name: 'CDATA with special characters',
        xml: `<?xml version="1.0"?>
<invoice>
  <description><![CDATA[Price < 100 & quantity > 5]]></description>
</invoice>`,
        expectedContent: 'Price < 100 & quantity > 5',
        description: 'Special characters preserved',
      },
      {
        name: 'CDATA with XML-like content',
        xml: `<?xml version="1.0"?>
<invoice>
  <htmlContent><![CDATA[<html><body>This is HTML content</body></html>]]></htmlContent>
</invoice>`,
        expectedContent: '<html><body>This is HTML content</body></html>',
        description: 'XML markup as text',
      },
      {
        name: 'Empty CDATA section',
        xml: `<?xml version="1.0"?>
<invoice>
  <empty><![CDATA[]]></empty>
</invoice>`,
        expectedContent: '',
        description: 'Empty CDATA is valid',
      },
      {
        name: 'CDATA with line breaks',
        xml: `<?xml version="1.0"?>
<invoice>
  <address><![CDATA[Line 1
Line 2
Line 3]]></address>
</invoice>`,
        expectedContent: 'Line 1\nLine 2\nLine 3',
        description: 'Preserves formatting',
      },
    ];

    for (const test of cdataTests) {
      console.log(`${test.name}:`);
      console.log(`  Description: ${test.description}`);
      console.log(`  Expected content: "${test.expectedContent}"`);

      const attempt = await attemptParse(test.xml);
      if (attempt.status === 'parsed') {
        console.log('  ✓ CDATA parsed successfully');
      } else if (attempt.status === 'unsupported') {
        console.log('  ⚠️  Cannot test without fromXmlString');
      } else {
        console.log(`  ✗ Error: ${attempt.message}`);
      }

      performanceTracker.recordMetric('cdata-parsing', attempt.elapsedMs);
    }

    performanceTracker.endOperation('basic-cdata');
  });

  await t.test('CDATA edge cases', async () => {
    performanceTracker.startOperation('cdata-edge-cases');

    const edgeCases = [
      {
        name: 'Nested CDATA-like content',
        // "]]>" inside the text is split across two sections so it cannot
        // terminate the outer section early.
        xml: `<?xml version="1.0"?>
<invoice>
  <note><![CDATA[Outer <![CDATA[inner]]]]><![CDATA[> outer again]]></note>
</invoice>`,
        note: 'CDATA end sequence in content needs escaping',
        challenge: 'Cannot nest CDATA sections',
      },
      {
        name: 'CDATA end sequence in content',
        xml: `<?xml version="1.0"?>
<invoice>
  <note><![CDATA[Text with ]]]]><![CDATA[> end sequence]]></note>
</invoice>`,
        note: 'End sequence must be escaped',
        challenge: 'Split ]]> into ]] and >',
      },
      {
        name: 'Multiple CDATA sections',
        xml: `<?xml version="1.0"?>
<invoice>
  <note><![CDATA[Part 1]]> Normal text <![CDATA[Part 2]]></note>
</invoice>`,
        note: 'Multiple CDATA in same element',
        challenge: 'Proper content concatenation',
      },
      {
        name: 'CDATA in attributes (invalid)',
        xml: `<?xml version="1.0"?>
<invoice>
  <item description="<![CDATA[Not allowed]]>">Content</item>
</invoice>`,
        note: 'CDATA not allowed in attributes',
        challenge: 'Should cause parse error',
      },
      {
        name: 'Whitespace around CDATA',
        xml: `<?xml version="1.0"?>
<invoice>
  <note>
    <![CDATA[Content]]>
  </note>
</invoice>`,
        note: 'Whitespace outside CDATA preserved',
        challenge: 'Handle mixed content correctly',
      },
    ];

    for (const test of edgeCases) {
      console.log(`\n${test.name}:`);
      console.log(`  Note: ${test.note}`);
      console.log(`  Challenge: ${test.challenge}`);

      const attempt = await attemptParse(test.xml);
      if (attempt.status === 'parsed') {
        console.log('  Result: Parsed successfully');
      } else if (attempt.status === 'unsupported') {
        console.log('  ⚠️  Cannot test without fromXmlString');
      } else {
        console.log(`  Result: ${attempt.message}`);
      }

      performanceTracker.recordMetric('edge-case', attempt.elapsedMs);
    }

    performanceTracker.endOperation('cdata-edge-cases');
  });

  await t.test('CDATA vs escaped content comparison', async () => {
    performanceTracker.startOperation('cdata-vs-escaped');

    // Each entry carries the same logical content in both encodings.
    const comparisonTests = [
      {
        name: 'Special characters',
        cdata: '<![CDATA[Price < 100 & quantity > 5]]>',
        escaped: 'Price &lt; 100 &amp; quantity &gt; 5',
        content: 'Price < 100 & quantity > 5',
      },
      {
        name: 'HTML snippet',
        cdata: '<![CDATA[<div class="invoice">Content</div>]]>',
        escaped: '&lt;div class="invoice"&gt;Content&lt;/div&gt;',
        content: '<div class="invoice">Content</div>',
      },
      {
        name: 'Code snippet',
        cdata: '<![CDATA[if (a && b) { return "result"; }]]>',
        escaped: 'if (a &amp;&amp; b) { return "result"; }',
        content: 'if (a && b) { return "result"; }',
      },
      {
        name: 'Quote marks',
        cdata: '<![CDATA[He said "Hello" and she said \'Hi\']]>',
        escaped: 'He said &quot;Hello&quot; and she said &apos;Hi&apos;',
        content: 'He said "Hello" and she said \'Hi\'',
      },
    ];

    console.log('CDATA vs Escaped Content:');

    for (const test of comparisonTests) {
      console.log(`\n${test.name}:`);
      console.log(`  Expected content: "${test.content}"`);
      console.log(`  CDATA approach: More readable, preserves content as-is`);
      console.log(`  Escaped approach: Standard XML, but less readable`);

      // Compare sizes
      const cdataSize = Buffer.byteLength(test.cdata, 'utf8');
      const escapedSize = Buffer.byteLength(test.escaped, 'utf8');

      console.log(`  Size comparison: CDATA=${cdataSize}B, Escaped=${escapedSize}B`);
      if (cdataSize < escapedSize) {
        console.log(`  CDATA is ${escapedSize - cdataSize} bytes smaller`);
      } else if (cdataSize > escapedSize) {
        console.log(`  Escaped is ${cdataSize - escapedSize} bytes smaller`);
      } else {
        console.log('  Both approaches are the same size');
      }
    }

    performanceTracker.endOperation('cdata-vs-escaped');
  });

  await t.test('CDATA in e-invoice contexts', async () => {
    performanceTracker.startOperation('einvoice-cdata');

    const einvoiceUseCases = [
      {
        name: 'Terms and conditions',
        xml: `<?xml version="1.0"?>
<Invoice>
  <PaymentTerms>
    <Note><![CDATA[
Payment Terms & Conditions:
1. Payment due within 30 days
2. Late payment fee: 2% per month
3. Disputes must be raised within 7 days
    ]]></Note>
  </PaymentTerms>
</Invoice>`,
        useCase: 'Legal text with special characters',
      },
      {
        name: 'Product description with HTML',
        xml: `<?xml version="1.0"?>
<Invoice>
  <InvoiceLine>
    <Item>
      <Description><![CDATA[
<h3>Premium Widget</h3>
<ul>
  <li>Dimension: 10cm x 5cm x 3cm</li>
  <li>Weight: < 500g</li>
  <li>Price: €99.99</li>
</ul>
      ]]></Description>
    </Item>
  </InvoiceLine>
</Invoice>`,
        useCase: 'Rich text product descriptions',
      },
      {
        name: 'Base64 encoded attachment',
        xml: `<?xml version="1.0"?>
<Invoice>
  <AdditionalDocumentReference>
    <Attachment>
      <EmbeddedDocumentBinaryObject mimeCode="application/pdf"><![CDATA[JVBERi0xLjQKJeLjz9MKNCAwIG9iago8PC9GaWx0ZXIvRmxhdGVEZWNvZGU+PgpzdHJlYW0K]]></EmbeddedDocumentBinaryObject>
    </Attachment>
  </AdditionalDocumentReference>
</Invoice>`,
        useCase: 'Binary data encoding',
      },
      {
        name: 'Custom XML extensions',
        xml: `<?xml version="1.0"?>
<Invoice>
  <UBLExtensions>
    <UBLExtension>
      <ExtensionContent><![CDATA[
<CustomData xmlns="http://example.com/custom">
  <Field1>Value with < and > chars</Field1>
  <Field2>Complex & data</Field2>
</CustomData>
      ]]></ExtensionContent>
    </UBLExtension>
  </UBLExtensions>
</Invoice>`,
        useCase: 'Embedded XML without namespace conflicts',
      },
    ];

    for (const useCase of einvoiceUseCases) {
      console.log(`\n${useCase.name}:`);
      console.log(`  Use case: ${useCase.useCase}`);

      const attempt = await attemptParse(useCase.xml);
      if (attempt.status === 'parsed') {
        console.log('  ✓ Valid e-invoice usage of CDATA');
      } else if (attempt.status === 'unsupported') {
        console.log('  ⚠️  Cannot test without fromXmlString');
      } else {
        console.log(`  ⚠️  Parse result: ${attempt.message}`);
      }

      performanceTracker.recordMetric('einvoice-usecase', attempt.elapsedMs);
    }

    performanceTracker.endOperation('einvoice-cdata');
  });

  await t.test('CDATA performance impact', async () => {
    performanceTracker.startOperation('cdata-performance');

    /** Builds a document with `cdataCount` fields of `cdataSize` chars, each wrapped in CDATA. */
    const generateInvoiceWithCDATA = (cdataCount: number, cdataSize: number): string => {
      const content = 'X'.repeat(cdataSize); // loop-invariant, built once
      let xml = '<?xml version="1.0"?>\n<invoice>\n';
      for (let i = 0; i < cdataCount; i++) {
        xml += `  <field${i}><![CDATA[${content}]]></field${i}>\n`;
      }
      xml += '</invoice>';
      return xml;
    };

    /** Builds a document with the same field count whose text uses entity escaping instead. */
    const generateInvoiceEscaped = (fieldCount: number, contentSize: number): string => {
      // Content with characters that need escaping; '&' must be replaced first
      // so the ampersands introduced by the other entities are not re-escaped.
      const content = 'X&<>X'.repeat(Math.floor(contentSize / 5));
      const escaped = content
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
      let xml = '<?xml version="1.0"?>\n<invoice>\n';
      for (let i = 0; i < fieldCount; i++) {
        xml += `  <field${i}>${escaped}</field${i}>\n`;
      }
      xml += '</invoice>';
      return xml;
    };

    console.log('Performance comparison:');

    const testConfigs = [
      { fields: 10, contentSize: 100 },
      { fields: 50, contentSize: 500 },
      { fields: 100, contentSize: 1000 },
    ];

    for (const config of testConfigs) {
      console.log(`\n${config.fields} fields, ${config.contentSize} chars each:`);

      // Test CDATA version
      const cdataXml = generateInvoiceWithCDATA(config.fields, config.contentSize);
      const cdataSize = Buffer.byteLength(cdataXml, 'utf8');
      const cdataAttempt = await attemptParse(cdataXml);
      const cdataTime = cdataAttempt.elapsedMs;

      // Test escaped version
      const escapedXml = generateInvoiceEscaped(config.fields, config.contentSize);
      const escapedSize = Buffer.byteLength(escapedXml, 'utf8');
      const escapedAttempt = await attemptParse(escapedXml);
      const escapedTime = escapedAttempt.elapsedMs;

      console.log(`  CDATA: ${cdataTime.toFixed(2)}ms (${(cdataSize / 1024).toFixed(1)}KB)`);
      console.log(`  Escaped: ${escapedTime.toFixed(2)}ms (${(escapedSize / 1024).toFixed(1)}KB)`);
      // Guard against a zero baseline, which would print NaN or Infinity.
      console.log(`  Difference: ${formatPercent(escapedTime - cdataTime, cdataTime)}`);

      // Timings of failed parses are still reported, but flagged so they are
      // not mistaken for successful-parse measurements.
      if (cdataAttempt.status === 'failed') {
        console.log(`  ⚠️  CDATA document did not parse: ${cdataAttempt.message}`);
      }
      if (escapedAttempt.status === 'failed') {
        console.log(`  ⚠️  Escaped document did not parse: ${escapedAttempt.message}`);
      }

      performanceTracker.recordMetric(`perf-${config.fields}fields`, cdataTime);
    }

    performanceTracker.endOperation('cdata-performance');
  });

  await t.test('Corpus CDATA usage analysis', async () => {
    performanceTracker.startOperation('corpus-cdata');

    const corpusLoader = new CorpusLoader();
    const xmlFiles = await corpusLoader.getFiles(/\.(xml|ubl|cii)$/);

    console.log(`\nAnalyzing CDATA usage in ${xmlFiles.length} corpus files...`);

    const cdataStats = {
      total: 0,
      skipped: 0,
      filesWithCDATA: 0,
      totalCDATASections: 0,
      cdataByElement: new Map<string, number>(),
      largestCDATA: 0,
      commonPatterns: new Map<string, number>(),
    };

    // Only used with matchAll, which works on a copy of the regex, so the
    // shared /g instance never carries lastIndex state between files.
    const cdataSectionPattern = /<!\[CDATA\[([\s\S]*?)\]\]>/g;
    // Qualified names (prefix:local) are captured whole, not just the prefix.
    const openingTagPattern = /<([\w:.-]+)[^>]*>\s*$/;
    const base64Pattern = /^[A-Za-z0-9+/=\s]+$/;

    const sampleSize = Math.min(100, xmlFiles.length);
    const sampledFiles = xmlFiles.slice(0, sampleSize);

    for (const file of sampledFiles) {
      cdataStats.total++;

      let content: string;
      try {
        content = await plugins.fs.readFile(file.path, 'utf8');
      } catch (error) {
        // Unreadable files are skipped, but counted and reported.
        cdataStats.skipped++;
        console.log(`  Skipping unreadable file ${file.path}: ${describeError(error)}`);
        continue;
      }

      // Find all CDATA sections
      const cdataSections = Array.from(content.matchAll(cdataSectionPattern));
      if (cdataSections.length === 0) {
        continue;
      }

      cdataStats.filesWithCDATA++;
      cdataStats.totalCDATASections += cdataSections.length;

      // Analyze each CDATA section
      for (const match of cdataSections) {
        const cdataContent = match[1];
        cdataStats.largestCDATA = Math.max(cdataStats.largestCDATA, cdataContent.length);

        // Try to find the parent element in the 100 characters before the section
        const matchStart = match.index ?? 0;
        const beforeCDATA = content.substring(Math.max(0, matchStart - 100), matchStart);
        const elementMatch = beforeCDATA.match(openingTagPattern);
        if (elementMatch) {
          increment(cdataStats.cdataByElement, elementMatch[1]);
        }

        // Detect common patterns
        if (cdataContent.includes('<') && cdataContent.includes('>')) {
          increment(cdataStats.commonPatterns, 'XML/HTML content');
        }
        if (cdataContent.includes('&')) {
          increment(cdataStats.commonPatterns, 'Special characters');
        }
        if (base64Pattern.test(cdataContent.trim())) {
          increment(cdataStats.commonPatterns, 'Base64 data');
        }
      }
    }

    console.log('\nCDATA Usage Statistics:');
    console.log(`Files analyzed: ${cdataStats.total}`);
    if (cdataStats.skipped > 0) {
      console.log(`Files skipped (unreadable): ${cdataStats.skipped}`);
    }
    console.log(
      `Files with CDATA: ${cdataStats.filesWithCDATA} (${formatPercent(cdataStats.filesWithCDATA, cdataStats.total)})`,
    );
    console.log(`Total CDATA sections: ${cdataStats.totalCDATASections}`);
    console.log(`Largest CDATA section: ${cdataStats.largestCDATA} characters`);

    if (cdataStats.cdataByElement.size > 0) {
      console.log('\nCDATA usage by element:');
      const sortedElements = Array.from(cdataStats.cdataByElement.entries())
        .sort((a, b) => b[1] - a[1])
        .slice(0, 5);

      for (const [element, count] of sortedElements) {
        console.log(`  <${element}>: ${count} occurrences`);
      }
    }

    if (cdataStats.commonPatterns.size > 0) {
      console.log('\nCommon CDATA content patterns:');
      for (const [pattern, count] of cdataStats.commonPatterns.entries()) {
        console.log(`  ${pattern}: ${count} occurrences`);
      }
    }

    performanceTracker.endOperation('corpus-cdata');
  });

  // Performance summary
  console.log('\n' + performanceTracker.getSummary());

  // CDATA best practices
  console.log('\nCDATA Section Handling Best Practices:');
  console.log('1. Use CDATA for content with many special characters');
  console.log('2. Prefer CDATA for embedded HTML/XML snippets');
  console.log('3. Be aware that CDATA cannot be nested');
  console.log('4. Handle ]]> sequence in content by splitting sections');
  console.log('5. Remember CDATA is not allowed in attributes');
  console.log('6. Consider performance impact for large documents');
  console.log('7. Use for base64 data and complex text content');
  console.log('8. Preserve CDATA sections in round-trip operations');
});

tap.start();