einvoice/test/suite/einvoice_encoding/test.enc-08.mixed-content.ts
2025-05-25 19:45:37 +00:00

462 lines
16 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as plugins from '../plugins.js';
import { EInvoice } from '../../../ts/index.js';
import { CorpusLoader } from '../corpus.loader.js';
import { PerformanceTracker } from '../performance.tracker.js';
tap.test('ENC-08: Mixed Content Encoding - should handle mixed content (text and elements) correctly', async (t) => {
// ENC-08: Verify proper encoding of mixed content scenarios
// This test ensures text nodes, elements, CDATA, and comments are properly encoded together
const performanceTracker = new PerformanceTracker('ENC-08: Mixed Content');
const corpusLoader = new CorpusLoader();
t.test('Basic mixed content', async () => {
  // The clock starts before the fixture is built, so the measurement spans the whole sub-test.
  const startedAt = performance.now();

  // Fixture: <Note> elements whose children interleave plain text with inline elements.
  const sourceXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MIXED-BASIC-001</ID>
<Note>
This invoice includes <emphasis>important</emphasis> payment terms:
<term>Net 30 days</term> with <percentage>2%</percentage> early payment discount.
Please pay by <date>2025-02-25</date>.
</Note>
<PaymentTerms>
<Note>
Payment due in <days>30</days> days.
<condition>If paid within <days>10</days> days: <discount>2%</discount> discount</condition>
<condition>If paid after <days>30</days> days: <penalty>1.5%</penalty> interest</condition>
</Note>
</PaymentTerms>
<InvoiceLine>
<Note>
Item includes <quantity>10</quantity> units of <product>Widget A</product>
at <price currency="EUR">€9.99</price> each.
Total: <total currency="EUR">€99.90</total>
</Note>
</InvoiceLine>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(sourceXml);
  const serialized = invoice.getXmlString();

  // Fragments that must appear in the output, checked in this order.
  const expectedFragments = [
    // Text runs and inline elements of the first <Note>
    'This invoice includes',
    '<emphasis>important</emphasis>',
    'payment terms:',
    '<term>Net 30 days</term>',
    'with',
    '<percentage>2%</percentage>',
    'Please pay by',
    '<date>2025-02-25</date>',
    // Nested mixed content inside <condition>
    'If paid within',
    '<days>10</days>',
    'days:',
    '<discount>2%</discount>',
  ];
  for (const fragment of expectedFragments) {
    expect(serialized).toContain(fragment);
  }

  performanceTracker.addMeasurement('basic-mixed', performance.now() - startedAt);
});
t.test('Mixed content with special characters', async () => {
  const startedAt = performance.now();

  // Fixture: mixed content combining non-ASCII characters (€, ü, ö, §, ≥, →, ×)
  // with the predefined XML entities &amp;, &lt; and &gt;.
  const sourceXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MIXED-SPECIAL-001</ID>
<Note>
Price: <amount>100.00</amount> € (VAT <percentage>19%</percentage> = <vat>19.00</vat> €)
Total: <total>119.00</total> € for <company>Müller &amp; Söhne GmbH</company>
</Note>
<DocumentReference>
<DocumentDescription>
See contract <ref>§12.3</ref> for terms &amp; conditions.
<important>Payment &lt; 30 days</important> required.
Contact: <email>info@müller-söhne.de</email>
</DocumentDescription>
</DocumentReference>
<PaymentTerms>
<Note>
<condition type="discount">≥ 100 items → 5% discount</condition>
<condition type="penalty">&gt; 30 days → 1.5% interest</condition>
<formula>Total = Price × Quantity × (1 + VAT%)</formula>
</Note>
</PaymentTerms>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(sourceXml);
  const serialized = invoice.getXmlString();

  // Entities are expected in their escaped form; non-ASCII characters are expected literally.
  const expectedFragments = [
    'Price:',
    '€',
    'Müller &amp; Söhne GmbH',
    '§12.3',
    'terms &amp; conditions',
    '&lt; 30 days',
    'info@müller-söhne.de',
    '≥ 100 items → 5% discount',
    '&gt; 30 days → 1.5% interest',
    '×',
  ];
  for (const fragment of expectedFragments) {
    expect(serialized).toContain(fragment);
  }

  performanceTracker.addMeasurement('special-mixed', performance.now() - startedAt);
});
t.test('Mixed content with CDATA sections', async () => {
  const startedAt = performance.now();

  // Fixture: CDATA sections placed between ordinary text nodes and child elements.
  const sourceXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MIXED-CDATA-001</ID>
<Note>
Regular text before CDATA.
<![CDATA[This section contains <unescaped> tags & special chars: < > & " ']]>
Text after CDATA with <element>nested element</element>.
</Note>
<AdditionalDocumentReference>
<DocumentDescription>
HTML content example:
<![CDATA[
<html>
<body>
<h1>Invoice Details</h1>
<p>Amount: €100.00</p>
<p>VAT: 19%</p>
</body>
</html>
]]>
End of description.
</DocumentDescription>
</AdditionalDocumentReference>
<PaymentTerms>
<Note>
Formula: <formula>price * quantity</formula>
<![CDATA[JavaScript: if (amount > 100) { discount = 5%; }]]>
Applied to all items.
</Note>
</PaymentTerms>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(sourceXml);
  const serialized = invoice.getXmlString();

  // Text and elements around the CDATA sections must be present in the output.
  const surroundingFragments = [
    'Regular text before CDATA',
    'Text after CDATA',
    '<element>nested element</element>',
  ];
  for (const fragment of surroundingFragments) {
    expect(serialized).toContain(fragment);
  }

  // Two outcomes are accepted for the CDATA payload itself:
  // the output contains no "CDATA" marker, in which case the payload must show up as text
  // (escaped or raw), or the marker is present, in which case both delimiters must be present.
  const mentionsCdata = serialized.includes('CDATA');
  if (!mentionsCdata) {
    expect(serialized).toMatch(/&lt;unescaped&gt;|<unescaped>/);
  } else {
    expect(serialized).toContain('<![CDATA[');
    expect(serialized).toContain(']]>');
  }

  performanceTracker.addMeasurement('cdata-mixed', performance.now() - startedAt);
});
t.test('Mixed content with comments', async () => {
  const startedAt = performance.now();

  // Fixture: XML comments scattered between text nodes and elements.
  const sourceXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MIXED-COMMENTS-001</ID>
<Note>
<!-- Start of payment terms -->
Payment is due in <days>30</days> days.
<!-- Discount information follows -->
<discount>Early payment: 2% if paid within 10 days</discount>
<!-- End of payment terms -->
</Note>
<DocumentReference>
<DocumentDescription>
See attachment <!-- PDF document --> for details.
<attachment>invoice.pdf</attachment> <!-- 2 pages -->
Contact <!-- via email -->: <email>info@example.com</email>
</DocumentDescription>
</DocumentReference>
<InvoiceLine>
<!-- Line item 1 -->
<Note>
Product: <name>Widget</name> <!-- Best seller -->
Quantity: <qty>10</qty> <!-- In stock -->
Price: <price>9.99</price> <!-- EUR -->
</Note>
</InvoiceLine>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(sourceXml);
  const serialized = invoice.getXmlString();

  // Only text and elements are asserted; the comments themselves may or may not
  // be kept, so nothing here depends on them.
  const expectedFragments = [
    'Payment is due in',
    '<days>30</days>',
    'days.',
    '<discount>Early payment: 2% if paid within 10 days</discount>',
    'See attachment',
    'for details.',
    '<attachment>invoice.pdf</attachment>',
    'Contact',
    '<email>info@example.com</email>',
  ];
  for (const fragment of expectedFragments) {
    expect(serialized).toContain(fragment);
  }

  performanceTracker.addMeasurement('comments-mixed', performance.now() - startedAt);
});
t.test('Whitespace preservation in mixed content', async () => {
  const startedAt = performance.now();

  // Fixture: text spanning several lines, an xml:space="preserve" note and empty <break/> elements.
  const sourceXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MIXED-WHITESPACE-001</ID>
<Note>Text with multiple spaces and
newlines should be preserved.
<element>Indented element</element>
More text with tabs between words.
</Note>
<PaymentTerms>
<Note xml:space="preserve"> Leading spaces
<term>Net 30</term> Trailing spaces
Middle spaces preserved.
End with spaces </Note>
</PaymentTerms>
<DocumentReference>
<DocumentDescription>Line 1
<break/>
Line 2
<break/>
Line 3</DocumentDescription>
</DocumentReference>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(sourceXml);
  const serialized = invoice.getXmlString();

  // Whitespace handling varies by implementation, so only short, whitespace-insensitive
  // fragments are required.
  const expectedFragments = [
    'Text with',
    'spaces',
    '<element>Indented element</element>',
    'More text with',
    'words',
  ];
  for (const fragment of expectedFragments) {
    expect(serialized).toContain(fragment);
  }

  // The stricter check only applies when the xml:space="preserve" attribute is present in the output.
  const keepsPreserveAttribute = serialized.includes('xml:space="preserve"');
  if (keepsPreserveAttribute) {
    expect(serialized).toMatch(/Leading spaces|^\s+Leading/m);
  }

  performanceTracker.addMeasurement('whitespace-mixed', performance.now() - startedAt);
});
t.test('Deeply nested mixed content', async () => {
  const startedAt = performance.now();

  // Fixture: mixed content nested several element levels deep.
  const sourceXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MIXED-NESTED-001</ID>
<Note>
Level 1: Invoice for <customer>
<name>ABC Corp</name> (Customer ID: <id>C-12345</id>)
<address>
Located at <street>123 Main St</street>,
<city>New York</city>, <state>NY</state> <zip>10001</zip>
</address>
</customer> dated <date>2025-01-25</date>.
</Note>
<PaymentTerms>
<Note>
<terms>
Standard terms: <standard>
Net <days>30</days> days from <reference>
invoice date (<date>2025-01-25</date>)
</reference>
</standard>
<special>
Special conditions: <condition num="1">
For orders &gt; <amount currency="EUR">€1000</amount>:
<discount>5%</discount> discount
</condition>
</special>
</terms>
</Note>
</PaymentTerms>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(sourceXml);
  const serialized = invoice.getXmlString();

  // Only the first <Note> (customer and address) is asserted; the nested <terms>
  // structure is loaded but not checked.
  const expectedFragments = [
    'Level 1: Invoice for',
    '<customer>',
    '<name>ABC Corp</name>',
    '(Customer ID:',
    '<id>C-12345</id>',
    'Located at',
    '<street>123 Main St</street>',
    '<city>New York</city>',
    '<state>NY</state>',
    '<zip>10001</zip>',
    'dated',
    '<date>2025-01-25</date>',
  ];
  for (const fragment of expectedFragments) {
    expect(serialized).toContain(fragment);
  }

  performanceTracker.addMeasurement('nested-mixed', performance.now() - startedAt);
});
t.test('International mixed content', async () => {
  const startedAt = performance.now();

  // Fixture: mixed content in several languages and scripts (Latin with diacritics,
  // Chinese, Japanese, Arabic), including full-width punctuation.
  const sourceXml = `<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2">
<UBLVersionID>2.1</UBLVersionID>
<ID>MIXED-INTL-001</ID>
<Note>
Invoice for <company lang="de">Müller GmbH</company> from <city>München</city>.
Total: <amount currency="EUR">€1.234,56</amount> (inkl. <tax>19% MwSt</tax>).
支付条款：<terms lang="zh">30天内付款</terms>。
お支払い: <terms lang="ja">30日以内</terms>。
</Note>
<PaymentTerms>
<Note>
<multilang>
<en>Payment due in <days>30</days> days</en>
<de>Zahlung fällig in <days>30</days> Tagen</de>
<fr>Paiement dû dans <days>30</days> jours</fr>
<es>Pago debido en <days>30</days> días</es>
</multilang>
</Note>
</PaymentTerms>
<InvoiceLine>
<Note>
Product: <name lang="multi">
<en>Book</en> / <de>Buch</de> / <fr>Livre</fr> /
<zh>书</zh> / <ja>本</ja> / <ar>كتاب</ar>
</name>
Price: <price>€25.00</price> per <unit>Stück</unit>
</Note>
</InvoiceLine>
</Invoice>`;

  const invoice = new EInvoice();
  await invoice.loadFromString(sourceXml);
  const serialized = invoice.getXmlString();

  // Every fragment must appear literally in the output, in its original script.
  const expectedFragments = [
    'Müller GmbH',
    'München',
    '€1.234,56',
    '19% MwSt',
    '支付条款：',
    '30天内付款',
    'お支払い:',
    '30日以内',
    'Zahlung fällig in',
    'Tagen',
    'Paiement dû dans',
    '书',
    '本',
    'كتاب',
    'Stück',
  ];
  for (const fragment of expectedFragments) {
    expect(serialized).toContain(fragment);
  }

  performanceTracker.addMeasurement('intl-mixed', performance.now() - startedAt);
});
t.test('Corpus mixed content analysis', async () => {
  // Scans up to 60 corpus .xml files as raw text (no EInvoice parsing), counts those
  // that match a mixed-content heuristic, and logs a few examples.
  const startTime = performance.now();
  let processedCount = 0;
  let mixedContentCount = 0;
  const mixedContentExamples: string[] = [];

  // Heuristic for mixed content: text, then a child element with text, then more text.
  // Hoisted out of the loop so the literal is built once; it has no `g`/`y` flag,
  // so `exec` keeps no state between files.
  const mixedPattern = />([^<]+)<[^>]+>[^<]+<\/[^>]+>([^<]+)</;

  const files = await corpusLoader.getAllFiles();
  const xmlFiles = files.filter(f => f.endsWith('.xml'));

  // Sample corpus for mixed content patterns
  const sampleSize = Math.min(60, xmlFiles.length);
  const sample = xmlFiles.slice(0, sampleSize);

  for (const file of sample) {
    try {
      const content = await corpusLoader.readFile(file);
      // readFile may yield a Buffer or a string; normalize to a UTF-8 string.
      const xmlString: string = Buffer.isBuffer(content) ? content.toString('utf8') : content;

      // Run the pattern once and reuse the match for both the counter and the example.
      const match = mixedPattern.exec(xmlString);
      if (match !== null) {
        mixedContentCount++;
        // Keep at most five pattern examples, each truncated to 100 characters.
        if (mixedContentExamples.length < 5) {
          mixedContentExamples.push(`${file}: "${match[0].substring(0, 100)}..."`);
        }
      }

      // Also check for CDATA sections; record only the first file that has one.
      if (xmlString.includes('<![CDATA[')) {
        if (!mixedContentExamples.some(ex => ex.includes('CDATA'))) {
          mixedContentExamples.push(`${file}: Contains CDATA sections`);
        }
      }

      processedCount++;
    } catch (error: unknown) {
      // Best-effort scan: a failing file is logged and skipped, not fatal.
      // Narrow before reading `.message`, since a thrown value need not be an Error.
      const reason = error instanceof Error ? error.message : String(error);
      console.log(`Mixed content parsing issue in ${file}:`, reason);
    }
  }

  console.log(`Mixed content corpus analysis (${processedCount} files):`);
  console.log(`- Files with mixed content patterns: ${mixedContentCount}`);
  if (mixedContentExamples.length > 0) {
    console.log('Mixed content examples:');
    mixedContentExamples.forEach(ex => console.log(` ${ex}`));
  }

  // At least one corpus file must have been read successfully.
  expect(processedCount).toBeGreaterThan(0);

  const elapsed = performance.now() - startTime;
  performanceTracker.addMeasurement('corpus-mixed', elapsed);
});
// Print performance summary
performanceTracker.printSummary();
// Performance assertions
const avgTime = performanceTracker.getAverageTime();
expect(avgTime).toBeLessThan(150); // Mixed content operations may be slightly slower
});
tap.start();