// einvoice/test/suite/einvoice_parsing/test.parse-03.encoding-detection.ts
//
// Repository viewer metadata captured with this file: 554 lines, 19 KiB, TypeScript;
// last change 2025-05-25 19:45:37 +00:00.
import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as einvoice from '../../../ts/index.js';
import * as plugins from '../../plugins.js';
import { CorpusLoader } from '../../helpers/corpus.loader.js';
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
tap.test('PARSE-03: Character Encoding Detection - Detect and handle various character encodings', async (t) => {
const performanceTracker = new PerformanceTracker('PARSE-03');
// Sub-test: read the `encoding` pseudo-attribute out of each sample's XML
// declaration and report whether it agrees with the expected label. The
// comparison ignores case and hyphens. Results are only logged.
await t.test('Encoding declaration detection', async () => {
  performanceTracker.startOperation('declaration-detection');

  // Comparison form of an encoding label: hyphens removed, upper-cased.
  const canonical = (label: string): string => label.replace(/-/g, '').toUpperCase();

  const encodingTests = [
    {
      name: 'UTF-8 declaration',
      xml: '<?xml version="1.0" encoding="UTF-8"?>\n<invoice><id>TEST-001</id></invoice>',
      expectedEncoding: 'UTF-8',
      actualEncoding: 'UTF-8'
    },
    {
      name: 'UTF-16 declaration',
      xml: '<?xml version="1.0" encoding="UTF-16"?>\n<invoice><id>TEST-002</id></invoice>',
      expectedEncoding: 'UTF-16',
      actualEncoding: 'UTF-8' // Mismatch test
    },
    {
      name: 'ISO-8859-1 declaration',
      xml: '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><supplier>Müller</supplier></invoice>',
      expectedEncoding: 'ISO-8859-1',
      actualEncoding: 'ISO-8859-1'
    },
    {
      name: 'Windows-1252 declaration',
      xml: '<?xml version="1.0" encoding="Windows-1252"?>\n<invoice><note>Special chars</note></invoice>',
      expectedEncoding: 'Windows-1252',
      actualEncoding: 'Windows-1252'
    },
    {
      name: 'Case variations',
      xml: '<?xml version="1.0" encoding="utf-8"?>\n<invoice><id>TEST-003</id></invoice>',
      expectedEncoding: 'UTF-8',
      actualEncoding: 'UTF-8'
    },
    {
      name: 'No encoding declaration',
      xml: '<?xml version="1.0"?>\n<invoice><id>TEST-004</id></invoice>',
      expectedEncoding: 'UTF-8', // Default
      actualEncoding: 'UTF-8'
    }
  ];

  encodingTests.forEach(({ name, xml, expectedEncoding }) => {
    const startedAt = performance.now();

    // A missing declaration falls back to UTF-8.
    const declaration = /encoding=["']([^"']+)["']/i.exec(xml);
    const declaredEncoding = (declaration === null ? 'UTF-8' : declaration[1]).toUpperCase();

    console.log(`${name}:`);
    console.log(` Declared: ${declaredEncoding}`);
    console.log(` Expected: ${expectedEncoding}`);

    const agrees = canonical(declaredEncoding) === canonical(expectedEncoding);
    console.log(agrees ? ' ✓ Declaration matches expected encoding' : ' ✗ Declaration mismatch');

    performanceTracker.recordMetric('encoding-detection', performance.now() - startedAt);
  });

  performanceTracker.endOperation('declaration-detection');
});
// Sub-test: prepend each known byte-order mark to a UTF-8 encoded XML string
// and check that the BOM alone identifies the expected encoding. Results are
// only logged.
await t.test('BOM (Byte Order Mark) detection', async () => {
  performanceTracker.startOperation('bom-detection');

  // Known BOM signatures. Order matters: the UTF-32LE mark (FF FE 00 00) starts
  // with the UTF-16LE mark (FF FE), so the 4-byte signatures come first.
  const bomSignatures: ReadonlyArray<{ encoding: string; bytes: readonly number[] }> = [
    { encoding: 'UTF-8', bytes: [0xEF, 0xBB, 0xBF] },
    { encoding: 'UTF-32LE', bytes: [0xFF, 0xFE, 0x00, 0x00] },
    { encoding: 'UTF-32BE', bytes: [0x00, 0x00, 0xFE, 0xFF] },
    { encoding: 'UTF-16LE', bytes: [0xFF, 0xFE] },
    { encoding: 'UTF-16BE', bytes: [0xFE, 0xFF] }
  ];

  // Returns the encoding named by a leading BOM, or 'UTF-8' when none matches.
  // Reading past the end of a short buffer yields undefined, which never equals
  // a signature byte, so no explicit length guard is needed.
  const detectBomEncoding = (buffer: Buffer): string => {
    const hit = bomSignatures.find(({ bytes }) =>
      bytes.every((byte, offset) => buffer[offset] === byte)
    );
    return hit ? hit.encoding : 'UTF-8';
  };

  // Renders bytes as zero-padded upper-case hex so 0x00 prints as "0x00".
  const formatBytes = (bytes: Buffer): string =>
    bytes.length > 0
      ? Array.from(bytes, (b) => '0x' + b.toString(16).toUpperCase().padStart(2, '0')).join(' ')
      : 'None';

  const bomTests = [
    {
      name: 'UTF-8 with BOM',
      bom: Buffer.from([0xEF, 0xBB, 0xBF]),
      encoding: 'UTF-8',
      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-005</id></invoice>'
    },
    {
      name: 'UTF-16 LE BOM',
      bom: Buffer.from([0xFF, 0xFE]),
      encoding: 'UTF-16LE',
      xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-006</id></invoice>'
    },
    {
      name: 'UTF-16 BE BOM',
      bom: Buffer.from([0xFE, 0xFF]),
      encoding: 'UTF-16BE',
      xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-007</id></invoice>'
    },
    {
      name: 'UTF-32 LE BOM',
      bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
      encoding: 'UTF-32LE',
      xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-008</id></invoice>'
    },
    {
      name: 'UTF-32 BE BOM',
      bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
      encoding: 'UTF-32BE',
      xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-009</id></invoice>'
    },
    {
      name: 'No BOM',
      bom: Buffer.from([]),
      encoding: 'UTF-8',
      xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-010</id></invoice>'
    }
  ];

  for (const test of bomTests) {
    const startTime = performance.now();

    // The payload stays UTF-8 on purpose: only the BOM prefix is under test.
    const fullBuffer = Buffer.concat([test.bom, Buffer.from(test.xml, 'utf8')]);
    const detectedEncoding = detectBomEncoding(fullBuffer);

    console.log(`${test.name}:`);
    console.log(` BOM bytes: ${formatBytes(test.bom)}`);
    console.log(` Expected: ${test.encoding}`);
    console.log(` Detected: ${detectedEncoding}`);
    console.log(
      detectedEncoding === test.encoding ? ' ✓ BOM detection correct' : ' ✗ BOM detection failed'
    );

    performanceTracker.recordMetric('bom-detection', performance.now() - startTime);
  }

  performanceTracker.endOperation('bom-detection');
});
await t.test('Heuristic encoding detection', async () => {
performanceTracker.startOperation('heuristic-detection');
/**
 * Layered character-encoding detector for XML byte buffers.
 *
 * Detection order, from most to least reliable:
 *   1. byte-order mark          -> confidence 100, method 'BOM'
 *   2. XML declaration          -> confidence 90,  method 'XML Declaration'
 *   3. byte-pattern heuristics  -> confidence 50-85, method 'Heuristic'
 */
class EncodingDetector {
  /**
   * Known BOM signatures. Order matters: the UTF-32LE mark (FF FE 00 00)
   * starts with the UTF-16LE mark (FF FE), so 4-byte signatures come first.
   * A UTF-16LE document cannot start with BOM + U+0000 because XML forbids
   * NUL, so reading FF FE 00 00 as UTF-32LE is unambiguous for XML input.
   */
  private static readonly BOM_SIGNATURES: ReadonlyArray<{
    encoding: string;
    bytes: readonly number[];
  }> = [
    { encoding: 'UTF-8', bytes: [0xEF, 0xBB, 0xBF] },
    { encoding: 'UTF-32LE', bytes: [0xFF, 0xFE, 0x00, 0x00] },
    { encoding: 'UTF-32BE', bytes: [0x00, 0x00, 0xFE, 0xFF] },
    { encoding: 'UTF-16LE', bytes: [0xFF, 0xFE] },
    { encoding: 'UTF-16BE', bytes: [0xFE, 0xFF] }
  ];

  /** Number of leading bytes inspected by the heuristic pass. */
  private static readonly HEURISTIC_SAMPLE_BYTES = 1000;

  /** Number of leading bytes searched for an XML encoding declaration. */
  private static readonly DECLARATION_SAMPLE_BYTES = 100;

  /**
   * Detects the encoding of `buffer`.
   *
   * @param buffer Raw document bytes; may be empty.
   * @returns The encoding label, a confidence percentage and the detection
   *          layer that produced the answer.
   */
  detectEncoding(buffer: Buffer): { encoding: string; confidence: number; method: string } {
    const bomResult = this.checkBOM(buffer);
    if (bomResult) {
      return { ...bomResult, confidence: 100, method: 'BOM' };
    }

    const declResult = this.checkXmlDeclaration(buffer);
    if (declResult) {
      return { ...declResult, confidence: 90, method: 'XML Declaration' };
    }

    return { ...this.heuristicCheck(buffer), method: 'Heuristic' };
  }

  /** Returns the encoding named by a leading BOM, or null when there is none. */
  private checkBOM(buffer: Buffer): { encoding: string } | null {
    // Indexing past the end yields undefined, which never equals a signature
    // byte, so short buffers simply fail to match.
    const hit = EncodingDetector.BOM_SIGNATURES.find(({ bytes }) =>
      bytes.every((byte, offset) => buffer[offset] === byte)
    );
    return hit ? { encoding: hit.encoding } : null;
  }

  /**
   * Looks for `encoding="..."` within the first bytes of the buffer. The label
   * is returned upper-cased. This only works for ASCII-compatible encodings,
   * because the bytes are read as ASCII.
   */
  private checkXmlDeclaration(buffer: Buffer): { encoding: string } | null {
    const sample = buffer.toString(
      'ascii',
      0,
      Math.min(EncodingDetector.DECLARATION_SAMPLE_BYTES, buffer.length)
    );
    const match = sample.match(/encoding=["']([^"']+)["']/i);
    return match ? { encoding: match[1].toUpperCase() } : null;
  }

  /**
   * Length of the UTF-8 sequence introduced by `lead`, or 0 when `lead` cannot
   * start a sequence (continuation byte, overlong lead C0/C1, or F5..FF).
   */
  private static utf8SequenceLength(lead: number): number {
    if (lead < 0x80) return 1;
    if ((lead & 0xE0) === 0xC0) return lead >= 0xC2 ? 2 : 0;
    if ((lead & 0xF0) === 0xE0) return 3;
    if ((lead & 0xF8) === 0xF0) return lead <= 0xF4 ? 4 : 0;
    return 0;
  }

  /**
   * Guesses the encoding from byte statistics of the leading sample.
   *
   * - more than 30% NUL bytes              -> 'UTF-16' (70)
   * - no bytes above 0x7F                  -> 'UTF-8'  (50, pure ASCII default)
   * - high bytes forming valid UTF-8       -> 'UTF-8'  (85)
   * - high bytes that are not valid UTF-8  -> 'ISO-8859-1' (60)
   *
   * UTF-8 validation is structural (lead byte plus the right number of
   * continuation bytes). It does not reject every overlong or surrogate form.
   */
  private heuristicCheck(buffer: Buffer): { encoding: string; confidence: number } {
    const sampleSize = Math.min(EncodingDetector.HEURISTIC_SAMPLE_BYTES, buffer.length);

    let nullBytes = 0;
    let sawHighByte = false;
    let validUtf8 = true;

    let i = 0;
    while (i < sampleSize) {
      const byte = buffer[i];
      if (byte < 0x80) {
        if (byte === 0) nullBytes++;
        i++;
        continue;
      }

      sawHighByte = true;
      const sequenceLength = EncodingDetector.utf8SequenceLength(byte);

      // Continuation bytes are checked against the whole buffer, not just the
      // sample, so a character straddling the sample edge is not misjudged.
      let wellFormed = sequenceLength > 0 && i + sequenceLength <= buffer.length;
      for (let k = 1; wellFormed && k < sequenceLength; k++) {
        wellFormed = (buffer[i + k] & 0xC0) === 0x80;
      }

      if (wellFormed) {
        i += sequenceLength;
      } else {
        // Advance one byte only, so the bytes after a bad lead are still examined.
        validUtf8 = false;
        i++;
      }
    }

    if (nullBytes > sampleSize * 0.3) {
      return { encoding: 'UTF-16', confidence: 70 };
    }
    if (!sawHighByte) {
      return { encoding: 'UTF-8', confidence: 50 }; // Pure ASCII: default
    }
    if (validUtf8) {
      return { encoding: 'UTF-8', confidence: 85 };
    }
    // High bytes that do not form UTF-8: treat as a single-byte legacy encoding.
    return { encoding: 'ISO-8859-1', confidence: 60 };
  }
}
// Run the detector over four representative buffers and log what it reports.
const detector = new EncodingDetector();

// The same bytes as "<invoice><name>ÄÖÜ</name></invoice>" encoded as ISO-8859-1.
const latin1Sample = Buffer.from(
  '<invoice><name>\u00C4\u00D6\u00DC</name></invoice>',
  'latin1'
);

const testBuffers = [
  {
    name: 'Pure ASCII',
    content: Buffer.from('<?xml version="1.0"?><invoice><id>TEST-011</id></invoice>')
  },
  {
    name: 'UTF-8 with special chars',
    content: Buffer.from('<?xml version="1.0"?><invoice><name>Café €100</name></invoice>')
  },
  { name: 'ISO-8859-1 content', content: latin1Sample },
  { name: 'UTF-16 with nulls', content: Buffer.from('invoice', 'utf16le') }
];

testBuffers.forEach(({ name, content }) => {
  const { encoding, confidence, method } = detector.detectEncoding(content);
  console.log(`${name}:`);
  console.log(` Detected: ${encoding}`);
  console.log(` Confidence: ${confidence}%`);
  console.log(` Method: ${method}`);
});

performanceTracker.endOperation('heuristic-detection');
});
// Sub-test: feed documents whose declared encoding, actual bytes or character
// escaping disagree into EInvoice and log whether parsing succeeds. Parse
// failures are logged, not rethrown.
await t.test('Multi-encoding document handling', async () => {
  performanceTracker.startOperation('multi-encoding');

  const multiEncodingTests = [
    {
      name: 'Declaration vs actual mismatch',
      declared: 'UTF-8',
      actual: 'ISO-8859-1',
      // Declares UTF-8 but stores "Müller" with the single ISO-8859-1 byte 0xFC.
      content: Buffer.from(
        '<?xml version="1.0" encoding="UTF-8"?><invoice><name>M\u00FCller</name></invoice>',
        'latin1'
      )
    },
    {
      name: 'Mixed encoding in attributes',
      content: `<?xml version="1.0" encoding="UTF-8"?>
<invoice currency="€" supplier="Müller & Co.">
<amount>100.00</amount>
</invoice>`
    },
    {
      name: 'Entity-encoded special chars',
      content: `<?xml version="1.0" encoding="ASCII"?>
<invoice>
<supplier>M&#252;ller</supplier>
<amount>&#8364;100</amount>
</invoice>`
    }
  ];

  for (const test of multiEncodingTests) {
    const startTime = performance.now();
    console.log(`${test.name}:`);

    if (test.declared && test.actual) {
      console.log(` Declared: ${test.declared}`);
      console.log(` Actual: ${test.actual}`);
      console.log(` ⚠️ Encoding mismatch detected`);
    }

    try {
      const invoice = new einvoice.EInvoice();
      const content = test.content;

      // Choose the parse entry point by content type. The methods are
      // feature-tested because not every EInvoice build may offer both.
      if (typeof content === 'string' && invoice.fromXmlString) {
        await invoice.fromXmlString(content);
        console.log(' ✓ Parsed successfully');
      } else if (Buffer.isBuffer(content) && invoice.fromBuffer) {
        await invoice.fromBuffer(content);
        console.log(' ✓ Parsed from buffer');
      } else {
        // Previously this case produced no output at all.
        console.log(' ⚠️ Skipped: no parse method available for this content type');
      }
    } catch (error: unknown) {
      // Thrown values are not guaranteed to be Error instances.
      const message = error instanceof Error ? error.message : String(error);
      console.log(` ✗ Parse error: ${message}`);
    }

    performanceTracker.recordMetric('multi-encoding', performance.now() - startTime);
  }

  performanceTracker.endOperation('multi-encoding');
});
// Sub-test: sample up to 100 corpus XML files and tally how many start with a
// UTF-8 BOM and which encoding each declares. Results are only logged.
await t.test('Corpus encoding analysis', async () => {
  performanceTracker.startOperation('corpus-encoding');

  const corpusLoader = new CorpusLoader();
  const xmlFiles = await corpusLoader.getFiles(/\.xml$/);
  console.log(`\nAnalyzing encodings in ${xmlFiles.length} corpus files...`);

  const encodingStats = {
    total: 0,
    byDeclaration: new Map<string, number>(),
    byBOM: { withBOM: 0, withoutBOM: 0 },
    conflicts: 0,
    errors: 0
  };

  // Adds one to the tally for the given declared-encoding label.
  const countDeclaration = (label: string): void => {
    const seen = encodingStats.byDeclaration.get(label) || 0;
    encodingStats.byDeclaration.set(label, seen + 1);
  };

  const sampleSize = Math.min(100, xmlFiles.length);
  for (const file of xmlFiles.slice(0, sampleSize)) {
    encodingStats.total += 1;
    try {
      const buffer = await plugins.fs.readFile(file.path);

      // Only the UTF-8 BOM is counted here.
      const hasUtf8Bom =
        buffer.length >= 3 && buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF;
      if (hasUtf8Bom) {
        encodingStats.byBOM.withBOM += 1;
      } else {
        encodingStats.byBOM.withoutBOM += 1;
      }

      // The declaration is searched for in the first 200 bytes only.
      const head = buffer.toString('utf8', 0, Math.min(200, buffer.length));
      const declaration = head.match(/encoding=["']([^"']+)["']/i);
      countDeclaration(declaration ? declaration[1].toUpperCase() : 'NONE');
    } catch (error) {
      // Unreadable files are counted, not reported individually.
      encodingStats.errors += 1;
    }
  }

  console.log('\nEncoding Statistics:');
  console.log(`Total files analyzed: ${encodingStats.total}`);
  console.log(`Files with BOM: ${encodingStats.byBOM.withBOM}`);
  console.log(`Files without BOM: ${encodingStats.byBOM.withoutBOM}`);
  console.log('\nDeclared encodings:');

  // Most frequent declaration first.
  const ranked = [...encodingStats.byDeclaration.entries()];
  ranked.sort((left, right) => right[1] - left[1]);
  ranked.forEach(([encoding, count]) => {
    const percentage = (count / encodingStats.total * 100).toFixed(1);
    console.log(` ${encoding}: ${count} (${percentage}%)`);
  });

  console.log(`\nRead errors: ${encodingStats.errors}`);
  performanceTracker.endOperation('corpus-encoding');
});
await t.test('Encoding conversion and normalization', async () => {
performanceTracker.startOperation('encoding-conversion');
/**
 * Converts XML byte buffers to BOM-less UTF-8 and rewrites the XML declaration
 * so that it names UTF-8.
 */
class EncodingNormalizer {
  /**
   * Known BOM signatures. Order matters: the UTF-32LE mark (FF FE 00 00)
   * starts with the UTF-16LE mark (FF FE), so 4-byte signatures come first.
   */
  private static readonly BOM_SIGNATURES: ReadonlyArray<{
    encoding: string;
    bytes: readonly number[];
  }> = [
    { encoding: 'UTF-8', bytes: [0xEF, 0xBB, 0xBF] },
    { encoding: 'UTF-32LE', bytes: [0xFF, 0xFE, 0x00, 0x00] },
    { encoding: 'UTF-32BE', bytes: [0x00, 0x00, 0xFE, 0xFF] },
    { encoding: 'UTF-16LE', bytes: [0xFF, 0xFE] },
    { encoding: 'UTF-16BE', bytes: [0xFE, 0xFF] }
  ];

  /**
   * Matches the encoding pseudo-attribute of a leading XML declaration only.
   * `[^>]*?` cannot cross the declaration's closing `?>`, so an
   * `encoding="..."` attribute in the document body is never rewritten.
   */
  private static readonly DECLARED_ENCODING =
    /^(\s*<\?xml\b[^>]*?\bencoding\s*=\s*)["'][^"']*["']/i;

  /**
   * Returns `buffer` re-encoded as UTF-8 without a BOM.
   *
   * @param buffer         Raw document bytes.
   * @param sourceEncoding Encoding label of `buffer`, case-insensitive. When
   *                       omitted or empty it is detected from the BOM or the
   *                       XML declaration, defaulting to UTF-8.
   * @returns For UTF-8 input, the same bytes (a view past the BOM when one is
   *          present). Otherwise a new buffer with the converted text, whose
   *          XML declaration, if any, names UTF-8.
   * @throws Error ("Encoding conversion failed: ...") when the label is not
   *         supported by TextDecoder or decoding fails.
   */
  async normalizeToUTF8(buffer: Buffer, sourceEncoding?: string): Promise<Buffer> {
    const encoding = (sourceEncoding || this.detectSourceEncoding(buffer)).trim().toUpperCase();

    // Already UTF-8: no transcoding, just drop the BOM if present.
    if (encoding === 'UTF-8' || encoding === 'UTF8') {
      return EncodingNormalizer.startsWith(buffer, [0xEF, 0xBB, 0xBF])
        ? buffer.subarray(3)
        : buffer;
    }

    try {
      // TextDecoder removes a leading BOM for the UTF-16 encodings by default.
      const text = new TextDecoder(encoding.toLowerCase()).decode(buffer);
      const updatedText = text.replace(EncodingNormalizer.DECLARED_ENCODING, '$1"UTF-8"');
      return Buffer.from(updatedText, 'utf8');
    } catch (error: unknown) {
      // Thrown values are not guaranteed to be Error instances.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Encoding conversion failed: ${reason}`);
    }
  }

  /** True when `buffer` begins with exactly the given bytes. */
  private static startsWith(buffer: Buffer, bytes: readonly number[]): boolean {
    // Indexing past the end yields undefined, which never equals a byte value.
    return bytes.every((byte, offset) => buffer[offset] === byte);
  }

  /**
   * Determines the source encoding: a BOM wins, then the XML declaration in
   * the first 100 bytes, and finally the XML default of UTF-8.
   *
   * UTF-16/UTF-32 BOMs must be checked here. Those encodings interleave NUL
   * bytes, so the ASCII declaration search below cannot see their declaration,
   * and the input would otherwise be passed through as if it were UTF-8.
   */
  private detectSourceEncoding(buffer: Buffer): string {
    const bom = EncodingNormalizer.BOM_SIGNATURES.find(({ bytes }) =>
      EncodingNormalizer.startsWith(buffer, bytes)
    );
    if (bom) {
      return bom.encoding;
    }

    const sample = buffer.toString('ascii', 0, Math.min(100, buffer.length));
    const match = sample.match(/encoding=["']([^"']+)["']/i);
    return match ? match[1].toUpperCase() : 'UTF-8';
  }
}
// Normalise two sample documents to UTF-8 and log sizes plus whether a UTF-8
// BOM is still present in the output.
const normalizer = new EncodingNormalizer();
const utf8Bom = Buffer.from([0xEF, 0xBB, 0xBF]);

const conversionTests = [
  {
    name: 'UTF-8 with BOM to UTF-8 without BOM',
    input: Buffer.concat([
      utf8Bom,
      Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST</id></invoice>')
    ])
  },
  {
    name: 'ISO-8859-1 to UTF-8',
    input: Buffer.from('<?xml version="1.0" encoding="ISO-8859-1"?><invoice><name>Test</name></invoice>')
  }
];

for (const { name, input } of conversionTests) {
  const startedAt = performance.now();
  try {
    const normalized = await normalizer.normalizeToUTF8(input);

    console.log(`${name}:`);
    console.log(` Input size: ${input.length} bytes`);
    console.log(` Output size: ${normalized.length} bytes`);
    console.log(` ✓ Conversion successful`);

    // The output must not begin with the UTF-8 BOM.
    const bomSurvived = normalized.length >= 3 && normalized.subarray(0, 3).equals(utf8Bom);
    console.log(bomSurvived ? ' ✗ BOM still present in output' : ' ✓ BOM removed');
  } catch (error) {
    console.log(`${name}: ✗ Conversion failed - ${error.message}`);
  }
  performanceTracker.recordMetric('encoding-conversion', performance.now() - startedAt);
}

performanceTracker.endOperation('encoding-conversion');
});
// Print the tracker's summary, then a fixed checklist of encoding-handling advice.
console.log('\n' + performanceTracker.getSummary());

const bestPracticeLines = [
  '\nCharacter Encoding Detection Best Practices:',
  '1. Always check for BOM before parsing',
  '2. Verify declared encoding matches actual encoding',
  '3. Use heuristics when declaration is missing',
  '4. Handle encoding mismatches gracefully',
  '5. Normalize to UTF-8 for consistent processing',
  '6. Preserve original encoding information for round-trip',
  '7. Support common legacy encodings (ISO-8859-1, Windows-1252)',
  '8. Test with real-world data that includes various encodings'
];
for (const line of bestPracticeLines) {
  console.log(line);
}
});
tap.start();