554 lines
19 KiB
TypeScript
554 lines
19 KiB
TypeScript
|
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
|||
|
import * as einvoice from '../../../ts/index.js';
|
|||
|
import * as plugins from '../../plugins.js';
|
|||
|
import { CorpusLoader } from '../../helpers/corpus.loader.js';
|
|||
|
import { PerformanceTracker } from '../../helpers/performance.tracker.js';
|
|||
|
|
|||
|
tap.test('PARSE-03: Character Encoding Detection - Detect and handle various character encodings', async (t) => {
|
|||
|
const performanceTracker = new PerformanceTracker('PARSE-03');
|
|||
|
|
|||
|
await t.test('Encoding declaration detection', async () => {
|
|||
|
performanceTracker.startOperation('declaration-detection');
|
|||
|
|
|||
|
const encodingTests = [
|
|||
|
{
|
|||
|
name: 'UTF-8 declaration',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-8"?>\n<invoice><id>TEST-001</id></invoice>',
|
|||
|
expectedEncoding: 'UTF-8',
|
|||
|
actualEncoding: 'UTF-8'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'UTF-16 declaration',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-16"?>\n<invoice><id>TEST-002</id></invoice>',
|
|||
|
expectedEncoding: 'UTF-16',
|
|||
|
actualEncoding: 'UTF-8' // Mismatch test
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'ISO-8859-1 declaration',
|
|||
|
xml: '<?xml version="1.0" encoding="ISO-8859-1"?>\n<invoice><supplier>Müller</supplier></invoice>',
|
|||
|
expectedEncoding: 'ISO-8859-1',
|
|||
|
actualEncoding: 'ISO-8859-1'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'Windows-1252 declaration',
|
|||
|
xml: '<?xml version="1.0" encoding="Windows-1252"?>\n<invoice><note>Special – chars</note></invoice>',
|
|||
|
expectedEncoding: 'Windows-1252',
|
|||
|
actualEncoding: 'Windows-1252'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'Case variations',
|
|||
|
xml: '<?xml version="1.0" encoding="utf-8"?>\n<invoice><id>TEST-003</id></invoice>',
|
|||
|
expectedEncoding: 'UTF-8',
|
|||
|
actualEncoding: 'UTF-8'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'No encoding declaration',
|
|||
|
xml: '<?xml version="1.0"?>\n<invoice><id>TEST-004</id></invoice>',
|
|||
|
expectedEncoding: 'UTF-8', // Default
|
|||
|
actualEncoding: 'UTF-8'
|
|||
|
}
|
|||
|
];
|
|||
|
|
|||
|
for (const test of encodingTests) {
|
|||
|
const startTime = performance.now();
|
|||
|
|
|||
|
// Extract declared encoding
|
|||
|
const encodingMatch = test.xml.match(/encoding=["']([^"']+)["']/i);
|
|||
|
const declaredEncoding = encodingMatch ? encodingMatch[1].toUpperCase() : 'UTF-8';
|
|||
|
|
|||
|
console.log(`${test.name}:`);
|
|||
|
console.log(` Declared: ${declaredEncoding}`);
|
|||
|
console.log(` Expected: ${test.expectedEncoding}`);
|
|||
|
|
|||
|
if (declaredEncoding.replace(/-/g, '').toUpperCase() ===
|
|||
|
test.expectedEncoding.replace(/-/g, '').toUpperCase()) {
|
|||
|
console.log(' ✓ Declaration matches expected encoding');
|
|||
|
} else {
|
|||
|
console.log(' ✗ Declaration mismatch');
|
|||
|
}
|
|||
|
|
|||
|
performanceTracker.recordMetric('encoding-detection', performance.now() - startTime);
|
|||
|
}
|
|||
|
|
|||
|
performanceTracker.endOperation('declaration-detection');
|
|||
|
});
|
|||
|
|
|||
|
await t.test('BOM (Byte Order Mark) detection', async () => {
|
|||
|
performanceTracker.startOperation('bom-detection');
|
|||
|
|
|||
|
const bomTests = [
|
|||
|
{
|
|||
|
name: 'UTF-8 with BOM',
|
|||
|
bom: Buffer.from([0xEF, 0xBB, 0xBF]),
|
|||
|
encoding: 'UTF-8',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-005</id></invoice>'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'UTF-16 LE BOM',
|
|||
|
bom: Buffer.from([0xFF, 0xFE]),
|
|||
|
encoding: 'UTF-16LE',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-006</id></invoice>'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'UTF-16 BE BOM',
|
|||
|
bom: Buffer.from([0xFE, 0xFF]),
|
|||
|
encoding: 'UTF-16BE',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-16"?><invoice><id>TEST-007</id></invoice>'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'UTF-32 LE BOM',
|
|||
|
bom: Buffer.from([0xFF, 0xFE, 0x00, 0x00]),
|
|||
|
encoding: 'UTF-32LE',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-008</id></invoice>'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'UTF-32 BE BOM',
|
|||
|
bom: Buffer.from([0x00, 0x00, 0xFE, 0xFF]),
|
|||
|
encoding: 'UTF-32BE',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-32"?><invoice><id>TEST-009</id></invoice>'
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'No BOM',
|
|||
|
bom: Buffer.from([]),
|
|||
|
encoding: 'UTF-8',
|
|||
|
xml: '<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST-010</id></invoice>'
|
|||
|
}
|
|||
|
];
|
|||
|
|
|||
|
for (const test of bomTests) {
|
|||
|
const startTime = performance.now();
|
|||
|
|
|||
|
// Create buffer with BOM
|
|||
|
const xmlBuffer = Buffer.from(test.xml, 'utf8');
|
|||
|
const fullBuffer = Buffer.concat([test.bom, xmlBuffer]);
|
|||
|
|
|||
|
// Detect BOM
|
|||
|
let detectedEncoding = 'UTF-8'; // Default
|
|||
|
|
|||
|
if (fullBuffer.length >= 4) {
|
|||
|
if (fullBuffer[0] === 0xEF && fullBuffer[1] === 0xBB && fullBuffer[2] === 0xBF) {
|
|||
|
detectedEncoding = 'UTF-8';
|
|||
|
} else if (fullBuffer[0] === 0xFF && fullBuffer[1] === 0xFE) {
|
|||
|
if (fullBuffer[2] === 0x00 && fullBuffer[3] === 0x00) {
|
|||
|
detectedEncoding = 'UTF-32LE';
|
|||
|
} else {
|
|||
|
detectedEncoding = 'UTF-16LE';
|
|||
|
}
|
|||
|
} else if (fullBuffer[0] === 0xFE && fullBuffer[1] === 0xFF) {
|
|||
|
detectedEncoding = 'UTF-16BE';
|
|||
|
} else if (fullBuffer[0] === 0x00 && fullBuffer[1] === 0x00 &&
|
|||
|
fullBuffer[2] === 0xFE && fullBuffer[3] === 0xFF) {
|
|||
|
detectedEncoding = 'UTF-32BE';
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
console.log(`${test.name}:`);
|
|||
|
console.log(` BOM bytes: ${test.bom.length > 0 ? Array.from(test.bom).map(b => '0x' + b.toString(16).toUpperCase()).join(' ') : 'None'}`);
|
|||
|
console.log(` Expected: ${test.encoding}`);
|
|||
|
console.log(` Detected: ${detectedEncoding}`);
|
|||
|
|
|||
|
if (detectedEncoding === test.encoding ||
|
|||
|
(test.bom.length === 0 && detectedEncoding === 'UTF-8')) {
|
|||
|
console.log(' ✓ BOM detection correct');
|
|||
|
} else {
|
|||
|
console.log(' ✗ BOM detection failed');
|
|||
|
}
|
|||
|
|
|||
|
performanceTracker.recordMetric('bom-detection', performance.now() - startTime);
|
|||
|
}
|
|||
|
|
|||
|
performanceTracker.endOperation('bom-detection');
|
|||
|
});
|
|||
|
|
|||
|
await t.test('Heuristic encoding detection', async () => {
|
|||
|
performanceTracker.startOperation('heuristic-detection');
|
|||
|
|
|||
|
class EncodingDetector {
|
|||
|
detectEncoding(buffer: Buffer): { encoding: string; confidence: number; method: string } {
|
|||
|
// Check for BOM first
|
|||
|
const bomResult = this.checkBOM(buffer);
|
|||
|
if (bomResult) {
|
|||
|
return { ...bomResult, confidence: 100, method: 'BOM' };
|
|||
|
}
|
|||
|
|
|||
|
// Check XML declaration
|
|||
|
const declResult = this.checkXmlDeclaration(buffer);
|
|||
|
if (declResult) {
|
|||
|
return { ...declResult, confidence: 90, method: 'XML Declaration' };
|
|||
|
}
|
|||
|
|
|||
|
// Heuristic checks
|
|||
|
const heuristicResult = this.heuristicCheck(buffer);
|
|||
|
return { ...heuristicResult, method: 'Heuristic' };
|
|||
|
}
|
|||
|
|
|||
|
private checkBOM(buffer: Buffer): { encoding: string } | null {
|
|||
|
if (buffer.length < 2) return null;
|
|||
|
|
|||
|
if (buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
|
|||
|
return { encoding: 'UTF-8' };
|
|||
|
}
|
|||
|
if (buffer[0] === 0xFF && buffer[1] === 0xFE) {
|
|||
|
return { encoding: 'UTF-16LE' };
|
|||
|
}
|
|||
|
if (buffer[0] === 0xFE && buffer[1] === 0xFF) {
|
|||
|
return { encoding: 'UTF-16BE' };
|
|||
|
}
|
|||
|
|
|||
|
return null;
|
|||
|
}
|
|||
|
|
|||
|
private checkXmlDeclaration(buffer: Buffer): { encoding: string } | null {
|
|||
|
// Look for encoding in first 100 bytes
|
|||
|
const sample = buffer.toString('ascii', 0, Math.min(100, buffer.length));
|
|||
|
const match = sample.match(/encoding=["']([^"']+)["']/i);
|
|||
|
|
|||
|
if (match) {
|
|||
|
return { encoding: match[1].toUpperCase() };
|
|||
|
}
|
|||
|
|
|||
|
return null;
|
|||
|
}
|
|||
|
|
|||
|
private heuristicCheck(buffer: Buffer): { encoding: string; confidence: number } {
|
|||
|
const sampleSize = Math.min(1000, buffer.length);
|
|||
|
|
|||
|
// Check for null bytes (indicates UTF-16/32)
|
|||
|
let nullBytes = 0;
|
|||
|
let highBytes = 0;
|
|||
|
let validUtf8 = true;
|
|||
|
|
|||
|
for (let i = 0; i < sampleSize; i++) {
|
|||
|
if (buffer[i] === 0) nullBytes++;
|
|||
|
if (buffer[i] > 127) highBytes++;
|
|||
|
|
|||
|
// Simple UTF-8 validation
|
|||
|
if (buffer[i] > 127) {
|
|||
|
if ((buffer[i] & 0xE0) === 0xC0) {
|
|||
|
// 2-byte sequence
|
|||
|
if (i + 1 >= sampleSize || (buffer[i + 1] & 0xC0) !== 0x80) {
|
|||
|
validUtf8 = false;
|
|||
|
}
|
|||
|
i++;
|
|||
|
} else if ((buffer[i] & 0xF0) === 0xE0) {
|
|||
|
// 3-byte sequence
|
|||
|
if (i + 2 >= sampleSize ||
|
|||
|
(buffer[i + 1] & 0xC0) !== 0x80 ||
|
|||
|
(buffer[i + 2] & 0xC0) !== 0x80) {
|
|||
|
validUtf8 = false;
|
|||
|
}
|
|||
|
i += 2;
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// Decision logic
|
|||
|
if (nullBytes > sampleSize * 0.3) {
|
|||
|
return { encoding: 'UTF-16', confidence: 70 };
|
|||
|
}
|
|||
|
|
|||
|
if (validUtf8 && highBytes > 0) {
|
|||
|
return { encoding: 'UTF-8', confidence: 85 };
|
|||
|
}
|
|||
|
|
|||
|
if (highBytes > sampleSize * 0.3) {
|
|||
|
return { encoding: 'ISO-8859-1', confidence: 60 };
|
|||
|
}
|
|||
|
|
|||
|
return { encoding: 'UTF-8', confidence: 50 }; // Default
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
const detector = new EncodingDetector();

// Representative byte patterns: plain ASCII, multi-byte UTF-8, raw
// Latin-1 bytes, and UTF-16LE text (every other byte is NUL).
const testBuffers = [
  {
    name: 'Pure ASCII',
    content: Buffer.from('<?xml version="1.0"?><invoice><id>TEST-011</id></invoice>')
  },
  {
    name: 'UTF-8 with special chars',
    content: Buffer.from('<?xml version="1.0"?><invoice><name>Café €100</name></invoice>')
  },
  {
    name: 'ISO-8859-1 content',
    content: Buffer.from([
      0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, // <invoice>
      0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // <name>
      0xC4, 0xD6, 0xDC, // ÄÖÜ in ISO-8859-1
      0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, // </name>
      0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E // </invoice>
    ])
  },
  {
    name: 'UTF-16 with nulls',
    content: Buffer.from('invoice', 'utf16le')
  }
];

for (const sample of testBuffers) {
  const { encoding, confidence, method } = detector.detectEncoding(sample.content);

  console.log(`${sample.name}:`);
  console.log(` Detected: ${encoding}`);
  console.log(` Confidence: ${confidence}%`);
  console.log(` Method: ${method}`);
}

performanceTracker.endOperation('heuristic-detection');
});
await t.test('Multi-encoding document handling', async () => {
|
|||
|
performanceTracker.startOperation('multi-encoding');
|
|||
|
|
|||
|
const multiEncodingTests = [
|
|||
|
{
|
|||
|
name: 'Declaration vs actual mismatch',
|
|||
|
declared: 'UTF-8',
|
|||
|
actual: 'ISO-8859-1',
|
|||
|
content: Buffer.from([
|
|||
|
// <?xml version="1.0" encoding="UTF-8"?>
|
|||
|
0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x3D,
|
|||
|
0x22, 0x31, 0x2E, 0x30, 0x22, 0x20, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67,
|
|||
|
0x3D, 0x22, 0x55, 0x54, 0x46, 0x2D, 0x38, 0x22, 0x3F, 0x3E,
|
|||
|
// <invoice><name>
|
|||
|
0x3C, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E, 0x3C, 0x6E, 0x61, 0x6D, 0x65, 0x3E,
|
|||
|
// Müller in ISO-8859-1
|
|||
|
0x4D, 0xFC, 0x6C, 0x6C, 0x65, 0x72,
|
|||
|
// </name></invoice>
|
|||
|
0x3C, 0x2F, 0x6E, 0x61, 0x6D, 0x65, 0x3E, 0x3C, 0x2F, 0x69, 0x6E, 0x76, 0x6F, 0x69, 0x63, 0x65, 0x3E
|
|||
|
])
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'Mixed encoding in attributes',
|
|||
|
content: `<?xml version="1.0" encoding="UTF-8"?>
|
|||
|
<invoice currency="€" supplier="Müller & Co.">
|
|||
|
<amount>100.00</amount>
|
|||
|
</invoice>`
|
|||
|
},
|
|||
|
{
|
|||
|
name: 'Entity-encoded special chars',
|
|||
|
content: `<?xml version="1.0" encoding="ASCII"?>
|
|||
|
<invoice>
|
|||
|
<supplier>Müller</supplier>
|
|||
|
<amount>€100</amount>
|
|||
|
</invoice>`
|
|||
|
}
|
|||
|
];
|
|||
|
|
|||
|
for (const test of multiEncodingTests) {
|
|||
|
const startTime = performance.now();
|
|||
|
|
|||
|
console.log(`${test.name}:`);
|
|||
|
|
|||
|
if (test.declared && test.actual) {
|
|||
|
console.log(` Declared: ${test.declared}`);
|
|||
|
console.log(` Actual: ${test.actual}`);
|
|||
|
console.log(` ⚠️ Encoding mismatch detected`);
|
|||
|
}
|
|||
|
|
|||
|
try {
|
|||
|
const invoice = new einvoice.EInvoice();
|
|||
|
const content = test.content instanceof Buffer ? test.content : test.content;
|
|||
|
|
|||
|
if (invoice.fromXmlString && typeof content === 'string') {
|
|||
|
await invoice.fromXmlString(content);
|
|||
|
console.log(' ✓ Parsed successfully');
|
|||
|
} else if (invoice.fromBuffer && content instanceof Buffer) {
|
|||
|
await invoice.fromBuffer(content);
|
|||
|
console.log(' ✓ Parsed from buffer');
|
|||
|
}
|
|||
|
} catch (error) {
|
|||
|
console.log(` ✗ Parse error: ${error.message}`);
|
|||
|
}
|
|||
|
|
|||
|
performanceTracker.recordMetric('multi-encoding', performance.now() - startTime);
|
|||
|
}
|
|||
|
|
|||
|
performanceTracker.endOperation('multi-encoding');
|
|||
|
});
|
|||
|
|
|||
|
await t.test('Corpus encoding analysis', async () => {
|
|||
|
performanceTracker.startOperation('corpus-encoding');
|
|||
|
|
|||
|
const corpusLoader = new CorpusLoader();
|
|||
|
const xmlFiles = await corpusLoader.getFiles(/\.xml$/);
|
|||
|
|
|||
|
console.log(`\nAnalyzing encodings in ${xmlFiles.length} corpus files...`);
|
|||
|
|
|||
|
const encodingStats = {
|
|||
|
total: 0,
|
|||
|
byDeclaration: new Map<string, number>(),
|
|||
|
byBOM: { withBOM: 0, withoutBOM: 0 },
|
|||
|
conflicts: 0,
|
|||
|
errors: 0
|
|||
|
};
|
|||
|
|
|||
|
const sampleSize = Math.min(100, xmlFiles.length);
|
|||
|
const sampledFiles = xmlFiles.slice(0, sampleSize);
|
|||
|
|
|||
|
for (const file of sampledFiles) {
|
|||
|
encodingStats.total++;
|
|||
|
|
|||
|
try {
|
|||
|
const buffer = await plugins.fs.readFile(file.path);
|
|||
|
|
|||
|
// Check for BOM
|
|||
|
if (buffer.length >= 3 &&
|
|||
|
buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
|
|||
|
encodingStats.byBOM.withBOM++;
|
|||
|
} else {
|
|||
|
encodingStats.byBOM.withoutBOM++;
|
|||
|
}
|
|||
|
|
|||
|
// Check declaration
|
|||
|
const sample = buffer.toString('utf8', 0, Math.min(200, buffer.length));
|
|||
|
const match = sample.match(/encoding=["']([^"']+)["']/i);
|
|||
|
|
|||
|
if (match) {
|
|||
|
const encoding = match[1].toUpperCase();
|
|||
|
encodingStats.byDeclaration.set(
|
|||
|
encoding,
|
|||
|
(encodingStats.byDeclaration.get(encoding) || 0) + 1
|
|||
|
);
|
|||
|
} else {
|
|||
|
encodingStats.byDeclaration.set(
|
|||
|
'NONE',
|
|||
|
(encodingStats.byDeclaration.get('NONE') || 0) + 1
|
|||
|
);
|
|||
|
}
|
|||
|
} catch (error) {
|
|||
|
encodingStats.errors++;
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
console.log('\nEncoding Statistics:');
|
|||
|
console.log(`Total files analyzed: ${encodingStats.total}`);
|
|||
|
console.log(`Files with BOM: ${encodingStats.byBOM.withBOM}`);
|
|||
|
console.log(`Files without BOM: ${encodingStats.byBOM.withoutBOM}`);
|
|||
|
console.log('\nDeclared encodings:');
|
|||
|
|
|||
|
const sortedEncodings = Array.from(encodingStats.byDeclaration.entries())
|
|||
|
.sort((a, b) => b[1] - a[1]);
|
|||
|
|
|||
|
for (const [encoding, count] of sortedEncodings) {
|
|||
|
const percentage = (count / encodingStats.total * 100).toFixed(1);
|
|||
|
console.log(` ${encoding}: ${count} (${percentage}%)`);
|
|||
|
}
|
|||
|
|
|||
|
console.log(`\nRead errors: ${encodingStats.errors}`);
|
|||
|
|
|||
|
performanceTracker.endOperation('corpus-encoding');
|
|||
|
});
|
|||
|
|
|||
|
await t.test('Encoding conversion and normalization', async () => {
|
|||
|
performanceTracker.startOperation('encoding-conversion');
|
|||
|
|
|||
|
class EncodingNormalizer {
|
|||
|
async normalizeToUTF8(buffer: Buffer, sourceEncoding?: string): Promise<Buffer> {
|
|||
|
// Detect encoding if not provided
|
|||
|
if (!sourceEncoding) {
|
|||
|
sourceEncoding = this.detectSourceEncoding(buffer);
|
|||
|
}
|
|||
|
|
|||
|
// Skip if already UTF-8
|
|||
|
if (sourceEncoding === 'UTF-8') {
|
|||
|
// Just remove BOM if present
|
|||
|
if (buffer.length >= 3 &&
|
|||
|
buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
|
|||
|
return buffer.slice(3);
|
|||
|
}
|
|||
|
return buffer;
|
|||
|
}
|
|||
|
|
|||
|
// Convert to UTF-8
|
|||
|
try {
|
|||
|
const decoder = new TextDecoder(sourceEncoding.toLowerCase());
|
|||
|
const text = decoder.decode(buffer);
|
|||
|
|
|||
|
// Update encoding declaration
|
|||
|
const updatedText = text.replace(
|
|||
|
/encoding=["'][^"']+["']/i,
|
|||
|
'encoding="UTF-8"'
|
|||
|
);
|
|||
|
|
|||
|
return Buffer.from(updatedText, 'utf8');
|
|||
|
} catch (error) {
|
|||
|
throw new Error(`Encoding conversion failed: ${error.message}`);
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
private detectSourceEncoding(buffer: Buffer): string {
|
|||
|
// Simple detection logic
|
|||
|
if (buffer.length >= 3 &&
|
|||
|
buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
|
|||
|
return 'UTF-8';
|
|||
|
}
|
|||
|
|
|||
|
const sample = buffer.toString('ascii', 0, Math.min(100, buffer.length));
|
|||
|
const match = sample.match(/encoding=["']([^"']+)["']/i);
|
|||
|
|
|||
|
return match ? match[1].toUpperCase() : 'UTF-8';
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
const normalizer = new EncodingNormalizer();

const conversionTests = [
  {
    name: 'UTF-8 with BOM to UTF-8 without BOM',
    input: Buffer.concat([
      Buffer.from([0xEF, 0xBB, 0xBF]),
      Buffer.from('<?xml version="1.0" encoding="UTF-8"?><invoice><id>TEST</id></invoice>')
    ])
  },
  {
    name: 'ISO-8859-1 to UTF-8',
    input: Buffer.from('<?xml version="1.0" encoding="ISO-8859-1"?><invoice><name>Test</name></invoice>')
  }
];

for (const conversionTest of conversionTests) {
  const startTime = performance.now();

  try {
    const normalized = await normalizer.normalizeToUTF8(conversionTest.input);

    console.log(`${conversionTest.name}:`);
    console.log(` Input size: ${conversionTest.input.length} bytes`);
    console.log(` Output size: ${normalized.length} bytes`);
    console.log(` ✓ Conversion successful`);

    // The normalized output must never start with a UTF-8 BOM.
    const bomPresent = normalized.length >= 3 &&
      normalized[0] === 0xEF && normalized[1] === 0xBB && normalized[2] === 0xBF;
    if (bomPresent) {
      console.log(' ✗ BOM still present in output');
    } else {
      console.log(' ✓ BOM removed');
    }
  } catch (error) {
    console.log(`${conversionTest.name}: ✗ Conversion failed - ${error.message}`);
  }

  performanceTracker.recordMetric('encoding-conversion', performance.now() - startTime);
}

performanceTracker.endOperation('encoding-conversion');
});
// Performance summary
console.log('\n' + performanceTracker.getSummary());

// Encoding detection best practices
const bestPractices = [
  'Always check for BOM before parsing',
  'Verify declared encoding matches actual encoding',
  'Use heuristics when declaration is missing',
  'Handle encoding mismatches gracefully',
  'Normalize to UTF-8 for consistent processing',
  'Preserve original encoding information for round-trip',
  'Support common legacy encodings (ISO-8859-1, Windows-1252)',
  'Test with real-world data that includes various encodings'
];

console.log('\nCharacter Encoding Detection Best Practices:');
bestPractices.forEach((practice, index) => {
  console.log(`${index + 1}. ${practice}`);
});
});

tap.start();