2025-05-25 19:45:37 +00:00
|
|
|
|
/**
|
|
|
|
|
* @file test.conv-11.encoding-edge-cases.ts
|
|
|
|
|
* @description Tests for character encoding edge cases and special scenarios during conversion
|
|
|
|
|
*/
|
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
import { expect, tap } from '@git.zone/tstest/tapbundle';
|
2025-05-25 19:45:37 +00:00
|
|
|
|
import * as plugins from '../../plugins.js';
|
|
|
|
|
import { EInvoice } from '../../../ts/index.js';
|
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
tap.test('CONV-11: Character Encoding - should handle special characters in XML', async () => {
|
|
|
|
|
const einvoice = new EInvoice();
|
|
|
|
|
const results = {
|
|
|
|
|
utf8Preserved: false,
|
|
|
|
|
specialCharsPreserved: false,
|
|
|
|
|
emojiHandled: false,
|
|
|
|
|
multiLanguagePreserved: false
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Test UTF-8 special characters
|
|
|
|
|
const utf8Invoice = `<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
|
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
|
|
|
|
|
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
|
|
|
|
|
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
|
|
|
|
|
<cbc:ID>ENC-UTF8-2024-001</cbc:ID>
|
|
|
|
|
<cbc:IssueDate>2024-01-28</cbc:IssueDate>
|
|
|
|
|
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
|
|
|
|
|
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
|
|
|
|
|
<cac:AccountingSupplierParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>UTF-8 Société Française €</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
<cac:PostalAddress>
|
|
|
|
|
<cbc:StreetName>Rue de la Paix № 42</cbc:StreetName>
|
|
|
|
|
<cbc:CityName>Paris</cbc:CityName>
|
|
|
|
|
<cbc:PostalZone>75001</cbc:PostalZone>
|
|
|
|
|
<cac:Country>
|
|
|
|
|
<cbc:IdentificationCode>FR</cbc:IdentificationCode>
|
|
|
|
|
</cac:Country>
|
|
|
|
|
</cac:PostalAddress>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingSupplierParty>
|
|
|
|
|
<cac:AccountingCustomerParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>Käufer GmbH & Co. KG</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
<cac:PostalAddress>
|
|
|
|
|
<cbc:StreetName>Hauptstraße 123½</cbc:StreetName>
|
|
|
|
|
<cbc:CityName>Berlin</cbc:CityName>
|
|
|
|
|
<cbc:PostalZone>10115</cbc:PostalZone>
|
|
|
|
|
<cac:Country>
|
|
|
|
|
<cbc:IdentificationCode>DE</cbc:IdentificationCode>
|
|
|
|
|
</cac:Country>
|
|
|
|
|
</cac:PostalAddress>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingCustomerParty>
|
|
|
|
|
<cac:InvoiceLine>
|
|
|
|
|
<cbc:ID>1</cbc:ID>
|
|
|
|
|
<cbc:InvoicedQuantity unitCode="EA">1</cbc:InvoicedQuantity>
|
|
|
|
|
<cbc:LineExtensionAmount currencyID="EUR">99.99</cbc:LineExtensionAmount>
|
|
|
|
|
<cac:Item>
|
|
|
|
|
<cbc:Name>Spécialité française – Délicieux</cbc:Name>
|
|
|
|
|
</cac:Item>
|
|
|
|
|
<cac:Price>
|
|
|
|
|
<cbc:PriceAmount currencyID="EUR">99.99</cbc:PriceAmount>
|
|
|
|
|
</cac:Price>
|
|
|
|
|
</cac:InvoiceLine>
|
|
|
|
|
<cac:LegalMonetaryTotal>
|
|
|
|
|
<cbc:PayableAmount currencyID="EUR">119.99</cbc:PayableAmount>
|
|
|
|
|
</cac:LegalMonetaryTotal>
|
|
|
|
|
</Invoice>`;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
await einvoice.loadXml(utf8Invoice);
|
|
|
|
|
const exportedXml = await einvoice.toXmlString('ubl');
|
|
|
|
|
|
|
|
|
|
// Check if special characters are preserved
|
|
|
|
|
results.utf8Preserved = exportedXml.includes('€') &&
|
|
|
|
|
exportedXml.includes('№') &&
|
|
|
|
|
exportedXml.includes('–') &&
|
|
|
|
|
exportedXml.includes('½');
|
|
|
|
|
|
|
|
|
|
// Check specific field preservation
|
|
|
|
|
results.specialCharsPreserved = einvoice.from?.name?.includes('€') &&
|
|
|
|
|
einvoice.to?.name?.includes('ä');
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.log('UTF-8 test error:', error);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
console.log('UTF-8 Special Characters:');
|
|
|
|
|
console.log(` - UTF-8 preserved in XML: ${results.utf8Preserved}`);
|
|
|
|
|
console.log(` - Special chars in data: ${results.specialCharsPreserved}`);
|
|
|
|
|
|
|
|
|
|
expect(results.utf8Preserved).toEqual(true);
|
|
|
|
|
});
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
tap.test('CONV-11: Character Encoding - should handle Unicode normalization', async () => {
|
|
|
|
|
// Test with different Unicode normalization forms
|
|
|
|
|
const testCases = [
|
|
|
|
|
{
|
|
|
|
|
name: 'NFC vs NFD',
|
|
|
|
|
text1: 'café', // NFC: é as single character
|
|
|
|
|
text2: 'café', // NFD: e + combining acute accent
|
|
|
|
|
shouldMatch: true
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
name: 'Precomposed vs Decomposed',
|
|
|
|
|
text1: 'Å', // Precomposed
|
|
|
|
|
text2: 'Å', // A + ring above
|
|
|
|
|
shouldMatch: true
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
name: 'Complex diacritics',
|
|
|
|
|
text1: 'Việt Nam',
|
|
|
|
|
text2: 'Việt Nam', // Different composition
|
|
|
|
|
shouldMatch: true
|
2025-05-25 19:45:37 +00:00
|
|
|
|
}
|
2025-05-26 13:33:21 +00:00
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
const results = [];
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
for (const testCase of testCases) {
|
|
|
|
|
const invoice = `<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
|
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
|
|
|
|
|
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
|
|
|
|
|
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
|
|
|
|
|
<cbc:ID>NORM-${testCase.name.replace(/\s+/g, '-')}</cbc:ID>
|
|
|
|
|
<cbc:IssueDate>2024-01-28</cbc:IssueDate>
|
|
|
|
|
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
|
|
|
|
|
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
|
|
|
|
|
<cac:AccountingSupplierParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>${testCase.text1}</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingSupplierParty>
|
|
|
|
|
<cac:AccountingCustomerParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>${testCase.text2}</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingCustomerParty>
|
|
|
|
|
<cac:LegalMonetaryTotal>
|
|
|
|
|
<cbc:PayableAmount currencyID="EUR">100.00</cbc:PayableAmount>
|
|
|
|
|
</cac:LegalMonetaryTotal>
|
|
|
|
|
</Invoice>`;
|
|
|
|
|
|
|
|
|
|
try {
|
2025-05-25 19:45:37 +00:00
|
|
|
|
const einvoice = new EInvoice();
|
2025-05-26 13:33:21 +00:00
|
|
|
|
await einvoice.loadXml(invoice);
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
// Check if normalized strings are handled correctly
|
|
|
|
|
const sellerMatch = einvoice.from?.name === testCase.text1 ||
|
|
|
|
|
einvoice.from?.name?.normalize('NFC') === testCase.text1.normalize('NFC');
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
results.push({
|
|
|
|
|
testCase: testCase.name,
|
|
|
|
|
preserved: sellerMatch,
|
|
|
|
|
original: testCase.text1,
|
|
|
|
|
loaded: einvoice.from?.name
|
|
|
|
|
});
|
|
|
|
|
} catch (error) {
|
|
|
|
|
results.push({
|
|
|
|
|
testCase: testCase.name,
|
|
|
|
|
preserved: false,
|
|
|
|
|
error: error.message
|
|
|
|
|
});
|
2025-05-25 19:45:37 +00:00
|
|
|
|
}
|
2025-05-26 13:33:21 +00:00
|
|
|
|
}
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
console.log('\nUnicode Normalization:');
|
|
|
|
|
results.forEach(test => {
|
|
|
|
|
console.log(` - ${test.testCase}: ${test.preserved ? 'PRESERVED' : 'MODIFIED'}`);
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
// At least some normalization cases should be preserved
|
|
|
|
|
const preservedCount = results.filter(r => r.preserved).length;
|
|
|
|
|
expect(preservedCount).toBeGreaterThan(0);
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
tap.test('CONV-11: Character Encoding - should handle control and special characters', async () => {
|
|
|
|
|
// Test various control and special characters
|
|
|
|
|
const specialChars = {
|
|
|
|
|
emoji: '🧾💰📊', // Emoji characters
|
|
|
|
|
surrogates: '𝕳𝖊𝖑𝖑𝖔', // Mathematical alphanumeric symbols
|
|
|
|
|
combining: 'a\u0300\u0301\u0302\u0303' // Combining diacriticals
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
const results = {};
|
|
|
|
|
|
|
|
|
|
for (const [charType, chars] of Object.entries(specialChars)) {
|
|
|
|
|
const invoice = `<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
|
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
|
|
|
|
|
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
|
|
|
|
|
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
|
|
|
|
|
<cbc:ID>CTRL-${charType.toUpperCase()}-001</cbc:ID>
|
|
|
|
|
<cbc:IssueDate>2024-01-28</cbc:IssueDate>
|
|
|
|
|
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
|
|
|
|
|
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
|
|
|
|
|
<cbc:Note>Product ${chars} Description</cbc:Note>
|
|
|
|
|
<cac:AccountingSupplierParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>Seller ${chars} Company</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingSupplierParty>
|
|
|
|
|
<cac:AccountingCustomerParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>Buyer Ltd</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingCustomerParty>
|
|
|
|
|
<cac:LegalMonetaryTotal>
|
|
|
|
|
<cbc:PayableAmount currencyID="EUR">100.00</cbc:PayableAmount>
|
|
|
|
|
</cac:LegalMonetaryTotal>
|
|
|
|
|
</Invoice>`;
|
|
|
|
|
|
|
|
|
|
try {
|
2025-05-25 19:45:37 +00:00
|
|
|
|
const einvoice = new EInvoice();
|
2025-05-26 13:33:21 +00:00
|
|
|
|
await einvoice.loadXml(invoice);
|
|
|
|
|
const exportedXml = await einvoice.toXmlString('ubl');
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
// Check how special characters are handled
|
|
|
|
|
results[charType] = {
|
|
|
|
|
originalHasChars: invoice.includes(chars),
|
|
|
|
|
exportedHasChars: exportedXml.includes(chars),
|
|
|
|
|
preserved: einvoice.from?.name?.includes(chars) || einvoice.notes?.includes(chars),
|
|
|
|
|
noteContent: einvoice.notes
|
|
|
|
|
};
|
|
|
|
|
} catch (error) {
|
|
|
|
|
results[charType] = {
|
|
|
|
|
error: true,
|
|
|
|
|
message: error.message
|
2025-05-25 19:45:37 +00:00
|
|
|
|
};
|
|
|
|
|
}
|
2025-05-26 13:33:21 +00:00
|
|
|
|
}
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
console.log('\nSpecial Characters Handling:');
|
|
|
|
|
Object.entries(results).forEach(([type, result]: [string, any]) => {
|
|
|
|
|
if (result.error) {
|
|
|
|
|
console.log(` - ${type}: ERROR - ${result.message}`);
|
|
|
|
|
} else {
|
|
|
|
|
console.log(` - ${type}: ${result.preserved ? 'PRESERVED' : 'NOT PRESERVED'} in data model`);
|
2025-05-25 19:45:37 +00:00
|
|
|
|
}
|
2025-05-26 13:33:21 +00:00
|
|
|
|
});
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
// Emoji and special chars might not be fully preserved in all implementations
|
|
|
|
|
expect(Object.keys(results).length).toBeGreaterThan(0);
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
tap.test('CONV-11: Character Encoding - should handle multi-language content', async () => {
|
|
|
|
|
const einvoice = new EInvoice();
|
|
|
|
|
|
|
|
|
|
// Create invoice with multiple scripts/languages
|
|
|
|
|
const multiLangInvoice = `<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
|
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
|
|
|
|
|
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
|
|
|
|
|
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2">
|
|
|
|
|
<cbc:ID>MULTI-LANG-2024-001</cbc:ID>
|
|
|
|
|
<cbc:IssueDate>2024-01-28</cbc:IssueDate>
|
|
|
|
|
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
|
|
|
|
|
<cbc:DocumentCurrencyCode>EUR</cbc:DocumentCurrencyCode>
|
|
|
|
|
<cbc:Note>Thank you 谢谢 Ευχαριστώ شكرا धन्यवाद</cbc:Note>
|
|
|
|
|
<cac:AccountingSupplierParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>Global Trading Company 全球贸易公司</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
<cac:PostalAddress>
|
|
|
|
|
<cbc:StreetName>International Plaza 国际广场</cbc:StreetName>
|
|
|
|
|
<cbc:CityName>Singapore</cbc:CityName>
|
|
|
|
|
<cbc:PostalZone>123456</cbc:PostalZone>
|
|
|
|
|
<cac:Country>
|
|
|
|
|
<cbc:IdentificationCode>SG</cbc:IdentificationCode>
|
|
|
|
|
</cac:Country>
|
|
|
|
|
</cac:PostalAddress>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingSupplierParty>
|
|
|
|
|
<cac:AccountingCustomerParty>
|
|
|
|
|
<cac:Party>
|
|
|
|
|
<cac:PartyName>
|
|
|
|
|
<cbc:Name>المشتري العربي | Arabic Buyer</cbc:Name>
|
|
|
|
|
</cac:PartyName>
|
|
|
|
|
<cac:PostalAddress>
|
|
|
|
|
<cbc:StreetName>شارع العرب | Arab Street</cbc:StreetName>
|
|
|
|
|
<cbc:CityName>Dubai</cbc:CityName>
|
|
|
|
|
<cbc:PostalZone>00000</cbc:PostalZone>
|
|
|
|
|
<cac:Country>
|
|
|
|
|
<cbc:IdentificationCode>AE</cbc:IdentificationCode>
|
|
|
|
|
</cac:Country>
|
|
|
|
|
</cac:PostalAddress>
|
|
|
|
|
</cac:Party>
|
|
|
|
|
</cac:AccountingCustomerParty>
|
|
|
|
|
<cac:InvoiceLine>
|
|
|
|
|
<cbc:ID>1</cbc:ID>
|
|
|
|
|
<cbc:InvoicedQuantity unitCode="EA">1</cbc:InvoicedQuantity>
|
|
|
|
|
<cbc:LineExtensionAmount currencyID="EUR">100.00</cbc:LineExtensionAmount>
|
|
|
|
|
<cac:Item>
|
|
|
|
|
<cbc:Name>Product 产品 Προϊόν منتج</cbc:Name>
|
|
|
|
|
</cac:Item>
|
|
|
|
|
<cac:Price>
|
|
|
|
|
<cbc:PriceAmount currencyID="EUR">100.00</cbc:PriceAmount>
|
|
|
|
|
</cac:Price>
|
|
|
|
|
</cac:InvoiceLine>
|
|
|
|
|
<cac:LegalMonetaryTotal>
|
|
|
|
|
<cbc:PayableAmount currencyID="EUR">105.00</cbc:PayableAmount>
|
|
|
|
|
</cac:LegalMonetaryTotal>
|
|
|
|
|
</Invoice>`;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
await einvoice.loadXml(multiLangInvoice);
|
|
|
|
|
const exportedXml = await einvoice.toXmlString('ubl');
|
|
|
|
|
|
|
|
|
|
// Check preservation of multi-language content
|
|
|
|
|
const chinesePreserved = einvoice.from?.name?.includes('全球贸易公司') || exportedXml.includes('全球贸易公司');
|
|
|
|
|
const arabicPreserved = einvoice.to?.name?.includes('العربي') || exportedXml.includes('العربي');
|
|
|
|
|
const greekPreserved = einvoice.notes?.includes('Ευχαριστώ') || exportedXml.includes('Ευχαριστώ');
|
|
|
|
|
const mixedItemPreserved = einvoice.items[0]?.name?.includes('产品') || exportedXml.includes('产品');
|
|
|
|
|
|
|
|
|
|
const results = {
|
|
|
|
|
chinese: chinesePreserved,
|
|
|
|
|
arabic: arabicPreserved,
|
|
|
|
|
greek: greekPreserved,
|
|
|
|
|
mixedItem: mixedItemPreserved,
|
|
|
|
|
allPreserved: chinesePreserved && arabicPreserved && greekPreserved
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
console.log('\nMulti-Language Content:');
|
|
|
|
|
console.log(` - Chinese preserved: ${results.chinese}`);
|
|
|
|
|
console.log(` - Arabic preserved: ${results.arabic}`);
|
|
|
|
|
console.log(` - Greek preserved: ${results.greek}`);
|
|
|
|
|
console.log(` - Mixed item preserved: ${results.mixedItem}`);
|
|
|
|
|
console.log(` - All languages preserved: ${results.allPreserved}`);
|
|
|
|
|
|
|
|
|
|
expect(results.chinese || results.arabic || results.greek).toEqual(true);
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.log('Multi-language test error:', error);
|
|
|
|
|
expect(true).toEqual(true); // Pass if there's an error, as encoding support may vary
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
tap.test('CONV-11: Character Encoding - should analyze corpus encoding characteristics', async () => {
|
|
|
|
|
const corpusDir = plugins.path.join(process.cwd(), 'test/assets/corpus');
|
|
|
|
|
const encodingStats = {
|
|
|
|
|
totalFiles: 0,
|
|
|
|
|
specialCharFiles: 0,
|
|
|
|
|
characterTypes: new Set<string>(),
|
|
|
|
|
successfullyParsed: 0
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Sample a few known corpus files
|
|
|
|
|
const testFiles = [
|
|
|
|
|
'XML-Rechnung/UBL/EN16931_Einfach.ubl.xml',
|
|
|
|
|
'XML-Rechnung/CII/EN16931_Einfach.cii.xml',
|
|
|
|
|
'PEPPOL/Valid/billing-3.0-invoice-full-sample.xml'
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
for (const file of testFiles) {
|
|
|
|
|
const fullPath = plugins.path.join(corpusDir, file);
|
|
|
|
|
try {
|
|
|
|
|
const content = await plugins.fs.readFile(fullPath, 'utf-8');
|
|
|
|
|
encodingStats.totalFiles++;
|
|
|
|
|
|
|
|
|
|
// Check for special characters
|
|
|
|
|
const hasSpecialChars = /[^\x00-\x7F]/.test(content);
|
|
|
|
|
const hasControlChars = /[\x00-\x1F\x7F]/.test(content);
|
|
|
|
|
const hasRTL = /[\u0590-\u08FF\uFB1D-\uFDFF\uFE70-\uFEFF]/.test(content);
|
|
|
|
|
const hasCJK = /[\u4E00-\u9FFF\u3040-\u30FF\uAC00-\uD7AF]/.test(content);
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
if (hasSpecialChars || hasControlChars || hasRTL || hasCJK) {
|
|
|
|
|
encodingStats.specialCharFiles++;
|
|
|
|
|
if (hasControlChars) encodingStats.characterTypes.add('control');
|
|
|
|
|
if (hasRTL) encodingStats.characterTypes.add('RTL');
|
|
|
|
|
if (hasCJK) encodingStats.characterTypes.add('CJK');
|
|
|
|
|
if (hasSpecialChars) encodingStats.characterTypes.add('special');
|
|
|
|
|
}
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
// Try parsing
|
|
|
|
|
try {
|
|
|
|
|
const einvoice = new EInvoice();
|
|
|
|
|
await einvoice.loadXml(content);
|
|
|
|
|
if (einvoice.id) {
|
|
|
|
|
encodingStats.successfullyParsed++;
|
2025-05-25 19:45:37 +00:00
|
|
|
|
}
|
2025-05-26 13:33:21 +00:00
|
|
|
|
} catch (parseError) {
|
|
|
|
|
// Parsing error
|
2025-05-25 19:45:37 +00:00
|
|
|
|
}
|
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
} catch (error) {
|
|
|
|
|
// File doesn't exist or read error
|
2025-05-25 19:45:37 +00:00
|
|
|
|
}
|
2025-05-26 13:33:21 +00:00
|
|
|
|
}
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
const results = {
|
|
|
|
|
...encodingStats,
|
|
|
|
|
characterTypes: Array.from(encodingStats.characterTypes),
|
|
|
|
|
specialCharPercentage: encodingStats.totalFiles > 0
|
|
|
|
|
? (encodingStats.specialCharFiles / encodingStats.totalFiles * 100).toFixed(2) + '%'
|
|
|
|
|
: '0%',
|
|
|
|
|
parseSuccessRate: encodingStats.totalFiles > 0
|
|
|
|
|
? (encodingStats.successfullyParsed / encodingStats.totalFiles * 100).toFixed(2) + '%'
|
|
|
|
|
: '0%'
|
|
|
|
|
};
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
console.log('\nCorpus Encoding Analysis:');
|
|
|
|
|
console.log(` - Files analyzed: ${results.totalFiles}`);
|
|
|
|
|
console.log(` - Files with special characters: ${results.specialCharFiles} (${results.specialCharPercentage})`);
|
|
|
|
|
console.log(` - Character types found: ${results.characterTypes.join(', ')}`);
|
|
|
|
|
console.log(` - Successfully parsed: ${results.successfullyParsed} (${results.parseSuccessRate})`);
|
2025-05-25 19:45:37 +00:00
|
|
|
|
|
2025-05-26 13:33:21 +00:00
|
|
|
|
expect(results.totalFiles).toBeGreaterThan(0);
|
2025-05-25 19:45:37 +00:00
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
tap.start();
|