feat(ZUGFERD): Add dedicated ZUGFERD v1/v2 support and refine invoice format detection logic

2025-04-03 20:08:02 +00:00
parent b4a95de482
commit 46331c2bf6
28 changed files with 1191 additions and 294 deletions
--- a/ts/formats/pdf/extractors/associated.extractor.ts
+++ b/ts/formats/pdf/extractors/associated.extractor.ts
@@ -0,0 +1,78 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Extracts XML from PDF/A-3 associated files (AF entry in the catalog),
+ * returning the first invoice-looking attachment that yields valid XML.
+ * Particularly useful for ZUGFeRD v1 and some Factur-X documents
+ */
+export class AssociatedFilesExtractor extends BaseXMLExtractor {
+  /**
+   * Extract XML from a PDF buffer using associated files
+   * @param pdfBuffer PDF buffer
+   * @returns XML content; null when the catalog has no AF array, when no
+   *          candidate attachment yields valid XML, or when an error is thrown
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      const pdfDoc = await PDFDocument.load(pdfBuffer);
+
+      // The AF entry of the catalog holds the associated file specifications
+      const afArray = pdfDoc.catalog.lookup(PDFName.of('AF'));
+      if (!(afArray instanceof PDFArray)) {
+        console.warn('No AF (Associated Files) entry found in PDF catalog');
+        return null;
+      }
+
+      for (let i = 0; i < afArray.size(); i++) {
+        const fileSpec = afArray.lookup(i);
+        if (!(fileSpec instanceof PDFDict)) {
+          continue;
+        }
+
+        // Prefer F and fall back to UF; the name may be a literal or a hex string
+        const fileNameObj = fileSpec.lookup(PDFName.of('F')) || fileSpec.lookup(PDFName.of('UF'));
+        if (!this.hasDecodeText(fileNameObj)) {
+          continue;
+        }
+        const fileName = fileNameObj.decodeText();
+        if (!this.isCandidateFileName(fileName)) {
+          continue;
+        }
+
+        // EF holds the embedded file stream under its own F key
+        const efDict = fileSpec.lookup(PDFName.of('EF'));
+        if (!(efDict instanceof PDFDict)) {
+          continue;
+        }
+        const fileStream = efDict.lookup(PDFName.of('F'));
+        if (fileStream instanceof PDFRawStream) {
+          const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
+          if (xmlContent) {
+            return xmlContent;
+          }
+        }
+      }
+
+      console.warn('No valid XML found in associated files');
+      return null;
+    } catch (error) {
+      console.error('Error in associated files extraction:', error);
+      return null;
+    }
+  }
+
+  /** Matches PDFString and PDFHexString structurally; both expose decodeText(). */
+  private hasDecodeText(obj: unknown): obj is { decodeText(): string } {
+    return typeof obj === 'object' && obj !== null &&
+      typeof (obj as { decodeText?: unknown }).decodeText === 'function';
+  }
+
+  /** True for a known invoice file name, any *.xml name, or an invoice keyword. */
+  private isCandidateFileName(fileName: string): boolean {
+    const lowered = fileName.toLowerCase();
+    return this.knownFileNames.some(known => known.toLowerCase() === lowered) ||
+      lowered.endsWith('.xml') ||
+      ['zugferd', 'factur-x', 'xrechnung', 'invoice'].some(keyword => lowered.includes(keyword));
+  }
+}
--- a/ts/formats/pdf/extractors/base.extractor.ts
+++ b/ts/formats/pdf/extractors/base.extractor.ts
@@ -0,0 +1,177 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
+import * as pako from 'pako';
+
+/**
+ * Base class for PDF XML extractors with common functionality: name/format
+ * tables, a plausibility check, end-tag based slicing and stream decoding.
+ */
+export abstract class BaseXMLExtractor {
+  /**
+   * Known XML file names for different invoice formats
+   */
+  protected readonly knownFileNames = [
+    'factur-x.xml',
+    'zugferd-invoice.xml',
+    'ZUGFeRD-invoice.xml',
+    'xrechnung.xml'
+  ];
+
+  /**
+   * Marker substrings; isValidXml() requires the content to contain at least one
+   */
+  protected readonly knownFormats = [
+    'CrossIndustryInvoice',
+    'CrossIndustryDocument',
+    'Invoice',
+    'CreditNote',
+    'ubl:Invoice',
+    'ubl:CreditNote',
+    'rsm:CrossIndustryInvoice',
+    'rsm:CrossIndustryDocument',
+    'ram:CrossIndustryDocument',
+    'urn:un:unece:uncefact',
+    'urn:ferd:CrossIndustryDocument',
+    'urn:zugferd',
+    'urn:factur-x',
+    'factur-x.eu',
+    'ZUGFeRD'
+  ];
+
+  /**
+   * Closing tags of known root elements, used to find where a document ends
+   */
+  protected readonly knownEndTags = [
+    '</CrossIndustryInvoice>',
+    '</CrossIndustryDocument>',
+    '</Invoice>',
+    '</CreditNote>',
+    '</rsm:CrossIndustryInvoice>',
+    '</rsm:CrossIndustryDocument>',
+    '</ram:CrossIndustryDocument>',
+    '</ubl:Invoice>',
+    '</ubl:CreditNote>'
+  ];
+
+  /**
+   * Extract XML from a PDF buffer
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found
+   */
+  public abstract extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null>;
+
+  /**
+   * Heuristic plausibility check for invoice XML; this is not an XML parse.
+   * @param xmlString XML string to check
+   * @returns True when the string contains an XML declaration and a known format
+   *          marker, has no U+0000..U+0005 characters, and has 100+ characters
+   */
+  protected isValidXml(xmlString: string): boolean {
+    try {
+      // Reject empty values and anything that is not a string at runtime
+      if (!xmlString || typeof xmlString !== 'string') {
+        return false;
+      }
+
+      // The declaration only has to be present; it need not be at index 0
+      if (!xmlString.includes('<?xml')) {
+        return false;
+      }
+
+      // At least one known invoice format marker must be present
+      if (!this.knownFormats.some(format => xmlString.includes(format))) {
+        return false;
+      }
+
+      // Low control characters are taken as a sign of binary data
+      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
+      if (invalidChars.some(char => xmlString.includes(char))) {
+        return false;
+      }
+
+      // Anything shorter than 100 characters is treated as too short
+      return xmlString.length >= 100;
+    } catch (error) {
+      console.error('Error validating XML:', error);
+      return false;
+    }
+  }
+
+  /**
+   * Slice an XML document out of a larger string.
+   * The slice runs from the first '<?xml' at or after startIndex to the end of
+   * the nearest known closing tag that follows it.
+   * @param text Text to extract XML from
+   * @param startIndex Index to start extraction from
+   * @returns XML content or null if no declaration or closing tag is found
+   */
+  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
+    try {
+      const xmlStartIndex = text.indexOf('<?xml', startIndex);
+      if (xmlStartIndex === -1) {
+        return null;
+      }
+
+      // Use the nearest closing tag, not the first one in list order; a tag from
+      // later, unrelated content would otherwise extend the slice too far.
+      let xmlEndIndex = -1;
+      for (const endTag of this.knownEndTags) {
+        const tagIndex = text.indexOf(endTag, xmlStartIndex);
+        if (tagIndex === -1) {
+          continue;
+        }
+        const candidateEnd = tagIndex + endTag.length;
+        if (xmlEndIndex === -1 || candidateEnd < xmlEndIndex) {
+          xmlEndIndex = candidateEnd;
+        }
+      }
+
+      if (xmlEndIndex === -1) {
+        return null;
+      }
+
+      return text.substring(xmlStartIndex, xmlEndIndex);
+    } catch (error) {
+      console.error('Error extracting XML from string:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Decompress and decode XML content from a PDF stream.
+   * Inflated content is tried first, then the raw bytes decoded as UTF-8.
+   * @param stream PDF stream containing XML data
+   * @param fileName Name of the file (for logging)
+   * @returns XML content or null if neither variant passes isValidXml()
+   */
+  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
+    try {
+      // Hand pako the typed array itself: its .buffer is the whole backing
+      // ArrayBuffer, which can hold unrelated bytes when the array is a view.
+      const streamBytes = stream.getContents();
+      try {
+        const decompressedBytes = pako.inflate(streamBytes);
+        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);
+
+        if (this.isValidXml(xmlContent)) {
+          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
+          return xmlContent;
+        }
+      } catch (decompressError) {
+        // Decompression failed, try without decompression
+        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
+      }
+
+      // Fall back to the raw bytes (the stream may be stored uncompressed)
+      const rawContent = new TextDecoder('utf-8').decode(streamBytes);
+      if (this.isValidXml(rawContent)) {
+        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
+        return rawContent;
+      }
+
+      return null;
+    } catch (error) {
+      console.error('Error extracting XML from stream:', error);
+      return null;
+    }
+  }
+}
--- a/ts/formats/pdf/extractors/index.ts
+++ b/ts/formats/pdf/extractors/index.ts
@@ -0,0 +1,4 @@
+export * from './base.extractor.js';
+export * from './standard.extractor.js';
+export * from './associated.extractor.js';
+export * from './text.extractor.js';
--- a/ts/formats/pdf/extractors/standard.extractor.ts
+++ b/ts/formats/pdf/extractors/standard.extractor.ts
@@ -0,0 +1,86 @@
+import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Standard PDF XML extractor that extracts XML from embedded files
+ * listed in the catalog's Names -> EmbeddedFiles -> Names array.
+ * Only that flat array is read; name-tree Kids nodes are not traversed.
+ */
+export class StandardXMLExtractor extends BaseXMLExtractor {
+  /**
+   * Extract XML from a PDF buffer using standard PDF/A-3 embedded files
+   * @param pdfBuffer PDF buffer
+   * @returns XML content; null when the embedded-files structures are missing,
+   *          no candidate file yields valid XML, or an error is thrown
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      const pdfDoc = await PDFDocument.load(pdfBuffer);
+
+      // The catalog's Names dictionary is the entry point to embedded files
+      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
+      if (!(namesDictObj instanceof PDFDict)) {
+        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
+      if (!(embeddedFilesDictObj instanceof PDFDict)) {
+        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
+        return null;
+      }
+
+      const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
+      if (!(filesSpecObj instanceof PDFArray)) {
+        console.warn('No files specified in EmbeddedFiles dictionary!');
+        return null;
+      }
+
+      // The array alternates name, file specification, name, file specification, ...
+      for (let i = 0; i < filesSpecObj.size(); i += 2) {
+        const fileNameObj = filesSpecObj.lookup(i);
+        const fileSpecObj = filesSpecObj.lookup(i + 1);
+        // The name may be a literal or a hex string; accept both
+        if (!this.hasDecodeText(fileNameObj) || !(fileSpecObj instanceof PDFDict)) {
+          continue;
+        }
+
+        const fileName = fileNameObj.decodeText();
+        if (!this.isCandidateFileName(fileName)) {
+          continue;
+        }
+        const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
+        if (!(efDictObj instanceof PDFDict)) {
+          continue;
+        }
+        const fileStream = efDictObj.lookup(PDFName.of('F'));
+        if (fileStream instanceof PDFRawStream) {
+          const xmlContent = await this.extractXmlFromStream(fileStream, fileName);
+          if (xmlContent) {
+            return xmlContent;
+          }
+        }
+      }
+
+      console.warn('No valid XML found in embedded files');
+      return null;
+    } catch (error) {
+      console.error('Error in standard extraction:', error);
+      return null;
+    }
+  }
+
+  /** Matches PDFString and PDFHexString structurally; both expose decodeText(). */
+  private hasDecodeText(obj: unknown): obj is { decodeText(): string } {
+    return typeof obj === 'object' && obj !== null &&
+      typeof (obj as { decodeText?: unknown }).decodeText === 'function';
+  }
+
+  /** True for a known invoice file name, any *.xml name, or an invoice keyword. */
+  private isCandidateFileName(fileName: string): boolean {
+    const lowered = fileName.toLowerCase();
+    return this.knownFileNames.some(known => known.toLowerCase() === lowered) ||
+      lowered.endsWith('.xml') ||
+      ['zugferd', 'factur-x', 'xrechnung', 'invoice'].some(keyword => lowered.includes(keyword));
+  }
+}
--- a/ts/formats/pdf/extractors/text.extractor.ts
+++ b/ts/formats/pdf/extractors/text.extractor.ts
@@ -0,0 +1,55 @@
+import { BaseXMLExtractor } from './base.extractor.js';
+
+/**
+ * Fallback extractor that scans the PDF bytes as text for embedded XML.
+ * Meant to be used when the other extraction methods have failed.
+ * Only the leading 50000 bytes of the buffer are inspected.
+ */
+export class TextXMLExtractor extends BaseXMLExtractor {
+  /**
+   * Search the decoded PDF text for XML markers and slice out the document
+   * @param pdfBuffer PDF buffer
+   * @returns XML content or null if not found
+   */
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+    try {
+      // Decode at most the first 50000 bytes as UTF-8
+      const scanLength = Math.min(pdfBuffer.length, 50000);
+      const pdfText = Buffer.from(pdfBuffer).toString('utf8', 0, scanLength);
+
+      // Opening markers: the XML declaration and known root elements
+      const markers: RegExp[] = [
+        /<\?xml[^>]*\?>/i,
+        /<CrossIndustryInvoice[^>]*>/i,
+        /<CrossIndustryDocument[^>]*>/i,
+        /<Invoice[^>]*>/i,
+        /<CreditNote[^>]*>/i,
+        /<rsm:CrossIndustryInvoice[^>]*>/i,
+        /<rsm:CrossIndustryDocument[^>]*>/i,
+        /<ram:CrossIndustryDocument[^>]*>/i,
+        /<ubl:Invoice[^>]*>/i,
+        /<ubl:CreditNote[^>]*>/i
+      ];
+
+      for (const marker of markers) {
+        const hit = marker.exec(pdfText);
+        if (hit === null) {
+          continue;
+        }
+        console.log(`Found XML pattern in PDF: ${hit[0]}`);
+        // The helper searches for '<?xml' from the hit position onwards
+        const candidate = this.extractXmlFromString(pdfText, hit.index);
+        if (candidate && this.isValidXml(candidate)) {
+          console.log('Successfully extracted XML from PDF text');
+          return candidate;
+        }
+      }
+
+      console.warn('No valid XML found in PDF text');
+      return null;
+    } catch (error) {
+      console.error('Error in text-based extraction:', error);
+      return null;
+    }
+  }
+}
--- a/ts/formats/pdf/pdf.extractor.ts
+++ b/ts/formats/pdf/pdf.extractor.ts
@@ -1,30 +1,54 @@
-import { PDFDocument, PDFDict, PDFName, PDFRawStream, PDFArray, PDFString } from 'pdf-lib';
-import * as pako from 'pako';
+import {
+  BaseXMLExtractor,
+  StandardXMLExtractor,
+  AssociatedFilesExtractor,
+  TextXMLExtractor
+} from './extractors/index.js';

 /**
- * Class for extracting XML from PDF files
+ * Main PDF extractor class that orchestrates the extraction process
+ * Uses multiple specialized extractors in sequence to maximize success rate
 */
 export class PDFExtractor {
+  private extractors: BaseXMLExtractor[] = [];
+
  /**
-   * Extracts XML from a PDF buffer
+   * Constructor initializes the chain of extractors
+   */
+  constructor() {
+    // Ordered by preference; extractXml() tries them in this sequence
+    this.extractors = [
+      new StandardXMLExtractor(),     // Names -> EmbeddedFiles attachments
+      new AssociatedFilesExtractor(), // catalog AF entries (ZUGFeRD v1, some Factur-X)
+      new TextXMLExtractor(),         // raw text scan, last resort
+    ];
+  }
+
+  /**
+   * Extract XML from a PDF buffer
+   * Tries multiple extraction methods in sequence
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
-      // First try the standard extraction
-      const standardXml = await this.standardExtraction(pdfBuffer);
-      if (standardXml && this.isValidXml(standardXml)) {
-        return standardXml;
+      console.log('Starting XML extraction from PDF...');
+
+      // Try each extractor in sequence
+      for (const extractor of this.extractors) {
+        const extractorName = extractor.constructor.name;
+        console.log(`Trying extraction with ${extractorName}...`);
+
+        const xml = await extractor.extractXml(pdfBuffer);
+        if (xml) {
+          console.log(`Successfully extracted XML using ${extractorName}`);
+          return xml;
+        }
+
+        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      }

-      // If standard extraction fails, try alternative methods
-      const alternativeXml = await this.alternativeExtraction(pdfBuffer);
-      if (alternativeXml && this.isValidXml(alternativeXml)) {
-        return alternativeXml;
-      }
-
-      // If all extraction methods fail, return null
+      // If all extractors fail, return null
      console.warn('All extraction methods failed, no valid XML found in PDF');
      return null;
    } catch (error) {
@@ -33,255 +57,7 @@ export class PDFExtractor {
    }
  }

-  /**
-   * Standard extraction method using PDF-lib
-   * @param pdfBuffer PDF buffer
-   * @returns XML content or null if not found
-   */
-  private async standardExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
-    try {
-      const pdfDoc = await PDFDocument.load(pdfBuffer);

-      // Get the document's metadata dictionary
-      const namesDictObj = pdfDoc.catalog.lookup(PDFName.of('Names'));
-      if (!(namesDictObj instanceof PDFDict)) {
-        console.warn('No Names dictionary found in PDF! This PDF does not contain embedded files.');
-        return null;
-      }

-      const embeddedFilesDictObj = namesDictObj.lookup(PDFName.of('EmbeddedFiles'));
-      if (!(embeddedFilesDictObj instanceof PDFDict)) {
-        console.warn('No EmbeddedFiles dictionary found! This PDF does not contain embedded files.');
-        return null;
-      }

-      const filesSpecObj = embeddedFilesDictObj.lookup(PDFName.of('Names'));
-      if (!(filesSpecObj instanceof PDFArray)) {
-        console.warn('No files specified in EmbeddedFiles dictionary!');
-        return null;
-      }
-
-      // Try to find an XML file in the embedded files
-      let xmlFile: PDFRawStream | undefined;
-      let xmlFileName: string | undefined;
-
-      for (let i = 0; i < filesSpecObj.size(); i += 2) {
-        const fileNameObj = filesSpecObj.lookup(i);
-        const fileSpecObj = filesSpecObj.lookup(i + 1);
-
-        if (!(fileNameObj instanceof PDFString)) {
-          continue;
-        }
-        if (!(fileSpecObj instanceof PDFDict)) {
-          continue;
-        }
-
-        // Get the filename as string
-        const fileName = fileNameObj.toString();
-
-        // Check if it's an XML file (checking both extension and known standard filenames)
-        if (fileName.toLowerCase().includes('.xml') ||
-            fileName.toLowerCase().includes('factur-x') ||
-            fileName.toLowerCase().includes('zugferd') ||
-            fileName.toLowerCase().includes('xrechnung')) {
-
-          const efDictObj = fileSpecObj.lookup(PDFName.of('EF'));
-          if (!(efDictObj instanceof PDFDict)) {
-            continue;
-          }
-
-          const maybeStream = efDictObj.lookup(PDFName.of('F'));
-          if (maybeStream instanceof PDFRawStream) {
-            // Found an XML file - save it
-            xmlFile = maybeStream;
-            xmlFileName = fileName;
-            break;
-          }
-        }
-      }
-
-      // If no XML file was found, return null
-      if (!xmlFile) {
-        console.warn('No embedded XML file found in the PDF!');
-        return null;
-      }
-
-      // Decompress and decode the XML content
-      try {
-        // Try to decompress with pako
-        const xmlCompressedBytes = xmlFile.getContents().buffer;
-        const xmlBytes = pako.inflate(xmlCompressedBytes);
-        const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
-
-        // Check if the XML content is valid
-        if (this.isValidXml(xmlContent)) {
-          console.log(`Successfully extracted XML from PDF file. File name: ${xmlFileName}`);
-          return xmlContent;
-        }
-
-        // If we get here, the XML content is not valid, try without decompression
-        console.log('Decompression succeeded but XML is not valid, trying without decompression...');
-        const rawXmlBytes = xmlFile.getContents();
-        const rawXmlContent = new TextDecoder('utf-8').decode(rawXmlBytes);
-
-        if (this.isValidXml(rawXmlContent)) {
-          console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
-          return rawXmlContent;
-        }
-
-        // If we get here, neither the decompressed nor the raw XML content is valid
-        console.log('Neither decompressed nor raw XML content is valid');
-        return null;
-      } catch (decompressError) {
-        // Decompression failed, try without decompression
-        console.log('Decompression failed, trying without decompression...');
-        try {
-          const xmlBytes = xmlFile.getContents();
-          const xmlContent = new TextDecoder('utf-8').decode(xmlBytes);
-
-          if (this.isValidXml(xmlContent)) {
-            console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${xmlFileName}`);
-            return xmlContent;
-          }
-
-          // If we get here, the XML content is not valid
-          console.log('Uncompressed XML content is not valid');
-          return null;
-        } catch (decodeError) {
-          console.error('Error decoding XML content:', decodeError);
-          return null;
-        }
-      }
-    } catch (error) {
-      console.error('Error in standard extraction:', error);
-      return null;
-    }
-  }
-
-  /**
-   * Alternative extraction method using string search
-   * @param pdfBuffer PDF buffer
-   * @returns XML content or null if not found
-   */
-  private async alternativeExtraction(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
-    try {
-      // Convert buffer to string and look for XML patterns
-      const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 10000));
-
-      // Look for common XML patterns in the PDF
-      const xmlPatterns = [
-        /<\?xml[^>]*\?>/i,
-        /<CrossIndustryInvoice[^>]*>/i,
-        /<Invoice[^>]*>/i,
-        /<CreditNote[^>]*>/i,
-        /<rsm:CrossIndustryInvoice[^>]*>/i
-      ];
-
-      for (const pattern of xmlPatterns) {
-        const match = pdfString.match(pattern);
-        if (match) {
-          console.log(`Found XML pattern in PDF: ${match[0]}`);
-
-          // Try to extract the XML content
-          const xmlContent = this.extractXmlFromString(pdfString);
-          if (xmlContent) {
-            console.log('Successfully extracted XML from PDF string');
-            return xmlContent;
-          }
-        }
-      }
-
-      return null;
-    } catch (error) {
-      console.error('Error in alternative extraction:', error);
-      return null;
-    }
-  }
-
-  /**
-   * Extracts XML from a string
-   * @param pdfString PDF string
-   * @returns XML content or null if not found
-   */
-  private extractXmlFromString(pdfString: string): string | null {
-    try {
-      // Look for XML start and end tags
-      const xmlStartIndex = pdfString.indexOf('<?xml');
-      if (xmlStartIndex === -1) {
-        return null;
-      }
-
-      // Try to find the end of the XML document
-      const possibleEndTags = [
-        '</CrossIndustryInvoice>',
-        '</Invoice>',
-        '</CreditNote>',
-        '</rsm:CrossIndustryInvoice>'
-      ];
-
-      let xmlEndIndex = -1;
-      for (const endTag of possibleEndTags) {
-        const endIndex = pdfString.indexOf(endTag);
-        if (endIndex !== -1) {
-          xmlEndIndex = endIndex + endTag.length;
-          break;
-        }
-      }
-
-      if (xmlEndIndex === -1) {
-        return null;
-      }
-
-      // Extract the XML content
-      return pdfString.substring(xmlStartIndex, xmlEndIndex);
-    } catch (error) {
-      console.error('Error extracting XML from string:', error);
-      return null;
-    }
-  }
-
-  /**
-   * Checks if an XML string is valid
-   * @param xmlString XML string to check
-   * @returns True if the XML is valid
-   */
-  private isValidXml(xmlString: string): boolean {
-    try {
-      // Check if the XML string contains basic XML structure
-      if (!xmlString.includes('<?xml')) {
-        return false;
-      }
-
-      // Check if the XML string contains known invoice formats
-      const knownFormats = [
-        'CrossIndustryInvoice',
-        'Invoice',
-        'CreditNote',
-        'ubl:Invoice',
-        'ubl:CreditNote'
-      ];
-
-      const hasKnownFormat = knownFormats.some(format => xmlString.includes(format));
-      if (!hasKnownFormat) {
-        return false;
-      }
-
-      // Check if the XML string contains binary data or invalid characters
-      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
-      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
-      if (hasBinaryData) {
-        return false;
-      }
-
-      // Check if the XML string is too short
-      if (xmlString.length < 100) {
-        return false;
-      }
-
-      return true;
-    } catch (error) {
-      console.error('Error validating XML:', error);
-      return false;
-    }
-  }
 }