fix(core): Improve PDF XML extraction, embedding, and format detection; update loadPdf/exportPdf error handling; add new validator implementations and enhance IPdf metadata.

2025-04-04 12:14:41 +00:00
parent 68fd50fd4c
commit 5d43c1ce4e
15 changed files with 1957 additions and 418 deletions
--- a/ts/formats/pdf/extractors/base.extractor.ts
+++ b/ts/formats/pdf/extractors/base.extractor.ts
@@ -11,7 +11,10 @@ export abstract class BaseXMLExtractor {
    'factur-x.xml',
    'zugferd-invoice.xml',
    'ZUGFeRD-invoice.xml',
-    'xrechnung.xml'
+    'xrechnung.xml',
+    'ubl-invoice.xml',
+    'invoice.xml',
+    'metadata.xml'
  ];

  /**
@@ -32,7 +35,8 @@ export abstract class BaseXMLExtractor {
    'urn:zugferd',
    'urn:factur-x',
    'factur-x.eu',
-    'ZUGFeRD'
+    'ZUGFeRD',
+    'FatturaElettronica'
  ];

  /**
@@ -47,7 +51,8 @@ export abstract class BaseXMLExtractor {
    '</rsm:CrossIndustryDocument>',
    '</ram:CrossIndustryDocument>',
    '</ubl:Invoice>',
-    '</ubl:CreditNote>'
+    '</ubl:CreditNote>',
+    '</FatturaElettronica>'
  ];

  /**
@@ -69,21 +74,19 @@ export abstract class BaseXMLExtractor {
        return false;
      }

-      // Check if it starts with XML declaration
-      if (!xmlString.includes('<?xml')) {
+      // Require an XML declaration or the opening tag of a known format element anywhere in the string
+      if (!xmlString.includes('<?xml') && !this.hasKnownXmlElement(xmlString)) {
        return false;
      }

      // Check if the XML string contains known invoice formats
-      const hasKnownFormat = this.knownFormats.some(format => xmlString.includes(format));
+      const hasKnownFormat = this.hasKnownFormat(xmlString);
      if (!hasKnownFormat) {
        return false;
      }

      // Check if the XML string contains binary data or invalid characters
-      const invalidChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
-      const hasBinaryData = invalidChars.some(char => xmlString.includes(char));
-      if (hasBinaryData) {
+      if (this.hasBinaryData(xmlString)) {
        return false;
      }

@@ -92,6 +95,11 @@ export abstract class BaseXMLExtractor {
        return false;
      }

+      // Check for a known start/end tag pair, or failing that a basic XML shape (declaration, or any tag plus a closing tag)
+      if (!this.hasProperXmlStructure(xmlString)) {
+        return false;
+      }
+
      return true;
    } catch (error) {
      console.error('Error validating XML:', error);
@@ -99,6 +107,85 @@ export abstract class BaseXMLExtractor {
    }
  }

+  /**
+   * Check if the XML string contains a known element
+   * @param xmlString XML string to check
+   * @returns True if the XML contains a known element
+   */
+  protected hasKnownXmlElement(xmlString: string): boolean {
+    for (const format of this.knownFormats) {
+      // Check for opening tag of format
+      if (xmlString.includes(`<${format}`)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Check if the XML string contains a known format
+   * @param xmlString XML string to check
+   * @returns True if the XML contains a known format
+   */
+  protected hasKnownFormat(xmlString: string): boolean {
+    for (const format of this.knownFormats) {
+      if (xmlString.includes(format)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Check if the XML string has a proper structure
+   * @param xmlString XML string to check
+   * @returns True if the XML has a proper structure
+   */
+  protected hasProperXmlStructure(xmlString: string): boolean {
+    // Check for at least one matching opening and closing tag
+    for (const endTag of this.knownEndTags) {
+      const startTag = endTag.replace('/', '');
+      if (xmlString.includes(startTag) && xmlString.includes(endTag)) {
+        return true;
+      }
+    }
+    
+    // If no specific tag is found but it has a basic XML structure
+    return (
+      (xmlString.includes('<?xml') && xmlString.includes('?>')) ||
+      (xmlString.match(/<[^>]+>/) !== null && xmlString.match(/<\/[^>]+>/) !== null)
+    );
+  }
+
+  /**
+   * Check if the XML string contains binary data
+   * @param xmlString XML string to check
+   * @returns True if the XML contains binary data
+   */
+  protected hasBinaryData(xmlString: string): boolean {
+    // Check for common binary data indicators
+    const binaryChars = ['\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005'];
+    const consecutiveNulls = '\u0000\u0000\u0000';
+    
+    // Check for control characters that shouldn't be in XML
+    if (binaryChars.some(char => xmlString.includes(char))) {
+      return true;
+    }
+    
+    // Check for consecutive null bytes which indicate binary data
+    if (xmlString.includes(consecutiveNulls)) {
+      return true;
+    }
+    
+    // Check for high concentration of non-printable characters
+    const nonPrintableCount = (xmlString.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || []).length;
+    if (nonPrintableCount > xmlString.length * 0.05) { // More than 5% non-printable
+      return true;
+    }
+    
+    return false;
+  }
+
  /**
   * Extract XML from a string
   * @param text Text to extract XML from
@@ -108,9 +195,22 @@ export abstract class BaseXMLExtractor {
  protected extractXmlFromString(text: string, startIndex: number = 0): string | null {
    try {
      // Find the start of the XML document
-      const xmlStartIndex = text.indexOf('<?xml', startIndex);
+      let xmlStartIndex = text.indexOf('<?xml', startIndex);
+      
+      // If no XML declaration, try to find known elements
      if (xmlStartIndex === -1) {
-        return null;
+        for (const format of this.knownFormats) {
+          const formatStartIndex = text.indexOf(`<${format.split(':').pop()}`, startIndex);
+          if (formatStartIndex !== -1) {
+            xmlStartIndex = formatStartIndex;
+            break;
+          }
+        }
+        
+        // Still didn't find any start marker
+        if (xmlStartIndex === -1) {
+          return null;
+        }
      }

      // Try to find the end of the XML document
@@ -123,12 +223,26 @@ export abstract class BaseXMLExtractor {
        }
      }

+      // If no known end tag found, try to use a heuristic approach
      if (xmlEndIndex === -1) {
-        return null;
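+        // Try to find the last closing tag; without the 's' flag '.' stops at newlines, so this matches the first closing tag that is the last one on its line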
+        const lastClosingTagMatch = text.slice(xmlStartIndex).match(/<\/[^>]+>(?!.*<\/[^>]+>)/);
+        if (lastClosingTagMatch && lastClosingTagMatch.index !== undefined) {
+          xmlEndIndex = xmlStartIndex + lastClosingTagMatch.index + lastClosingTagMatch[0].length;
+        } else {
+          return null;
+        }
      }

      // Extract the XML content
-      return text.substring(xmlStartIndex, xmlEndIndex);
+      const xmlContent = text.substring(xmlStartIndex, xmlEndIndex);
+      
+      // Validate the extracted content
+      if (this.isValidXml(xmlContent)) {
+        return xmlContent;
+      }
+      
+      return null;
    } catch (error) {
      console.error('Error extracting XML from string:', error);
      return null;
@@ -143,34 +257,99 @@ export abstract class BaseXMLExtractor {
   */
  protected async extractXmlFromStream(stream: PDFRawStream, fileName: string): Promise<string | null> {
    try {
-      // Try to decompress with pako
-      const compressedBytes = stream.getContents().buffer;
+      // Get the raw bytes from the stream
+      const rawBytes = stream.getContents();
+      
+      // First try without decompression (in case the content is not compressed)
+      let xmlContent = this.tryDecodeBuffer(rawBytes);
+      if (xmlContent && this.isValidXml(xmlContent)) {
+        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
+        return xmlContent;
+      }
+      
+      // Try with decompression
      try {
-        const decompressedBytes = pako.inflate(compressedBytes);
-        const xmlContent = new TextDecoder('utf-8').decode(decompressedBytes);
-
-        if (this.isValidXml(xmlContent)) {
-          console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
-          return xmlContent;
+        const decompressedBytes = this.tryDecompress(rawBytes);
+        if (decompressedBytes) {
+          xmlContent = this.tryDecodeBuffer(decompressedBytes);
+          if (xmlContent && this.isValidXml(xmlContent)) {
+            console.log(`Successfully extracted decompressed XML from PDF file. File name: ${fileName}`);
+            return xmlContent;
+          }
        }
      } catch (decompressError) {
-        // Decompression failed, try without decompression
-        console.log(`Decompression failed for ${fileName}, trying without decompression...`);
+        console.log(`Decompression failed for ${fileName}: ${decompressError}`);
      }
-
-      // Try without decompression
-      const rawBytes = stream.getContents();
-      const rawContent = new TextDecoder('utf-8').decode(rawBytes);
-
-      if (this.isValidXml(rawContent)) {
-        console.log(`Successfully extracted uncompressed XML from PDF file. File name: ${fileName}`);
-        return rawContent;
-      }
-
+      
      return null;
    } catch (error) {
      console.error('Error extracting XML from stream:', error);
      return null;
    }
  }
-}
+  
+  /**
+   * Try to decompress a buffer; currently only pako inflate (deflate/zlib) is attempted
+   * @param buffer Buffer to decompress
+   * @returns Decompressed buffer or null if decompression failed
+   */
+  protected tryDecompress(buffer: Uint8Array): Uint8Array | null {
+    try {
+      // Try pako inflate (for deflate/zlib compression)
+      return pako.inflate(buffer);
+    } catch (error) {
+      // If pako fails, try other methods if needed
+      console.warn('Pako decompression failed, might be uncompressed or using a different algorithm');
+      return null;
+    }
+  }
+  
+  /**
+   * Try to decode a buffer to a string using different encodings
+   * @param buffer Buffer to decode
+   * @returns Decoded string or null if decoding failed
+   */
+  protected tryDecodeBuffer(buffer: Uint8Array): string | null {
+    try {
+      // Try UTF-8 first
+      let content = new TextDecoder('utf-8').decode(buffer);
+      if (this.isPlausibleXml(content)) {
+        return content;
+      }
+      
+      // Try ISO-8859-1 (Latin1)
+      content = this.decodeLatin1(buffer);
+      if (this.isPlausibleXml(content)) {
+        return content;
+      }
+      
+      return null;
+    } catch (error) {
+      console.warn('Error decoding buffer:', error);
+      return null;
+    }
+  }
+  
+  /**
+   * Decode a buffer using ISO-8859-1 (Latin1) encoding
+   * @param buffer Buffer to decode
+   * @returns Decoded string
+   */
+  protected decodeLatin1(buffer: Uint8Array): string {
+    return Array.from(buffer)
+      .map(byte => String.fromCharCode(byte))
+      .join('');
+  }
+  
+  /**
+   * Check if a string is plausibly XML (quick check before validation)
+   * @param content String to check
+   * @returns True if the string is plausibly XML
+   */
+  protected isPlausibleXml(content: string): boolean {
+    return content.includes('<') && 
+           content.includes('>') && 
+           (content.includes('<?xml') || 
+            this.knownFormats.some(format => content.includes(format)));
+  }
+}
--- a/ts/formats/pdf/extractors/text.extractor.ts
+++ b/ts/formats/pdf/extractors/text.extractor.ts
@@ -6,50 +6,157 @@ import { BaseXMLExtractor } from './base.extractor.js';
 * Used as a fallback when other extraction methods fail
 */
 export class TextXMLExtractor extends BaseXMLExtractor {
+  // Maximum chunk size to process at once (4MB)
+  private readonly CHUNK_SIZE = 4 * 1024 * 1024;
+  
+  // Maximum number of chunks to check (effective 20MB search limit)
+  private readonly MAX_CHUNKS = 5;
+  
+  // Common XML patterns to look for
+  private readonly XML_PATTERNS = [
+    '<?xml', 
+    '<CrossIndustryInvoice',
+    '<CrossIndustryDocument',
+    '<Invoice',
+    '<CreditNote',
+    '<rsm:CrossIndustryInvoice',
+    '<rsm:CrossIndustryDocument',
+    '<ram:CrossIndustryDocument',
+    '<ubl:Invoice',
+    '<ubl:CreditNote',
+    '<FatturaElettronica'
+  ];
+
  /**
   * Extract XML from a PDF buffer by searching for XML patterns in the text
+   * Uses a chunked approach to handle large files efficiently
   * @param pdfBuffer PDF buffer
   * @returns XML content or null if not found
   */
  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
    try {
-      // Convert buffer to string and look for XML patterns
-      // Increase the search range to handle larger PDFs
-      const pdfString = Buffer.from(pdfBuffer).toString('utf8', 0, Math.min(pdfBuffer.length, 50000));
-
-      // Look for common XML patterns in the PDF
-      const xmlPatterns = [
-        /<\?xml[^>]*\?>/i,
-        /<CrossIndustryInvoice[^>]*>/i,
-        /<CrossIndustryDocument[^>]*>/i,
-        /<Invoice[^>]*>/i,
-        /<CreditNote[^>]*>/i,
-        /<rsm:CrossIndustryInvoice[^>]*>/i,
-        /<rsm:CrossIndustryDocument[^>]*>/i,
-        /<ram:CrossIndustryDocument[^>]*>/i,
-        /<ubl:Invoice[^>]*>/i,
-        /<ubl:CreditNote[^>]*>/i
-      ];
-
-      for (const pattern of xmlPatterns) {
-        const match = pdfString.match(pattern);
-        if (match && match.index !== undefined) {
-          console.log(`Found XML pattern in PDF: ${match[0]}`);
-          
-          // Try to extract the XML content
-          const xmlContent = this.extractXmlFromString(pdfString, match.index);
-          if (xmlContent && this.isValidXml(xmlContent)) {
-            console.log('Successfully extracted XML from PDF text');
-            return xmlContent;
-          }
-        }
-      }
-
-      console.warn('No valid XML found in PDF text');
-      return null;
+      console.log('Attempting text-based XML extraction from PDF...');
+      
+      // Convert Buffer to Uint8Array if needed
+      const buffer = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
+      
+      // Try extracting XML using the chunked approach
+      return this.extractXmlFromBufferChunked(buffer);
    } catch (error) {
      console.error('Error in text-based extraction:', error);
      return null;
    }
  }
-}
+
+  /**
+   * Extract XML from buffer using a chunked approach
+   * This helps avoid memory issues with large PDFs
+   * @param buffer Buffer to search in
+   * @returns XML content or null if not found
+   */
+  private extractXmlFromBufferChunked(buffer: Uint8Array): string | null {
+    // Process the PDF in chunks
+    for (let chunkIndex = 0; chunkIndex < this.MAX_CHUNKS; chunkIndex++) {
+      const startPos = chunkIndex * this.CHUNK_SIZE;
+      if (startPos >= buffer.length) break;
+      
+      const endPos = Math.min(startPos + this.CHUNK_SIZE, buffer.length);
+      const chunk = buffer.slice(startPos, endPos);
+      
+      // Try to extract XML from this chunk
+      const chunkResult = this.processChunk(chunk, startPos);
+      if (chunkResult) {
+        return chunkResult;
+      }
+    }
+    
+    console.warn('No valid XML found in any chunk of the PDF');
+    return null;
+  }
+
+  /**
+   * Process a single chunk of the PDF buffer
+   * @param chunk Chunk buffer to process
+   * @param chunkOffset Offset position of the chunk in the original buffer
+   * @returns XML content or null if not found
+   */
+  private processChunk(chunk: Uint8Array, chunkOffset: number): string | null {
+    try {
+      // First try UTF-8 encoding for this chunk
+      const utf8String = this.decodeBufferToString(chunk, 'utf-8');
+      let xmlContent = this.searchForXmlInString(utf8String);
+      
+      if (xmlContent) {
+        console.log(`Found XML content in chunk at offset ${chunkOffset} using UTF-8 encoding`);
+        return xmlContent;
+      }
+      
+      // If UTF-8 fails, try Latin-1 (ISO-8859-1) which can handle binary better
+      const latin1String = this.decodeBufferToString(chunk, 'latin1');
+      xmlContent = this.searchForXmlInString(latin1String);
+      
+      if (xmlContent) {
+        console.log(`Found XML content in chunk at offset ${chunkOffset} using Latin-1 encoding`);
+        return xmlContent;
+      }
+      
+      // No XML found in this chunk
+      return null;
+    } catch (error) {
+      console.warn(`Error processing chunk at offset ${chunkOffset}:`, error);
+      return null;
+    }
+  }
+
+  /**
+   * Safely decode a buffer to string using the specified encoding
+   * @param buffer Buffer to decode
+   * @param encoding Encoding to use ('utf-8' or 'latin1')
+   * @returns Decoded string
+   */
+  private decodeBufferToString(buffer: Uint8Array, encoding: 'utf-8' | 'latin1'): string {
+    try {
+      if (encoding === 'utf-8') {
+        return new TextDecoder('utf-8', { fatal: false }).decode(buffer);
+      } else {
+        // For Latin-1 we can use a direct mapping (bytes 0-255 map directly to code points 0-255)
+        // This is more reliable for binary data than TextDecoder for legacy encodings
+        return Array.from(buffer)
+          .map(byte => String.fromCharCode(byte))
+          .join('');
+      }
+    } catch (error) {
+      console.warn(`Error decoding buffer using ${encoding}:`, error);
+      // Return empty string on error to allow processing to continue
+      return '';
+    }
+  }
+
+  /**
+   * Search for XML patterns in a string
+   * @param content String to search in
+   * @returns XML content or null if not found
+   */
+  private searchForXmlInString(content: string): string | null {
+    if (!content) return null;
+    
+    // Search for each XML pattern
+    for (const pattern of this.XML_PATTERNS) {
+      const patternIndex = content.indexOf(pattern);
+      if (patternIndex !== -1) {
+        console.log(`Found XML pattern "${pattern}" at position ${patternIndex}`);
+        
+        // Try to extract the XML content starting from the pattern position
+        const xmlContent = this.extractXmlFromString(content, patternIndex);
+        
+        // Validate the extracted content
+        if (xmlContent && this.isValidXml(xmlContent)) {
+          console.log('Successfully extracted and validated XML from text');
+          return xmlContent;
+        }
+      }
+    }
+    
+    return null;
+  }
+}
--- a/ts/formats/pdf/pdf.embedder.ts
+++ b/ts/formats/pdf/pdf.embedder.ts
@@ -1,8 +1,33 @@
 import { PDFDocument, AFRelationship } from '../../plugins.js';
 import type { IPdf } from '../../interfaces/common.js';

+/**
+ * Error types for PDF embedding operations
+ */
+export enum PDFEmbedError {
+  LOAD_ERROR = 'PDF loading failed',
+  EMBED_ERROR = 'XML embedding failed',
+  SAVE_ERROR = 'PDF saving failed',
+  INVALID_INPUT = 'Invalid input parameters'
+}
+
+/**
+ * Result of a PDF embedding operation
+ */
+export interface PDFEmbedResult {
+  success: boolean;
+  data?: Uint8Array;
+  pdf?: IPdf;
+  error?: {
+    type: PDFEmbedError;
+    message: string;
+    originalError?: Error;
+  };
+}
+
 /**
 * Class for embedding XML into PDF files
+ * Provides robust error handling and support for different PDF formats
 */
 export class PDFEmbedder {
  /**
@@ -11,40 +36,92 @@ export class PDFEmbedder {
   * @param xmlContent XML content to embed
   * @param filename Filename for the embedded XML
   * @param description Description for the embedded XML
-   * @returns Modified PDF buffer
+   * @returns Result with either modified PDF buffer or error information
   */
  public async embedXml(
    pdfBuffer: Uint8Array | Buffer,
    xmlContent: string,
    filename: string = 'invoice.xml',
    description: string = 'XML Invoice'
-  ): Promise<Uint8Array> {
+  ): Promise<PDFEmbedResult> {
    try {
+      // Validate inputs
+      if (!pdfBuffer || pdfBuffer.length === 0) {
+        return this.createErrorResult(PDFEmbedError.INVALID_INPUT, 'PDF buffer is empty or undefined');
+      }
+      
+      if (!xmlContent) {
+        return this.createErrorResult(PDFEmbedError.INVALID_INPUT, 'XML content is empty or undefined');
+      }
+
+      // Ensure buffer is Uint8Array
+      const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
+
      // Load the PDF
-      const pdfDoc = await PDFDocument.load(pdfBuffer);
+      let pdfDoc: PDFDocument;
+      try {
+        pdfDoc = await PDFDocument.load(pdfBufferArray, {
+          ignoreEncryption: true,  // Try to load encrypted PDFs
+          updateMetadata: false    // Don't automatically update metadata
+        });
+      } catch (error) {
+        return this.createErrorResult(
+          PDFEmbedError.LOAD_ERROR,
+          `Failed to load PDF: ${error instanceof Error ? error.message : String(error)}`,
+          error instanceof Error ? error : undefined
+        );
+      }
+
+      // Normalize filename (lowercase with XML extension)
+      filename = this.normalizeFilename(filename);

      // Convert the XML string to a Uint8Array
      const xmlBuffer = new TextEncoder().encode(xmlContent);

-      // Make sure filename is lowercase (as required by documentation)
-      filename = filename.toLowerCase();
-
-      // Use pdf-lib's .attach() to embed the XML
-      pdfDoc.attach(xmlBuffer, filename, {
-        mimeType: 'text/xml',
-        description: description,
-        creationDate: new Date(),
-        modificationDate: new Date(),
-        afRelationship: AFRelationship.Alternative,
-      });
+      try {
+        // Use pdf-lib's .attach() to embed the XML
+        pdfDoc.attach(xmlBuffer, filename, {
+          mimeType: 'text/xml',
+          description: description,
+          creationDate: new Date(),
+          modificationDate: new Date(),
+          afRelationship: AFRelationship.Alternative,
+        });
+      } catch (error) {
+        return this.createErrorResult(
+          PDFEmbedError.EMBED_ERROR,
+          `Failed to embed XML: ${error instanceof Error ? error.message : String(error)}`,
+          error instanceof Error ? error : undefined
+        );
+      }

      // Save the modified PDF
-      const modifiedPdfBytes = await pdfDoc.save();
+      let modifiedPdfBytes: Uint8Array;
+      try {
+        modifiedPdfBytes = await pdfDoc.save({
+          addDefaultPage: false,           // Don't add a page if the document is empty
+          useObjectStreams: false,        // Better compatibility with older PDF readers
+          updateFieldAppearances: false   // Don't update form fields
+        });
+      } catch (error) {
+        return this.createErrorResult(
+          PDFEmbedError.SAVE_ERROR,
+          `Failed to save modified PDF: ${error instanceof Error ? error.message : String(error)}`,
+          error instanceof Error ? error : undefined
+        );
+      }

-      return modifiedPdfBytes;
+      return {
+        success: true,
+        data: modifiedPdfBytes
+      };
    } catch (error) {
-      console.error('Error embedding XML into PDF:', error);
-      throw error;
+      // Catch any uncaught errors
+      return this.createErrorResult(
+        PDFEmbedError.EMBED_ERROR,
+        `Unexpected error during XML embedding: ${error instanceof Error ? error.message : String(error)}`,
+        error instanceof Error ? error : undefined
+      );
    }
  }

@@ -56,7 +133,7 @@ export class PDFEmbedder {
   * @param description Description for the embedded XML
   * @param pdfName Name for the PDF
   * @param pdfId ID for the PDF
-   * @returns IPdf object with embedded XML
+   * @returns Result with either IPdf object or error information
   */
  public async createPdfWithXml(
    pdfBuffer: Uint8Array | Buffer,
@@ -65,16 +142,101 @@ export class PDFEmbedder {
    description: string = 'XML Invoice',
    pdfName: string = 'invoice.pdf',
    pdfId: string = `invoice-${Date.now()}`
-  ): Promise<IPdf> {
-    const modifiedPdfBytes = await this.embedXml(pdfBuffer, xmlContent, filename, description);
+  ): Promise<PDFEmbedResult> {
+    // Embed XML into PDF
+    const embedResult = await this.embedXml(pdfBuffer, xmlContent, filename, description);
+    
+    // If embedding failed, return the error
+    if (!embedResult.success || !embedResult.data) {
+      return embedResult;
+    }

-    return {
+    // Create IPdf object
+    const pdfObject: IPdf = {
      name: pdfName,
      id: pdfId,
      metadata: {
-        textExtraction: ''
+        textExtraction: '',
+        format: this.detectPdfFormat(xmlContent),
+        embeddedXml: {
+          filename: filename,
+          description: description
+        }
      },
-      buffer: modifiedPdfBytes
+      buffer: embedResult.data
+    };
+
+    return {
+      success: true,
+      pdf: pdfObject
    };
  }
-}
+
+  /**
+   * Normalizes the filename for embedding (described as a PDF/A requirement): lowercase, forced .xml extension, characters outside [a-z0-9_.-] replaced with '_'
+   * @param filename Filename to normalize
+   * @returns Normalized filename
+   */
+  private normalizeFilename(filename: string): string {
+    // Convert to lowercase
+    let normalized = filename.toLowerCase();
+    
+    // Ensure it has .xml extension
+    if (!normalized.endsWith('.xml')) {
+      normalized = normalized.replace(/\.[^/.]+$/, '') + '.xml';
+    }
+    
+    // Replace invalid characters
+    normalized = normalized.replace(/[^a-z0-9_.-]/g, '_');
+    
+    return normalized;
+  }
+
+  /**
+   * Tries to detect the format of the XML content
+   * @param xmlContent XML content
+   * @returns Format string or undefined
+   */
+  private detectPdfFormat(xmlContent: string): string | undefined {
+    if (xmlContent.includes('factur-x.eu') || xmlContent.includes('factur-x.xml')) {
+      return 'factur-x';
+    } else if (xmlContent.includes('zugferd') || xmlContent.includes('ZUGFeRD')) {
+      return 'zugferd';
+    } else if (xmlContent.includes('xrechnung')) {
+      return 'xrechnung';
+    } else if (xmlContent.includes('<Invoice') || xmlContent.includes('<CreditNote')) {
+      return 'ubl';
+    } else if (xmlContent.includes('FatturaElettronica')) {
+      return 'fatturapa';
+    }
+    
+    return undefined;
+  }
+
+  /**
+   * Creates an error result object
+   * @param type Error type
+   * @param message Error message
+   * @param originalError Original error object
+   * @returns Error result
+   */
+  private createErrorResult(
+    type: PDFEmbedError,
+    message: string,
+    originalError?: Error
+  ): PDFEmbedResult {
+    console.error(`PDF Embedder Error (${type}): ${message}`);
+    if (originalError) {
+      console.error(originalError);
+    }
+    
+    return {
+      success: false,
+      error: {
+        type,
+        message,
+        originalError
+      }
+    };
+  }
+}
--- a/ts/formats/pdf/pdf.extractor.ts
+++ b/ts/formats/pdf/pdf.extractor.ts
@@ -4,6 +4,32 @@ import {
  AssociatedFilesExtractor,
  TextXMLExtractor
 } from './extractors/index.js';
+import { FormatDetector } from '../utils/format.detector.js';
+import { InvoiceFormat } from '../../interfaces/common.js';
+
+/**
+ * Error types for PDF extraction operations
+ */
+export enum PDFExtractError {
+  EXTRACT_ERROR = 'XML extraction failed',
+  INVALID_INPUT = 'Invalid input parameters',
+  NO_XML_FOUND = 'No XML found in PDF'
+}
+
+/**
+ * Result of a PDF extraction operation
+ */
+export interface PDFExtractResult {
+  success: boolean;
+  xml?: string;
+  format?: InvoiceFormat;
+  extractorUsed?: string;
+  error?: {
+    type: PDFExtractError;
+    message: string;
+    originalError?: Error;
+  };
+}

 /**
 * Main PDF extractor class that orchestrates the extraction process
@@ -18,9 +44,9 @@ export class PDFExtractor {
  constructor() {
    // Add extractors in order of preference/likelihood of success
    this.extractors.push(
-      new StandardXMLExtractor(),    // Standard PDF/A-3 embedded files
-      new AssociatedFilesExtractor(), // Associated files (ZUGFeRD v1, some Factur-X)
-      new TextXMLExtractor()          // Text-based extraction (fallback)
+      new StandardXMLExtractor(),      // Standard PDF/A-3 embedded files
+      new AssociatedFilesExtractor(),  // Associated files (ZUGFeRD v1, some Factur-X)
+      new TextXMLExtractor()           // Text-based extraction (fallback)
    );
  }

@@ -28,36 +54,88 @@ export class PDFExtractor {
   * Extract XML from a PDF buffer
   * Tries multiple extraction methods in sequence
   * @param pdfBuffer PDF buffer
-   * @returns XML content or null if not found
+   * @returns Result with either the extracted XML or error information
   */
-  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<string | null> {
+  public async extractXml(pdfBuffer: Uint8Array | Buffer): Promise<PDFExtractResult> {
    try {
      console.log('Starting XML extraction from PDF...');

+      // Validate input
+      if (!pdfBuffer || pdfBuffer.length === 0) {
+        return this.createErrorResult(PDFExtractError.INVALID_INPUT, 'PDF buffer is empty or undefined');
+      }
+
+      // Ensure buffer is Uint8Array
+      const pdfBufferArray = Buffer.isBuffer(pdfBuffer) ? new Uint8Array(pdfBuffer) : pdfBuffer;
+
      // Try each extractor in sequence
      for (const extractor of this.extractors) {
        const extractorName = extractor.constructor.name;
        console.log(`Trying extraction with ${extractorName}...`);

-        const xml = await extractor.extractXml(pdfBuffer);
-        if (xml) {
-          console.log(`Successfully extracted XML using ${extractorName}`);
-          return xml;
+        try {
+          const xml = await extractor.extractXml(pdfBufferArray);
+          
+          if (xml) {
+            console.log(`Successfully extracted XML using ${extractorName}`);
+            
+            // Detect format of the extracted XML
+            const format = FormatDetector.detectFormat(xml);
+            
+            return {
+              success: true,
+              xml,
+              format,
+              extractorUsed: extractorName
+            };
+          }
+          
+          console.log(`Extraction with ${extractorName} failed, trying next method...`);
+        } catch (error) {
+          // Log error but continue with next extractor
+          console.warn(`Error using ${extractorName}: ${error instanceof Error ? error.message : String(error)}`);
        }
-
-        console.log(`Extraction with ${extractorName} failed, trying next method...`);
      }

-      // If all extractors fail, return null
-      console.warn('All extraction methods failed, no valid XML found in PDF');
-      return null;
+      // If all extractors fail, return a no XML found error
+      return this.createErrorResult(
+        PDFExtractError.NO_XML_FOUND,
+        'All extraction methods failed, no valid XML found in PDF'
+      );
    } catch (error) {
-      console.error('Error extracting XML from PDF:', error);
-      return null;
+      // Handle any unexpected errors
+      return this.createErrorResult(
+        PDFExtractError.EXTRACT_ERROR,
+        `Unexpected error during XML extraction: ${error instanceof Error ? error.message : String(error)}`,
+        error instanceof Error ? error : undefined
+      );
    }
  }

-
-
-
-}
+  /**
+   * Create a PDF extract result with error information
+   * @param type Error type
+   * @param message Error message
+   * @param originalError Original error object
+   * @returns Error result
+   */
+  private createErrorResult(
+    type: PDFExtractError,
+    message: string,
+    originalError?: Error
+  ): PDFExtractResult {
+    console.error(`PDF Extractor Error (${type}): ${message}`);
+    if (originalError) {
+      console.error(originalError);
+    }
+    
+    return {
+      success: false,
+      error: {
+        type,
+        message,
+        originalError
+      }
+    };
+  }
+}