// smartdata/ts/classes.lucene.adapter.ts

/**
 * Lucene to MongoDB query adapter for SmartData
 */
import * as plugins from './plugins.js';

// Types
//
// AST produced by LuceneParser and consumed by LuceneToMongoTransformer.
// Every node carries a literal `type` tag, so `AnyQueryNode` is a discriminated union.

/** All node tags; the three boolean operators share one node shape (BooleanNode). */
type NodeType = 'TERM' | 'PHRASE' | 'FIELD' | 'AND' | 'OR' | 'NOT' | 'RANGE' | 'WILDCARD' | 'FUZZY' | 'GROUP';

/** Common base: only the discriminating tag. */
interface QueryNode {
  type: NodeType;
}

/** A single bare word. */
interface TermNode extends QueryNode {
  type: 'TERM';
  value: string;
  // presumably Lucene's `^` boost factor — TODO confirm it is populated by the parser
  boost?: number;
}

/** A double-quoted phrase; `value` is the text without the surrounding quotes. */
interface PhraseNode extends QueryNode {
  type: 'PHRASE';
  value: string;
  // Number following a `~` directly after the phrase; undefined when no `~` was given.
  proximity?: number;
}

/** `field:value` — restricts the nested node to one document field. */
interface FieldNode extends QueryNode {
  type: 'FIELD';
  field: string;
  value: AnyQueryNode;
}

/**
 * Binary boolean operator. For 'NOT' the transformer treats `left` as the
 * included side and `right` as the excluded side.
 */
interface BooleanNode extends QueryNode {
  type: 'AND' | 'OR' | 'NOT';
  left: AnyQueryNode;
  right: AnyQueryNode;
}

/** `[lower TO upper]` / `{lower TO upper}` range. */
interface RangeNode extends QueryNode {
  type: 'RANGE';
  // The parser emits '' here; the enclosing FIELD node supplies the name during transformation.
  field: string;
  // Raw bound tokens; the transformer treats '*' as "unbounded".
  lower: string;
  upper: string;
  // True when the corresponding bracket is square (inclusive bound).
  includeLower: boolean;
  includeUpper: boolean;
}

/** A term containing `*` and/or `?`. */
interface WildcardNode extends QueryNode {
  type: 'WILDCARD';
  value: string;
}

/** `term~` or `term~N`. */
interface FuzzyNode extends QueryNode {
  type: 'FUZZY';
  value: string;
  // The parser defaults this to 2 when no number follows the tilde.
  maxEdits: number;
}

/** A parenthesised sub-expression. */
interface GroupNode extends QueryNode {
  type: 'GROUP';
  value: AnyQueryNode;
}

/** Union of every concrete node; switch on `type` to narrow. */
type AnyQueryNode = TermNode | PhraseNode | FieldNode | BooleanNode | RangeNode | WildcardNode | FuzzyNode | GroupNode;

/**
 * Lucene query parser
 *
 * A small recursive-descent parser: `parse()` tokenizes the query and then
 * builds an AST of query nodes.
 *
 * Known limitations (behaviour kept as-is):
 * - Boolean operators are right-associative with no precedence between
 *   AND / OR / NOT (`a AND b OR c` parses as `a AND (b OR c)`).
 * - NOT (and a standalone `-`) is a binary operator: `left NOT right`.
 * - Tokens that remain after the first complete expression are ignored
 *   (there is no implicit operator between adjacent terms).
 */
export class LuceneParser {
  private pos: number = 0;
  private input: string = '';
  private tokens: string[] = [];

  constructor() {}

  /**
   * Parse a Lucene query string into an AST
   *
   * @param query - the raw Lucene query; surrounding whitespace is trimmed
   * @returns the root node of the parsed expression
   * @throws Error on empty input, unterminated quotes, unclosed groups,
   *         a field without a value, or malformed range / proximity syntax
   */
  parse(query: string): AnyQueryNode {
    this.input = query.trim();
    this.pos = 0;
    this.tokens = this.tokenize(this.input);

    return this.parseQuery();
  }

  /**
   * Tokenize the input string into tokens
   *
   * - A double-quoted span becomes one token that keeps both quotes.
   * - Each of `( ) [ ] { } ~ ^ :` is emitted as its own token.
   * - Everything else is split on whitespace; operators (AND, OR, NOT, TO)
   *   therefore surface as ordinary whitespace-delimited tokens.
   *
   * @throws Error when a quoted phrase is never closed
   */
  private tokenize(input: string): string[] {
    const specialChars = /[()\[\]{}"~^:]/;
    const whitespace = /\s/;

    const tokens: string[] = [];
    let current = '';
    let inQuote = false;

    // Emit the word accumulated so far, if any.
    const flush = (): void => {
      if (current) {
        tokens.push(current);
        current = '';
      }
    };

    for (const char of input) {
      // Handle quoted strings
      if (char === '"') {
        if (inQuote) {
          tokens.push(current + char);
          current = '';
          inQuote = false;
        } else {
          flush();
          current = char;
          inQuote = true;
        }
        continue;
      }

      // Inside quotes everything (including whitespace and special chars) is literal
      if (inQuote) {
        current += char;
        continue;
      }

      // Any whitespace (including \r) terminates the current word
      if (whitespace.test(char)) {
        flush();
        continue;
      }

      // Special characters terminate the current word and stand alone
      if (specialChars.test(char)) {
        flush();
        tokens.push(char);
        continue;
      }

      current += char;
    }

    if (inQuote) {
      // Previously the dangling `"text` was silently treated as a term
      throw new Error('Unterminated quoted phrase');
    }

    flush();
    return tokens;
  }

  /**
   * Parse the main query expression: one operand, optionally followed by
   * `AND` / `OR` / `NOT` / `-` and another (recursively parsed) expression.
   */
  private parseQuery(): AnyQueryNode {
    const left = this.parseBooleanOperand();

    if (this.pos < this.tokens.length) {
      const token = this.tokens[this.pos];

      if (token === 'AND' || token === 'OR') {
        this.pos++;
        const right = this.parseQuery();
        return {
          type: token as 'AND' | 'OR',
          left,
          right
        };
      } else if (token === 'NOT' || token === '-') {
        this.pos++;
        const right = this.parseQuery();
        return {
          type: 'NOT',
          left,
          right
        };
      }
    }

    return left;
  }

  /**
   * Parse boolean operands (terms, phrases, fields, groups)
   *
   * The checks run in a fixed order: group, field, range, phrase, wildcard,
   * fuzzy and finally plain term.
   *
   * @throws Error when the input ends where an operand is required
   */
  private parseBooleanOperand(): AnyQueryNode {
    if (this.pos >= this.tokens.length) {
      throw new Error('Unexpected end of input');
    }

    const token = this.tokens[this.pos];

    // Handle grouping with parentheses
    if (token === '(') {
      this.pos++;
      const inner = this.parseQuery();

      if (this.pos < this.tokens.length && this.tokens[this.pos] === ')') {
        this.pos++;
        const group: GroupNode = { type: 'GROUP', value: inner };
        return group;
      }
      throw new Error('Unclosed group');
    }

    // Handle fields (field:value)
    if (this.pos + 1 < this.tokens.length && this.tokens[this.pos + 1] === ':') {
      this.pos += 2; // Skip field and colon

      if (this.pos >= this.tokens.length) {
        throw new Error('Expected value after field');
      }
      const fieldNode: FieldNode = {
        type: 'FIELD',
        field: token,
        value: this.parseBooleanOperand()
      };
      return fieldNode;
    }

    // Handle range queries
    if (token === '[' || token === '{') {
      return this.parseRange();
    }

    // Handle phrases ("term term")
    if (token.length >= 2 && token.startsWith('"') && token.endsWith('"')) {
      this.pos++;

      // Check for proximity operator
      let proximity: number | undefined;
      if (this.pos < this.tokens.length && this.tokens[this.pos] === '~') {
        this.pos++;
        if (this.pos < this.tokens.length && /^\d+$/.test(this.tokens[this.pos])) {
          proximity = parseInt(this.tokens[this.pos], 10);
          this.pos++;
        } else {
          throw new Error('Expected number after proximity operator');
        }
      }

      const phrase: PhraseNode = { type: 'PHRASE', value: token.slice(1, -1), proximity };
      return phrase;
    }

    // Handle wildcards
    if (token.includes('*') || token.includes('?')) {
      this.pos++;
      const wildcard: WildcardNode = { type: 'WILDCARD', value: token };
      return wildcard;
    }

    // Handle fuzzy searches
    if (this.pos + 1 < this.tokens.length && this.tokens[this.pos + 1] === '~') {
      this.pos += 2; // Skip term and tilde

      let maxEdits = 2; // Default
      if (this.pos < this.tokens.length && /^\d+$/.test(this.tokens[this.pos])) {
        maxEdits = parseInt(this.tokens[this.pos], 10);
        this.pos++;
      }

      const fuzzy: FuzzyNode = { type: 'FUZZY', value: token, maxEdits };
      return fuzzy;
    }

    // Simple term, optionally followed by a `^boost` suffix
    this.pos++;
    const term: TermNode = { type: 'TERM', value: token };
    const boost = this.parseBoost();
    if (boost !== undefined) {
      term.boost = boost;
    }
    return term;
  }

  /**
   * Consume a `^<number>` suffix at the current position, if present.
   * Without this the `^` token stops parsing and everything after it
   * (e.g. `AND other`) would be dropped.
   *
   * @returns the boost factor, or undefined when no boost follows
   */
  private parseBoost(): number | undefined {
    const factor = this.tokens[this.pos + 1];
    if (this.tokens[this.pos] === '^' && factor !== undefined && /^\d+(\.\d+)?$/.test(factor)) {
      this.pos += 2;
      return parseFloat(factor);
    }
    return undefined;
  }

  /**
   * Parse range queries
   *
   * Expects exactly five tokens starting at the current position:
   * `[` or `{`, lower bound, `TO`, upper bound, `]` or `}`.
   * Square brackets mark an inclusive bound, curly braces an exclusive one;
   * the two ends may differ (`[1 TO 5}`).
   *
   * @throws Error when tokens are missing, `TO` is absent, or the range is
   *         not closed by `]` / `}`
   */
  private parseRange(): RangeNode {
    // The closing bracket lives at pos + 4, so that index must exist.
    if (this.pos + 4 >= this.tokens.length) {
      throw new Error('Invalid range query syntax');
    }

    const [open, lower, separator, upper, close] = this.tokens.slice(this.pos, this.pos + 5);

    if (separator !== 'TO') {
      throw new Error('Expected TO in range query');
    }

    if (close !== ']' && close !== '}') {
      throw new Error('Invalid range query closing bracket');
    }

    this.pos += 5;

    // For simplicity, assuming the field is handled separately
    return {
      type: 'RANGE',
      field: '', // This will be filled by the field node
      lower,
      upper,
      includeLower: open === '[',
      includeUpper: close === ']'
    };
  }
}

/**
 * Transformer for Lucene AST to MongoDB query
 *
 * Bare terms and phrases become either a case-insensitive `$regex` over the
 * supplied search fields, or a `$text` search when no fields are given
 * (which requires a text index on the collection). Field-scoped nodes are
 * translated to conditions on that field.
 */
export class LuceneToMongoTransformer {
  constructor() {}

  /**
   * Transform a Lucene AST node to a MongoDB query
   *
   * @param node - root of the (sub)tree to translate
   * @param searchFields - fields to search for nodes that carry no explicit
   *        field; forwarded through groups and boolean operators
   * @returns a MongoDB filter object
   * @throws Error for an unknown node type
   */
  transform(node: AnyQueryNode, searchFields?: string[]): any {
    switch (node.type) {
      case 'TERM':
        return this.transformTerm(node, searchFields);
      case 'PHRASE':
        return this.transformPhrase(node, searchFields);
      case 'FIELD':
        return this.transformField(node);
      case 'AND':
        return this.transformAnd(node, searchFields);
      case 'OR':
        return this.transformOr(node, searchFields);
      case 'NOT':
        return this.transformNot(node, searchFields);
      case 'RANGE':
        return this.transformRange(node);
      case 'WILDCARD':
        return this.transformWildcard(node, searchFields);
      case 'FUZZY':
        return this.transformFuzzy(node, searchFields);
      case 'GROUP':
        return this.transform(node.value, searchFields);
      default:
        throw new Error(`Unsupported node type: ${(node as any).type}`);
    }
  }

  /**
   * Transform a term to MongoDB query
   */
  private transformTerm(node: TermNode, searchFields?: string[]): any {
    // If specific fields are provided, search across those fields
    if (searchFields && searchFields.length > 0) {
      // The term is user text, not a pattern: escape regex metacharacters
      const pattern = this.escapeRegex(node.value);
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: pattern, $options: 'i' }
        }))
      };
    }

    // Otherwise, use text search (requires a text index on desired fields)
    return { $text: { $search: node.value } };
  }

  /**
   * Transform a phrase to MongoDB query
   */
  private transformPhrase(node: PhraseNode, searchFields?: string[]): any {
    // If specific fields are provided, search phrase across those fields
    if (searchFields && searchFields.length > 0) {
      // Words are matched literally; any run of whitespace between them is accepted
      const pattern = node.value
        .split(/\s+/)
        .map(word => this.escapeRegex(word))
        .join('\\s+');
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: pattern, $options: 'i' }
        }))
      };
    }

    // Without fields, a quoted $text search string requests a phrase match
    return { $text: { $search: `"${node.value}"` } };
  }

  /**
   * Transform a field query to MongoDB query
   */
  private transformField(node: FieldNode): any {
    return this.applyField(node.field, node.value);
  }

  /**
   * Build the condition for `field:<value>`.
   *
   * Groups and boolean operators are distributed over their operands, so
   * `tag:(a OR b)` becomes `{ $or: [{ tag: 'a' }, { tag: 'b' }] }` instead of
   * nesting a logical operator under the field name. A nested `other:x`
   * inside the group keeps its own field.
   */
  private applyField(field: string, value: AnyQueryNode): any {
    switch (value.type) {
      case 'RANGE':
        // Copy instead of writing the field back into the caller's AST
        return this.transformRange({ ...value, field });
      case 'WILDCARD':
        return {
          [field]: { $regex: this.luceneWildcardToRegex(value.value), $options: 'i' }
        };
      case 'FUZZY':
        return {
          [field]: { $regex: this.createFuzzyRegex(value.value), $options: 'i' }
        };
      case 'TERM':
        // Exact (string) equality on the field
        return { [field]: value.value };
      case 'PHRASE':
        // Whole-value, case-insensitive match of the literal phrase
        return {
          [field]: { $regex: `^${this.escapeRegex(value.value)}$`, $options: 'i' }
        };
      case 'GROUP':
        return this.applyField(field, value.value);
      case 'AND':
        return { $and: [this.applyField(field, value.left), this.applyField(field, value.right)] };
      case 'OR':
        return { $or: [this.applyField(field, value.left), this.applyField(field, value.right)] };
      case 'NOT':
        return {
          $and: [
            this.applyField(field, value.left),
            { $nor: [this.applyField(field, value.right)] }
          ]
        };
      case 'FIELD':
        return this.transformField(value);
      default:
        throw new Error(`Unsupported node type: ${(value as any).type}`);
    }
  }

  /**
   * Transform AND operator to MongoDB query
   */
  private transformAnd(node: BooleanNode, searchFields?: string[]): any {
    return {
      $and: [this.transform(node.left, searchFields), this.transform(node.right, searchFields)]
    };
  }

  /**
   * Transform OR operator to MongoDB query
   */
  private transformOr(node: BooleanNode, searchFields?: string[]): any {
    return {
      $or: [this.transform(node.left, searchFields), this.transform(node.right, searchFields)]
    };
  }

  /**
   * Transform NOT operator to MongoDB query: match `left` but exclude `right`.
   *
   * `$not` is only valid inside a field condition, so the exclusion is
   * expressed with the top-level `$nor` operator. When both sides are plain
   * text searches they are merged into one `$text` expression using the
   * `-term` negation syntax, because MongoDB allows neither two `$text`
   * expressions in one query nor `$text` inside `$nor`.
   */
  private transformNot(node: BooleanNode, searchFields?: string[]): any {
    const included = this.transform(node.left, searchFields);
    const excluded = this.transform(node.right, searchFields);

    // Only a single term/phrase can safely be negated with a `-` prefix
    let excludedNode: AnyQueryNode = node.right;
    while (excludedNode.type === 'GROUP') {
      excludedNode = excludedNode.value;
    }
    const isSimpleText = excludedNode.type === 'TERM' || excludedNode.type === 'PHRASE';

    if (isSimpleText && included.$text && excluded.$text) {
      return {
        $text: { $search: `${included.$text.$search} -${excluded.$text.$search}` }
      };
    }

    // NOTE(review): if only `excluded` is a $text search this still places
    // $text under $nor, which MongoDB rejects; supply search fields to avoid it.
    return { $and: [included, { $nor: [excluded] }] };
  }

  /**
   * Transform range query to MongoDB query
   *
   * A `*` bound means "unbounded" on that side. A fully open range
   * (`[* TO *]`) is translated to an existence check.
   */
  private transformRange(node: RangeNode): any {
    const range: Record<string, unknown> = {};

    if (node.lower !== '*') {
      range[node.includeLower ? '$gte' : '$gt'] = this.parseValue(node.lower);
    }

    if (node.upper !== '*') {
      range[node.includeUpper ? '$lte' : '$lt'] = this.parseValue(node.upper);
    }

    if (Object.keys(range).length === 0) {
      // `{ field: {} }` would only match documents whose value is an empty object
      return { [node.field]: { $exists: true } };
    }

    return { [node.field]: range };
  }

  /**
   * Transform wildcard query to MongoDB query
   */
  private transformWildcard(node: WildcardNode, searchFields?: string[]): any {
    // Convert Lucene wildcards to MongoDB regex
    const regex = this.luceneWildcardToRegex(node.value);

    // If specific fields are provided, search wildcard across those fields
    if (searchFields && searchFields.length > 0) {
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: regex, $options: 'i' }
        }))
      };
    }

    // NOTE(review): without search fields the result names no field, so it is
    // only usable as a per-field condition, not as a standalone filter.
    return { $regex: regex, $options: 'i' };
  }

  /**
   * Transform fuzzy query to MongoDB query
   */
  private transformFuzzy(node: FuzzyNode, searchFields?: string[]): any {
    // MongoDB doesn't have built-in fuzzy search
    // This is a very basic approach using regex (node.maxEdits is not used)
    const regex = this.createFuzzyRegex(node.value);

    // If specific fields are provided, search fuzzy term across those fields
    if (searchFields && searchFields.length > 0) {
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: regex, $options: 'i' }
        }))
      };
    }

    // NOTE(review): same caveat as transformWildcard — no field is named here.
    return { $regex: regex, $options: 'i' };
  }

  /**
   * Escape every regex metacharacter so `text` matches literally.
   */
  private escapeRegex(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Convert Lucene wildcards to MongoDB regex patterns
   *
   * `*` becomes `.*`, `?` becomes `.`; all other regex metacharacters are
   * escaped. The pattern is not anchored.
   */
  private luceneWildcardToRegex(wildcardPattern: string): string {
    return wildcardPattern
      .replace(/[.+^${}()|[\]\\]/g, '\\$&') // Escape regex special chars (except * and ?)
      .replace(/\*/g, '.*')
      .replace(/\?/g, '.');
  }

  /**
   * Create a simplified fuzzy search regex
   *
   * Every second character (odd index) is made optional; characters are
   * escaped so that terms containing regex metacharacters stay valid.
   */
  private createFuzzyRegex(term: string): string {
    let regex = '';
    for (let i = 0; i < term.length; i++) {
      const literal = this.escapeRegex(term[i]);
      regex += i % 2 === 1 ? `${literal}?` : literal;
    }
    return regex;
  }

  /**
   * Parse string values to appropriate types (numbers, dates, etc.)
   *
   * Integers and decimals become numbers. Only ISO-8601-looking values
   * (`YYYY-MM[-DD][T...]`) are tried as dates: the generic `Date` parser is
   * engine-dependent and lenient enough to accept some ordinary words, which
   * would break string ranges.
   */
  private parseValue(value: string): any {
    // Try to parse as number
    if (/^-?\d+$/.test(value)) {
      return parseInt(value, 10);
    }

    if (/^-?\d+\.\d+$/.test(value)) {
      return parseFloat(value);
    }

    // Try to parse as date (ISO-like values only)
    if (/^\d{4}-\d{2}(?:-\d{2})?(?:[T ].*)?$/.test(value)) {
      const date = new Date(value);
      if (!Number.isNaN(date.getTime())) {
        return date;
      }
    }

    // Default to string
    return value;
  }
}

/**
 * Main adapter class
 *
 * Wires a LuceneParser and a LuceneToMongoTransformer together so that a
 * Lucene query string can be converted to a MongoDB filter in one call.
 */
export class SmartdataLuceneAdapter {
  private parser: LuceneParser;
  private transformer: LuceneToMongoTransformer;
  private defaultSearchFields: string[] = [];

  /**
   * @param defaultSearchFields - Optional array of field names to search across when no field is specified
   */
  constructor(defaultSearchFields?: string[]) {
    this.parser = new LuceneParser();
    this.transformer = new LuceneToMongoTransformer();
    if (defaultSearchFields) {
      this.defaultSearchFields = defaultSearchFields;
    }
  }

  /**
   * Convert a Lucene query string to a MongoDB query object
   * @param luceneQuery - The Lucene query string to convert
   * @param searchFields - Optional array of field names to search across (overrides defaultSearchFields)
   * @returns the MongoDB filter produced by the transformer
   * @throws Error wrapping any parser or transformer failure, prefixed with
   *         "Failed to convert Lucene query: "
   */
  convert(luceneQuery: string, searchFields?: string[]): any {
    try {
      // Parse the Lucene query into an AST
      const ast = this.parser.parse(luceneQuery);

      // Use provided searchFields, fall back to defaultSearchFields
      const fieldsToSearch = searchFields || this.defaultSearchFields;

      // Transform the AST to a MongoDB query
      return this.transformWithFields(ast, fieldsToSearch);
    } catch (error) {
      throw new Error(`Failed to convert Lucene query: ${error}`);
    }
  }

  /**
   * Helper method to transform the AST with field information
   *
   * The search fields are handed to the transformer for every root node
   * type. Previously only a root TERM received them, so phrase, wildcard,
   * fuzzy and grouped queries ignored the configured fields.
   */
  private transformWithFields(node: AnyQueryNode, searchFields: string[]): any {
    return this.transformer.transform(node, searchFields);
  }
}