/**
 * Lucene to MongoDB query adapter for SmartData
 */
import * as plugins from './plugins.js';

// Types
type NodeType =
  | 'TERM'
  | 'PHRASE'
  | 'FIELD'
  | 'AND'
  | 'OR'
  | 'NOT'
  | 'RANGE'
  | 'WILDCARD'
  | 'FUZZY'
  | 'GROUP';

interface QueryNode {
  type: NodeType;
}

interface TermNode extends QueryNode {
  type: 'TERM';
  value: string;
  boost?: number;
}

interface PhraseNode extends QueryNode {
  type: 'PHRASE';
  value: string;
  proximity?: number;
}

interface FieldNode extends QueryNode {
  type: 'FIELD';
  field: string;
  value: AnyQueryNode;
}

interface BooleanNode extends QueryNode {
  type: 'AND' | 'OR' | 'NOT';
  left: AnyQueryNode;
  right: AnyQueryNode;
}

interface RangeNode extends QueryNode {
  type: 'RANGE';
  field: string;
  lower: string;
  upper: string;
  includeLower: boolean;
  includeUpper: boolean;
}

interface WildcardNode extends QueryNode {
  type: 'WILDCARD';
  value: string;
}

interface FuzzyNode extends QueryNode {
  type: 'FUZZY';
  value: string;
  maxEdits: number;
}

interface GroupNode extends QueryNode {
  type: 'GROUP';
  value: AnyQueryNode;
}

type AnyQueryNode =
  | TermNode
  | PhraseNode
  | FieldNode
  | BooleanNode
  | RangeNode
  | WildcardNode
  | FuzzyNode
  | GroupNode;

/**
 * Lucene query parser.
 *
 * Turns a Lucene-style query string into an AST of `AnyQueryNode`s using a
 * small hand-written tokenizer and a recursive-descent parser.
 */
export class LuceneParser {
  private pos: number = 0;
  private input: string = '';
  private tokens: string[] = [];

  constructor() {}

  /**
   * Parse a Lucene query string into an AST.
   *
   * NOTE(review): tokens that remain after one complete expression has been
   * parsed (e.g. the second word of `foo bar`) are not consumed and are
   * silently ignored.
   *
   * @param query - The Lucene query string
   * @returns The root node of the parsed query
   * @throws Error when the query is empty or structurally invalid
   */
  parse(query: string): AnyQueryNode {
    this.input = query.trim();
    this.pos = 0;
    this.tokens = this.tokenize(this.input);
    return this.parseQuery();
  }

  /**
   * Split the input into tokens.
   *
   * Double-quoted strings become a single token that keeps its quotes, each of
   * the characters `( ) [ ] { } " ~ ^ :` outside quotes is its own token, and
   * any whitespace separates tokens.
   */
  private tokenize(input: string): string[] {
    const specialChars = /[()\[\]{}"~^:]/;
    const whitespace = /\s/;
    const tokens: string[] = [];
    let current = '';
    let inQuote = false;

    // Emit the pending token, if there is one.
    const flush = (): void => {
      if (current) {
        tokens.push(current);
        current = '';
      }
    };

    for (const char of input) {
      // Handle quoted strings
      if (char === '"') {
        if (inQuote) {
          tokens.push(current + char);
          current = '';
          inQuote = false;
        } else {
          flush();
          current = char;
          inQuote = true;
        }
        continue;
      }

      if (inQuote) {
        current += char;
        continue;
      }

      // Any whitespace (including \r) ends the current token
      if (whitespace.test(char)) {
        flush();
        continue;
      }

      // Special characters are always tokens of their own
      if (specialChars.test(char)) {
        flush();
        tokens.push(char);
        continue;
      }

      current += char;
    }

    // An unterminated quote ends up here as a token with only a leading quote
    flush();
    return tokens;
  }

  /**
   * Parse the main query expression: an operand optionally followed by
   * `AND` / `OR` / `NOT` (or a standalone `-`) and another expression.
   * Operators are right-associative because the right side recurses.
   */
  private parseQuery(): AnyQueryNode {
    const left = this.parseBooleanOperand();

    if (this.pos >= this.tokens.length) {
      return left;
    }

    const token = this.tokens[this.pos];

    if (token === 'AND' || token === 'OR') {
      this.pos++;
      const right = this.parseQuery();
      const node: BooleanNode = { type: token, left, right };
      return node;
    }

    if (token === 'NOT' || token === '-') {
      this.pos++;
      const right = this.parseQuery();
      const node: BooleanNode = { type: 'NOT', left, right };
      return node;
    }

    return left;
  }

  /**
   * Parse boolean operands (terms, phrases, fields, groups, ranges,
   * wildcards and fuzzy terms).
   *
   * @throws Error on unexpected end of input, an unclosed group, a field
   *   without a value, or a proximity operator without a number
   */
  private parseBooleanOperand(): AnyQueryNode {
    if (this.pos >= this.tokens.length) {
      throw new Error('Unexpected end of input');
    }

    const token = this.tokens[this.pos];
    const nextToken = this.tokens[this.pos + 1];

    // Handle grouping with parentheses
    if (token === '(') {
      this.pos++;
      const group = this.parseQuery();
      if (this.tokens[this.pos] !== ')') {
        throw new Error('Unclosed group');
      }
      this.pos++;
      const node: GroupNode = { type: 'GROUP', value: group };
      return node;
    }

    // Handle fields (field:value)
    if (nextToken === ':') {
      this.pos += 2; // Skip field and colon
      if (this.pos >= this.tokens.length) {
        throw new Error('Expected value after field');
      }
      const value = this.parseBooleanOperand();
      const node: FieldNode = { type: 'FIELD', field: token, value };
      return node;
    }

    // Handle range queries
    if (token === '[' || token === '{') {
      return this.parseRange();
    }

    // Handle phrases ("term term")
    if (token.startsWith('"') && token.endsWith('"')) {
      this.pos++;

      // Check for proximity operator ("a b"~3)
      let proximity: number | undefined;
      if (this.tokens[this.pos] === '~') {
        this.pos++;
        const distance = this.tokens[this.pos];
        if (distance === undefined || !/^\d+$/.test(distance)) {
          throw new Error('Expected number after proximity operator');
        }
        proximity = parseInt(distance, 10);
        this.pos++;
      }

      const node: PhraseNode = { type: 'PHRASE', value: token.slice(1, -1), proximity };
      return node;
    }

    // Handle wildcards
    if (token.includes('*') || token.includes('?')) {
      this.pos++;
      const node: WildcardNode = { type: 'WILDCARD', value: token };
      return node;
    }

    // Handle fuzzy searches (term~ or term~N)
    if (nextToken === '~') {
      this.pos += 2; // Skip term and tilde
      let maxEdits = 2; // Default
      const edits = this.tokens[this.pos];
      if (edits !== undefined && /^\d+$/.test(edits)) {
        maxEdits = parseInt(edits, 10);
        this.pos++;
      }
      const node: FuzzyNode = { type: 'FUZZY', value: token, maxEdits };
      return node;
    }

    // Simple term
    this.pos++;
    const node: TermNode = { type: 'TERM', value: token };
    return node;
  }

  /**
   * Parse a range query of the form `[lower TO upper]`.
   *
   * `[` / `]` mark an inclusive bound and `{` / `}` an exclusive bound; the two
   * ends may be mixed (e.g. `[1 TO 5}`). The returned node has an empty
   * `field`; the transformer fills it in from the enclosing field node.
   *
   * @throws Error when the range is incomplete, lacks `TO`, or is not closed
   *   by `]` or `}`
   */
  private parseRange(): RangeNode {
    // A complete range is exactly five tokens: open, lower, TO, upper, close.
    // `pos` still points at the opening bracket here, so the closing bracket
    // must sit at `pos + 4`.
    if (this.pos + 4 >= this.tokens.length) {
      throw new Error('Invalid range query syntax');
    }

    const [open, lower, separator, upper, close] = this.tokens.slice(this.pos, this.pos + 5);

    if (separator !== 'TO') {
      throw new Error('Expected TO in range query');
    }

    if (close !== ']' && close !== '}') {
      throw new Error('Invalid range query closing bracket');
    }

    this.pos += 5;

    return {
      type: 'RANGE',
      field: '', // This will be filled by the field node
      lower,
      upper,
      includeLower: open === '[',
      includeUpper: close === ']',
    };
  }
}

/**
 * Transformer for Lucene AST to MongoDB query
 */
export class LuceneToMongoTransformer {
  constructor() {}

  /**
   * Transform a Lucene AST node to a MongoDB query.
   *
   * @param node - The AST node to transform
   * @param searchFields - Fields that un-fielded terms, phrases, wildcards and
   *   fuzzy terms are matched against. They are passed down through groups and
   *   boolean operators, so every leaf of a compound query is scoped to the
   *   same fields.
   * @returns A MongoDB filter object
   * @throws Error for an unknown node type
   */
  transform(node: AnyQueryNode, searchFields?: string[]): any {
    switch (node.type) {
      case 'TERM':
        return this.transformTerm(node, searchFields);
      case 'PHRASE':
        return this.transformPhrase(node, searchFields);
      case 'FIELD':
        return this.transformField(node);
      case 'AND':
        return this.transformAnd(node, searchFields);
      case 'OR':
        return this.transformOr(node, searchFields);
      case 'NOT':
        return this.transformNot(node, searchFields);
      case 'RANGE':
        return this.transformRange(node);
      case 'WILDCARD':
        return this.transformWildcard(node, searchFields);
      case 'FUZZY':
        return this.transformFuzzy(node, searchFields);
      case 'GROUP':
        return this.transform(node.value, searchFields);
      default:
        throw new Error(`Unsupported node type: ${(node as any).type}`);
    }
  }

  /**
   * Transform a term to a MongoDB query: an `$or` of case-insensitive regex
   * matches when search fields are given, otherwise a `$text` search.
   *
   * NOTE(review): the term is used as the regex pattern verbatim, so regex
   * metacharacters in it are interpreted rather than matched literally.
   */
  private transformTerm(node: TermNode, searchFields?: string[]): any {
    // If specific fields are provided, search across those fields
    if (searchFields && searchFields.length > 0) {
      return {
        $or: searchFields.map((field) => ({
          [field]: { $regex: node.value, $options: 'i' },
        })),
      };
    }

    // Otherwise, use text search (requires a text index on desired fields)
    return { $text: { $search: node.value } };
  }

  /**
   * Transform a phrase to a MongoDB query. With search fields the phrase
   * becomes a regex in which every whitespace run matches `\s+`; without them
   * it becomes a quoted `$text` search.
   */
  private transformPhrase(node: PhraseNode, searchFields?: string[]): any {
    // If specific fields are provided, search phrase across those fields
    if (searchFields && searchFields.length > 0) {
      const pattern = node.value.replace(/\s+/g, '\\s+');
      return {
        $or: searchFields.map((field) => ({
          [field]: { $regex: pattern, $options: 'i' },
        })),
      };
    }

    // Without fields, quote the phrase inside a $text search
    return { $text: { $search: `"${node.value}"` } };
  }

  /**
   * Transform a field query (`field:value`) to a MongoDB query.
   *
   * Groups are unwrapped and boolean expressions are transformed with the
   * field as their only search field, so `title:(foo OR bar)` constrains both
   * terms to `title`.
   */
  private transformField(node: FieldNode): any {
    const { field, value } = node;

    switch (value.type) {
      case 'RANGE':
        // Copy instead of mutating the parsed AST
        return this.transformRange({ ...value, field });

      case 'WILDCARD':
        return {
          [field]: { $regex: this.luceneWildcardToRegex(value.value), $options: 'i' },
        };

      case 'FUZZY':
        return {
          [field]: { $regex: this.createFuzzyRegex(value.value), $options: 'i' },
        };

      case 'TERM':
        return { [field]: { $regex: value.value, $options: 'i' } };

      case 'PHRASE':
        return {
          [field]: { $regex: value.value.replace(/\s+/g, '\\s+'), $options: 'i' },
        };

      case 'GROUP':
        // field:(x) means the same as field:x
        return this.transformField({ type: 'FIELD', field, value: value.value });

      case 'AND':
      case 'OR':
      case 'NOT':
        // Scope every un-fielded leaf of the expression to this field
        return this.transform(value, [field]);

      default:
        // Nested field node: keep the transformed value under this field
        return { [field]: this.transform(value) };
    }
  }

  /**
   * Transform AND operator to MongoDB query (`$and` takes an array).
   */
  private transformAnd(node: BooleanNode, searchFields?: string[]): any {
    return {
      $and: [this.transform(node.left, searchFields), this.transform(node.right, searchFields)],
    };
  }

  /**
   * Transform OR operator to MongoDB query (`$or` takes an array).
   */
  private transformOr(node: BooleanNode, searchFields?: string[]): any {
    return {
      $or: [this.transform(node.left, searchFields), this.transform(node.right, searchFields)],
    };
  }

  /**
   * Transform NOT operator to MongoDB query: documents that match the left
   * side and do not match the right side.
   */
  private transformNot(node: BooleanNode, searchFields?: string[]): any {
    const leftQuery = this.transform(node.left, searchFields);
    const rightQuery = this.transform(node.right, searchFields);

    if (rightQuery.$text) {
      // The excluded side is a text search: negate it as a regex on the
      // fields that the left query constrains.
      const searchTerm = rightQuery.$text.$search.replace(/"/g, '');

      const notConditions = Object.keys(leftQuery)
        .filter((key) => key !== '$or' && key !== '$and')
        .map((key) => ({
          [key]: { $not: { $regex: searchTerm, $options: 'i' } },
        }));

      // If left query has $or, exclude via $nor; otherwise add $not per field
      if (leftQuery.$or) {
        return { $and: [leftQuery, { $nor: [{ $or: notConditions }] }] };
      }
      return { $and: [leftQuery, { $and: notConditions }] };
    }

    // The excluded side is a regular filter
    let notQuery: Record<string, unknown> = {};

    if (rightQuery.$or) {
      notQuery = { $nor: rightQuery.$or };
    } else if (rightQuery.$and) {
      // Convert $and to $nor
      notQuery = { $nor: rightQuery.$and };
    } else {
      // Simple field condition
      for (const field of Object.keys(rightQuery)) {
        notQuery[field] = { $not: rightQuery[field] };
      }
    }

    return { $and: [leftQuery, notQuery] };
  }

  /**
   * Transform range query to MongoDB query. A bound of `*` is open-ended and
   * produces no operator for that side.
   */
  private transformRange(node: RangeNode): any {
    const range: any = {};

    if (node.lower !== '*') {
      range[node.includeLower ? '$gte' : '$gt'] = this.parseValue(node.lower);
    }

    if (node.upper !== '*') {
      range[node.includeUpper ? '$lte' : '$lt'] = this.parseValue(node.upper);
    }

    return { [node.field]: range };
  }

  /**
   * Transform wildcard query to MongoDB query.
   *
   * NOTE(review): without search fields this returns a bare
   * `{ $regex, $options }` object that is not tied to any field.
   */
  private transformWildcard(node: WildcardNode, searchFields?: string[]): any {
    // Convert Lucene wildcards to MongoDB regex
    const regex = this.luceneWildcardToRegex(node.value);

    // If specific fields are provided, search wildcard across those fields
    if (searchFields && searchFields.length > 0) {
      return {
        $or: searchFields.map((field) => ({
          [field]: { $regex: regex, $options: 'i' },
        })),
      };
    }

    return { $regex: regex, $options: 'i' };
  }

  /**
   * Transform fuzzy query to MongoDB query using a simplified regex
   * approximation (see `createFuzzyRegex`); `maxEdits` is not used.
   *
   * NOTE(review): without search fields this returns a bare
   * `{ $regex, $options }` object that is not tied to any field.
   */
  private transformFuzzy(node: FuzzyNode, searchFields?: string[]): any {
    const regex = this.createFuzzyRegex(node.value);

    // If specific fields are provided, search fuzzy term across those fields
    if (searchFields && searchFields.length > 0) {
      return {
        $or: searchFields.map((field) => ({
          [field]: { $regex: regex, $options: 'i' },
        })),
      };
    }

    return { $regex: regex, $options: 'i' };
  }

  /**
   * Convert Lucene wildcards to MongoDB regex patterns:
   * `*` becomes `.*`, `?` becomes `.`, other regex specials are escaped.
   */
  private luceneWildcardToRegex(wildcardPattern: string): string {
    return wildcardPattern
      .replace(/([.+^${}()|\\])/g, '\\$1') // Escape regex special chars
      .replace(/\*/g, '.*')
      .replace(/\?/g, '.');
  }

  /**
   * Create a simplified fuzzy search regex in which every second character
   * (odd index) is optional.
   */
  private createFuzzyRegex(term: string): string {
    let regex = '';
    for (let i = 0; i < term.length; i++) {
      regex += i % 2 === 1 ? term[i] + '?' : term[i];
    }
    return regex;
  }

  /**
   * Parse string values to appropriate types: integer, float, Date (when
   * `new Date(value)` yields a valid date), otherwise the original string.
   */
  private parseValue(value: string): any {
    // Try to parse as number
    if (/^-?\d+$/.test(value)) {
      return parseInt(value, 10);
    }

    if (/^-?\d+\.\d+$/.test(value)) {
      return parseFloat(value);
    }

    // Try to parse as date (simplified)
    const date = new Date(value);
    if (!isNaN(date.getTime())) {
      return date;
    }

    // Default to string
    return value;
  }
}

/**
 * Main adapter class: parses a Lucene query string and transforms it into a
 * MongoDB filter object.
 */
export class SmartdataLuceneAdapter {
  private parser: LuceneParser;
  private transformer: LuceneToMongoTransformer;
  private defaultSearchFields: string[] = [];

  /**
   * @param defaultSearchFields - Optional array of field names to search across when no field is specified
   */
  constructor(defaultSearchFields?: string[]) {
    this.parser = new LuceneParser();
    this.transformer = new LuceneToMongoTransformer();
    if (defaultSearchFields) {
      this.defaultSearchFields = defaultSearchFields;
    }
  }

  /**
   * Convert a Lucene query string to a MongoDB query object
   * @param luceneQuery - The Lucene query string to convert
   * @param searchFields - Optional array of field names to search across (overrides defaultSearchFields)
   * @returns A MongoDB filter object
   * @throws Error wrapping any parse or transform failure (also logged via console.error)
   */
  convert(luceneQuery: string, searchFields?: string[]): any {
    try {
      // Use provided searchFields, fall back to defaultSearchFields
      const fieldsToSearch = searchFields ?? this.defaultSearchFields;

      // Queries without field, boolean, group or inclusive-range syntax are
      // treated as plain text and matched directly against the search fields.
      // NOTE(review): the raw query text is used as the regex pattern here,
      // so regex metacharacters typed by the user are interpreted.
      const complexMarkers = [':', ' AND ', ' OR ', ' NOT ', '(', '['];
      const isSimple = !complexMarkers.some((marker) => luceneQuery.includes(marker));

      if (isSimple && fieldsToSearch && fieldsToSearch.length > 0) {
        return {
          $or: fieldsToSearch.map((field) => ({
            [field]: { $regex: luceneQuery, $options: 'i' },
          })),
        };
      }

      // For more complex queries, parse into an AST and transform it
      const ast = this.parser.parse(luceneQuery);
      return this.transformWithFields(ast, fieldsToSearch);
    } catch (error) {
      console.error(`Failed to convert Lucene query "${luceneQuery}":`, error);
      throw new Error(`Failed to convert Lucene query: ${error}`);
    }
  }

  /**
   * Helper method to transform the AST with field information. The search
   * fields are handed to the transformer for every node type so that the
   * leaves of compound queries are scoped to them as well.
   */
  private transformWithFields(node: AnyQueryNode, searchFields: string[]): any {
    return this.transformer.transform(node, searchFields);
  }
}