643 lines
17 KiB
TypeScript
643 lines
17 KiB
TypeScript
/**
 * Lucene to MongoDB query adapter for SmartData
 */
|
|
import * as plugins from './plugins.js';
|
|
|
|
// Types
|
|
/** Discriminator values stored in the `type` property of every AST node. */
type NodeType =
  | 'TERM'
  | 'PHRASE'
  | 'FIELD'
  | 'AND'
  | 'OR'
  | 'NOT'
  | 'RANGE'
  | 'WILDCARD'
  | 'FUZZY'
  | 'GROUP';

/** Base shape shared by all AST nodes. */
interface QueryNode {
  type: NodeType;
}

/** A single bare word, optionally carrying a boost factor. */
interface TermNode extends QueryNode {
  type: 'TERM';
  value: string;
  boost?: number;
}

/** A double-quoted phrase (stored without its quotes), optionally with a proximity value. */
interface PhraseNode extends QueryNode {
  type: 'PHRASE';
  value: string;
  proximity?: number;
}

/** A `field:value` expression; `value` is the sub-query bound to `field`. */
interface FieldNode extends QueryNode {
  type: 'FIELD';
  field: string;
  value: AnyQueryNode;
}

/** A binary boolean expression over two sub-queries. */
interface BooleanNode extends QueryNode {
  type: 'AND' | 'OR' | 'NOT';
  left: AnyQueryNode;
  right: AnyQueryNode;
}

/** A range expression with per-bound inclusivity flags. */
interface RangeNode extends QueryNode {
  type: 'RANGE';
  field: string;
  lower: string;
  upper: string;
  includeLower: boolean;
  includeUpper: boolean;
}

/** A term containing `*` and/or `?` wildcards. */
interface WildcardNode extends QueryNode {
  type: 'WILDCARD';
  value: string;
}

/** A fuzzy term (`term~`) with its maximum edit distance. */
interface FuzzyNode extends QueryNode {
  type: 'FUZZY';
  value: string;
  maxEdits: number;
}

/** A parenthesised sub-query. */
interface GroupNode extends QueryNode {
  type: 'GROUP';
  value: AnyQueryNode;
}

/** Union of every concrete AST node, discriminated by `type`. */
type AnyQueryNode =
  | TermNode
  | PhraseNode
  | FieldNode
  | BooleanNode
  | RangeNode
  | WildcardNode
  | FuzzyNode
  | GroupNode;
|
|
|
|
/**
 * Lucene query parser
 *
 * A small recursive-descent parser for a subset of the Lucene query syntax:
 * bare terms, quoted phrases (with optional `~N` proximity), `field:value`,
 * parenthesised groups, `AND` / `OR` / `NOT` (or a standalone `-`) between
 * operands, `[a TO b]` / `{a TO b}` ranges (brackets may be mixed),
 * `*` / `?` wildcards and `term~N` fuzzy terms.
 *
 * NOTE: tokens that remain after the first complete expression are not
 * consumed by `parse` (e.g. `foo bar` yields only the node for `foo`).
 */
export class LuceneParser {
  /** Index of the next unread entry in `tokens`. */
  private pos: number = 0;
  /** Trimmed query string of the most recent `parse` call. */
  private input: string = '';
  /** Token stream produced from `input`. */
  private tokens: string[] = [];

  constructor() {}

  /**
   * Parse a Lucene query string into an AST
   *
   * @param query - Raw query string; surrounding whitespace is trimmed.
   * @returns The root node of the parsed expression.
   * @throws Error when the input is empty, a group is not closed, a field has
   *   no value, a phrase proximity operator has no number, or a range is malformed.
   */
  parse(query: string): AnyQueryNode {
    this.input = query.trim();
    this.pos = 0;
    this.tokens = this.tokenize(this.input);

    return this.parseQuery();
  }

  /**
   * Tokenize the input string into tokens
   *
   * Quoted phrases become one token including both quotes. Each of
   * `( ) [ ] { } ~ ^ :` is a token of its own. Everything else is split on
   * whitespace, which also makes `AND`, `OR`, `NOT` and `TO` standalone tokens
   * whenever they are written as separate words.
   */
  private tokenize(input: string): string[] {
    const specialChars = /[()\[\]{}"~^:]/;
    // Any whitespace separates tokens (including `\r`, so CRLF input works).
    const whitespace = /\s/;

    const tokens: string[] = [];
    let current = '';
    let inQuote = false;

    // Emit the token collected so far, if any.
    const flush = (): void => {
      if (current) {
        tokens.push(current);
        current = '';
      }
    };

    for (let i = 0; i < input.length; i++) {
      const char = input[i];

      // Handle quoted strings
      if (char === '"') {
        if (inQuote) {
          tokens.push(current + char);
          current = '';
          inQuote = false;
        } else {
          flush();
          current = char;
          inQuote = true;
        }
        continue;
      }

      // Inside quotes every character, including whitespace, belongs to the phrase.
      if (inQuote) {
        current += char;
        continue;
      }

      if (whitespace.test(char)) {
        flush();
        continue;
      }

      if (specialChars.test(char)) {
        flush();
        tokens.push(char);
        continue;
      }

      current += char;
    }

    // An unterminated quote ends up here as a token that starts with `"`.
    flush();

    return tokens;
  }

  /**
   * Parse the main query expression
   *
   * Reads one operand and, if it is followed by `AND`, `OR`, `NOT` or `-`,
   * recursively parses the remainder as the right-hand side. Operators are
   * therefore right-associative and share a single precedence level.
   */
  private parseQuery(): AnyQueryNode {
    const left = this.parseBooleanOperand();
    // `undefined` once the token stream is exhausted.
    const token = this.tokens[this.pos];

    if (token === 'AND' || token === 'OR') {
      this.pos++;
      const right = this.parseQuery();
      return { type: token, left, right };
    }

    if (token === 'NOT' || token === '-') {
      this.pos++;
      const right = this.parseQuery();
      return { type: 'NOT', left, right };
    }

    return left;
  }

  /**
   * Parse boolean operands (terms, phrases, fields, groups)
   *
   * @throws Error on end of input, an unclosed group, a field without a value,
   *   or a phrase proximity operator without a number.
   */
  private parseBooleanOperand(): AnyQueryNode {
    if (this.pos >= this.tokens.length) {
      throw new Error('Unexpected end of input');
    }

    const token = this.tokens[this.pos];

    // Handle grouping with parentheses
    if (token === '(') {
      this.pos++;
      const inner = this.parseQuery();

      if (this.tokens[this.pos] !== ')') {
        throw new Error('Unclosed group');
      }
      this.pos++;

      const group: GroupNode = { type: 'GROUP', value: inner };
      return group;
    }

    // Handle fields (field:value)
    if (this.tokens[this.pos + 1] === ':') {
      this.pos += 2; // Skip field and colon

      if (this.pos >= this.tokens.length) {
        throw new Error('Expected value after field');
      }

      const fieldNode: FieldNode = {
        type: 'FIELD',
        field: token,
        value: this.parseBooleanOperand(),
      };
      return fieldNode;
    }

    // Handle range queries
    if (token === '[' || token === '{') {
      return this.parseRange();
    }

    // Handle phrases ("term term")
    if (token.startsWith('"') && token.endsWith('"')) {
      const phrase = token.slice(1, -1);
      this.pos++;

      // Check for proximity operator
      let proximity: number | undefined;
      if (this.tokens[this.pos] === '~') {
        this.pos++;
        const distance = this.tokens[this.pos];
        if (distance === undefined || !/^\d+$/.test(distance)) {
          throw new Error('Expected number after proximity operator');
        }
        proximity = parseInt(distance, 10);
        this.pos++;
      }

      const phraseNode: PhraseNode = { type: 'PHRASE', value: phrase, proximity };
      return phraseNode;
    }

    // Handle wildcards
    if (token.includes('*') || token.includes('?')) {
      this.pos++;
      const wildcard: WildcardNode = { type: 'WILDCARD', value: token };
      return wildcard;
    }

    // Handle fuzzy searches
    if (this.tokens[this.pos + 1] === '~') {
      this.pos += 2; // Skip term and tilde

      let maxEdits = 2; // Default
      const edits = this.tokens[this.pos];
      if (edits !== undefined && /^\d+$/.test(edits)) {
        maxEdits = parseInt(edits, 10);
        this.pos++;
      }

      const fuzzy: FuzzyNode = { type: 'FUZZY', value: token, maxEdits };
      return fuzzy;
    }

    // Simple term
    this.pos++;
    const term: TermNode = { type: 'TERM', value: token };
    return term;
  }

  /**
   * Parse range queries
   *
   * Expects exactly five tokens starting at `pos`: an opening `[` or `{`,
   * the lower bound, `TO`, the upper bound and a closing `]` or `}`.
   * `[` / `]` mark an inclusive bound, `{` / `}` an exclusive one; the two
   * brackets are independent of each other.
   *
   * @returns A range node with an empty `field`; the code that handles the
   *   enclosing field node is expected to supply the field name.
   * @throws Error when tokens are missing, `TO` is absent, or the range is
   *   not closed by `]` or `}`.
   */
  private parseRange(): RangeNode {
    // The closing bracket lives at pos + 4, so that index must exist.
    if (this.pos + 4 >= this.tokens.length) {
      throw new Error('Invalid range query syntax');
    }

    const open = this.tokens[this.pos];
    const lower = this.tokens[this.pos + 1];
    const separator = this.tokens[this.pos + 2];
    const upper = this.tokens[this.pos + 3];
    const close = this.tokens[this.pos + 4];

    if (separator !== 'TO') {
      throw new Error('Expected TO in range query');
    }

    if (close !== ']' && close !== '}') {
      throw new Error('Invalid range query closing bracket');
    }

    this.pos += 5; // Consume the whole range

    // For simplicity, assuming the field is handled separately
    return {
      type: 'RANGE',
      field: '', // This will be filled by the field node
      lower,
      upper,
      includeLower: open === '[',
      includeUpper: close === ']',
    };
  }
}
|
|
|
|
/**
 * Transformer for Lucene AST to MongoDB query
 *
 * Converts the AST nodes of this file into plain MongoDB filter objects.
 * Un-fielded terms and phrases become case-insensitive `$regex` matches over
 * the supplied search fields, or a `$text` search when no fields are given.
 */
export class LuceneToMongoTransformer {
  constructor() {}

  /**
   * Transform a Lucene AST node to a MongoDB query
   *
   * @param node - AST node to convert.
   * @param searchFields - Fields that un-fielded terms, phrases, wildcards and
   *   fuzzy terms are matched against. They are passed down through
   *   AND / OR / NOT and groups, but not into explicit `field:value` nodes.
   * @returns A MongoDB filter object.
   * @throws Error if the node carries an unknown `type`.
   */
  transform(node: AnyQueryNode, searchFields?: string[]): any {
    switch (node.type) {
      case 'TERM':
        return this.transformTerm(node, searchFields);
      case 'PHRASE':
        return this.transformPhrase(node, searchFields);
      case 'FIELD':
        return this.transformField(node);
      case 'AND':
        return this.transformAnd(node, searchFields);
      case 'OR':
        return this.transformOr(node, searchFields);
      case 'NOT':
        return this.transformNot(node, searchFields);
      case 'RANGE':
        return this.transformRange(node);
      case 'WILDCARD':
        return this.transformWildcard(node, searchFields);
      case 'FUZZY':
        return this.transformFuzzy(node, searchFields);
      case 'GROUP':
        return this.transform(node.value, searchFields);
      default:
        throw new Error(`Unsupported node type: ${(node as any).type}`);
    }
  }

  /**
   * Transform a term to MongoDB query
   *
   * With search fields: an `$or` of case-insensitive substring matches, one
   * per field. Without: a `$text` search (requires a text index).
   */
  private transformTerm(node: TermNode, searchFields?: string[]): any {
    if (searchFields && searchFields.length > 0) {
      // The term is user text, not a pattern: escape it so characters such as
      // `+` or `.` match literally instead of producing a broken regex.
      const pattern = this.escapeRegex(node.value);
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: pattern, $options: 'i' }
        }))
      };
    }

    // Otherwise, use text search (requires a text index on desired fields)
    return { $text: { $search: node.value } };
  }

  /**
   * Transform a phrase to MongoDB query
   *
   * With search fields: an `$or` of case-insensitive regex matches in which
   * any run of whitespace in the phrase matches one or more whitespace
   * characters. Without: a quoted `$text` phrase search.
   */
  private transformPhrase(node: PhraseNode, searchFields?: string[]): any {
    if (searchFields && searchFields.length > 0) {
      // Escape each word, then join the words with a flexible whitespace matcher.
      const pattern = node.value
        .split(/\s+/)
        .map(word => this.escapeRegex(word))
        .join('\\s+');
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: pattern, $options: 'i' }
        }))
      };
    }

    // Quoting the search string asks $text for a phrase match.
    return { $text: { $search: `"${node.value}"` } };
  }

  /**
   * Transform a field query to MongoDB query
   *
   * - range: comparison operators on the field
   * - wildcard / fuzzy: case-insensitive `$regex` on the field
   * - term: equality with the term string
   * - phrase: case-insensitive whole-value regex match
   * - group: the field is applied to the grouped expression
   * - AND / OR / NOT: the field is applied to both operands
   */
  private transformField(node: FieldNode): any {
    const value = node.value;

    // Handle special case for range queries on fields
    if (value.type === 'RANGE') {
      // Work on a copy so the caller's AST is left untouched.
      return this.transformRange({ ...value, field: node.field });
    }

    // Handle special case for wildcards on fields
    if (value.type === 'WILDCARD') {
      return {
        [node.field]: {
          $regex: this.luceneWildcardToRegex(value.value),
          $options: 'i'
        }
      };
    }

    // Handle special case for fuzzy searches on fields
    if (value.type === 'FUZZY') {
      return {
        [node.field]: {
          $regex: this.createFuzzyRegex(value.value),
          $options: 'i'
        }
      };
    }

    // Special case for exact term matches on fields
    if (value.type === 'TERM') {
      return { [node.field]: value.value };
    }

    // Special case for phrase matches on fields
    if (value.type === 'PHRASE') {
      return {
        [node.field]: {
          $regex: `^${this.escapeRegex(value.value)}$`,
          $options: 'i'
        }
      };
    }

    // field:(expr) - the field applies to whatever the group contains.
    if (value.type === 'GROUP') {
      return this.transformField({ type: 'FIELD', field: node.field, value: value.value });
    }

    // Boolean operators cannot be nested under a field name in a MongoDB
    // filter, so push the field down onto both operands instead.
    if (value.type === 'AND' || value.type === 'OR' || value.type === 'NOT') {
      const scoped: BooleanNode = {
        type: value.type,
        left: { type: 'FIELD', field: node.field, value: value.left },
        right: { type: 'FIELD', field: node.field, value: value.right }
      };
      return this.transform(scoped);
    }

    // For other cases, we'll transform the value and apply it to the field
    const transformedValue = this.transform(value);

    // If the transformed value uses $text, we need to adapt it for the field
    if (transformedValue.$text) {
      return { [node.field]: transformedValue.$text.$search };
    }

    return { [node.field]: transformedValue };
  }

  /**
   * Transform AND operator to MongoDB query
   */
  private transformAnd(node: BooleanNode, searchFields?: string[]): any {
    return {
      $and: [
        this.transform(node.left, searchFields),
        this.transform(node.right, searchFields)
      ]
    };
  }

  /**
   * Transform OR operator to MongoDB query
   */
  private transformOr(node: BooleanNode, searchFields?: string[]): any {
    return {
      $or: [
        this.transform(node.left, searchFields),
        this.transform(node.right, searchFields)
      ]
    };
  }

  /**
   * Transform NOT operator to MongoDB query
   *
   * Produces "left AND NOT right". `$not` only wraps an operator expression
   * on a single field, so the exclusion of a whole sub-query is expressed
   * with `$nor`. When both sides are `$text` searches and the right side is
   * a single term or phrase, the exclusion is folded into one `$text` search
   * using its `-term` negation syntax.
   */
  private transformNot(node: BooleanNode, searchFields?: string[]): any {
    const leftQuery = this.transform(node.left, searchFields);
    const rightQuery = this.transform(node.right, searchFields);

    // Only a single term/phrase can be negated by prefixing "-"; a compound
    // right-hand side would change meaning if it were prefixed as a whole.
    const rightIsSingleOperand = node.right.type === 'TERM' || node.right.type === 'PHRASE';
    if (rightIsSingleOperand && this.isTextQuery(leftQuery) && this.isTextQuery(rightQuery)) {
      return {
        $text: { $search: `${leftQuery.$text.$search} -${rightQuery.$text.$search}` }
      };
    }

    // NOTE(review): MongoDB documents that $text may not appear inside $nor,
    // so a $text right-hand side that could not be merged above still needs
    // search fields (or an explicit field) to be executable - confirm against
    // the targeted server version.
    return { $and: [leftQuery, { $nor: [rightQuery] }] };
  }

  /**
   * Transform range query to MongoDB query
   *
   * A bound of `*` is treated as open and omitted from the result.
   */
  private transformRange(node: RangeNode): any {
    const range: any = {};

    if (node.lower !== '*') {
      range[node.includeLower ? '$gte' : '$gt'] = this.parseValue(node.lower);
    }

    if (node.upper !== '*') {
      range[node.includeUpper ? '$lte' : '$lt'] = this.parseValue(node.upper);
    }

    return { [node.field]: range };
  }

  /**
   * Transform wildcard query to MongoDB query
   *
   * Without search fields the bare `{ $regex, $options }` operator object is
   * returned; it only becomes a valid filter once placed under a field name.
   */
  private transformWildcard(node: WildcardNode, searchFields?: string[]): any {
    // Convert Lucene wildcards to MongoDB regex
    const regex = this.luceneWildcardToRegex(node.value);

    if (searchFields && searchFields.length > 0) {
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: regex, $options: 'i' }
        }))
      };
    }

    return { $regex: regex, $options: 'i' };
  }

  /**
   * Transform fuzzy query to MongoDB query
   *
   * This is a very basic regex approximation; `maxEdits` is not used.
   * Without search fields the bare `{ $regex, $options }` operator object is
   * returned.
   */
  private transformFuzzy(node: FuzzyNode, searchFields?: string[]): any {
    const regex = this.createFuzzyRegex(node.value);

    if (searchFields && searchFields.length > 0) {
      return {
        $or: searchFields.map(field => ({
          [field]: { $regex: regex, $options: 'i' }
        }))
      };
    }

    return { $regex: regex, $options: 'i' };
  }

  /**
   * Convert Lucene wildcards to MongoDB regex patterns
   *
   * `*` becomes `.*`, `?` becomes `.`, and every other regex metacharacter
   * is escaped so it matches literally.
   */
  private luceneWildcardToRegex(wildcardPattern: string): string {
    return wildcardPattern
      .replace(/[.+^${}()|[\]\\]/g, '\\$&') // Escape regex special chars (not * and ?)
      .replace(/\*/g, '.*')
      .replace(/\?/g, '.');
  }

  /**
   * Create a simplified fuzzy search regex
   *
   * Every second character (index 1, 3, ...) is made optional; all
   * characters are escaped so they match literally.
   */
  private createFuzzyRegex(term: string): string {
    return Array.from(term)
      .map((char, index) => {
        const literal = this.escapeRegex(char);
        return index % 2 === 1 ? `${literal}?` : literal;
      })
      .join('');
  }

  /**
   * Parse string values to appropriate types (numbers, dates, etc.)
   *
   * Integers and simple decimals become numbers, anything `Date` can parse
   * becomes a `Date`, everything else stays a string.
   */
  private parseValue(value: string): any {
    // Try to parse as number
    if (/^-?\d+$/.test(value)) {
      return parseInt(value, 10);
    }

    if (/^-?\d+\.\d+$/.test(value)) {
      return parseFloat(value);
    }

    // Try to parse as date (simplified)
    const date = new Date(value);
    if (!isNaN(date.getTime())) {
      return date;
    }

    // Default to string
    return value;
  }

  /** Escape every regex metacharacter in `text` so it matches literally. */
  private escapeRegex(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /** True when `query` is exactly `{ $text: { $search: <string> } }`. */
  private isTextQuery(query: any): boolean {
    return (
      typeof query === 'object' &&
      query !== null &&
      Object.keys(query).length === 1 &&
      typeof query.$text === 'object' &&
      query.$text !== null &&
      typeof query.$text.$search === 'string'
    );
  }
}
|
|
|
|
/**
 * Main adapter class
 *
 * Ties the parser and the transformer together: a Lucene query string goes
 * in, a MongoDB filter object comes out.
 */
export class SmartdataLuceneAdapter {
  private parser: LuceneParser;
  private transformer: LuceneToMongoTransformer;
  private defaultSearchFields: string[] = [];

  /**
   * @param defaultSearchFields - Optional array of field names to search across when no field is specified
   */
  constructor(defaultSearchFields?: string[]) {
    this.parser = new LuceneParser();
    this.transformer = new LuceneToMongoTransformer();
    if (defaultSearchFields) {
      this.defaultSearchFields = defaultSearchFields;
    }
  }

  /**
   * Convert a Lucene query string to a MongoDB query object
   *
   * @param luceneQuery - The Lucene query string to convert
   * @param searchFields - Optional array of field names to search across (overrides defaultSearchFields)
   * @returns The MongoDB filter produced by the transformer.
   * @throws Error prefixed with "Failed to convert Lucene query:" when parsing
   *   or transforming fails.
   */
  convert(luceneQuery: string, searchFields?: string[]): any {
    try {
      // Parse the Lucene query into an AST
      const ast = this.parser.parse(luceneQuery);

      // Use provided searchFields, fall back to defaultSearchFields
      const fieldsToSearch = searchFields || this.defaultSearchFields;

      // Transform the AST to a MongoDB query
      return this.transformWithFields(ast, fieldsToSearch);
    } catch (error) {
      throw new Error(`Failed to convert Lucene query: ${error}`);
    }
  }

  /**
   * Helper method to transform the AST with field information
   *
   * The search fields are handed to the transformer for every root node
   * type, not only bare terms, so phrases, wildcards, fuzzy terms and groups
   * without an explicit field are matched against them as well. An empty
   * array leaves the transformer on its no-fields path.
   */
  private transformWithFields(node: AnyQueryNode, searchFields: string[]): any {
    return this.transformer.transform(node, searchFields);
  }
}