// smartdata/ts/classes.lucene.adapter.ts — 643 lines, 17 KiB, TypeScript (source-listing header)

/**
* Lucene to MongoDB query adapter for SmartData
*/
import * as plugins from './plugins.js';
// Types: AST node shapes produced by LuceneParser and consumed by LuceneToMongoTransformer.
/** Discriminator values for every kind of AST node. */
type NodeType = 'TERM' | 'PHRASE' | 'FIELD' | 'AND' | 'OR' | 'NOT' | 'RANGE' | 'WILDCARD' | 'FUZZY' | 'GROUP';
/** Base shape shared by all AST nodes; `type` is the discriminant. */
interface QueryNode {
type: NodeType;
}
/** A single bare word. */
interface TermNode extends QueryNode {
type: 'TERM';
// The term text as it appeared in the query.
value: string;
// Optional numeric boost attached to the term.
boost?: number;
}
/** A double-quoted phrase. */
interface PhraseNode extends QueryNode {
type: 'PHRASE';
// Phrase text with the surrounding quotes removed.
value: string;
// Number given after a trailing `~` (e.g. `"a b"~3`); undefined when absent.
proximity?: number;
}
/** A `field:value` clause; `value` is whatever operand followed the colon. */
interface FieldNode extends QueryNode {
type: 'FIELD';
field: string;
value: AnyQueryNode;
}
/** A binary boolean operation. NOT is binary too: `left NOT right`. */
interface BooleanNode extends QueryNode {
type: 'AND' | 'OR' | 'NOT';
left: AnyQueryNode;
right: AnyQueryNode;
}
/** A `lower TO upper` range enclosed in brackets or braces. */
interface RangeNode extends QueryNode {
type: 'RANGE';
// Emitted as '' by the parser; the transformer fills it from the enclosing FIELD node.
field: string;
// Raw bound tokens; the transformer treats '*' as "unbounded".
lower: string;
upper: string;
// Whether each bound is inclusive; the transformer maps these to $gte/$lte vs $gt/$lt.
includeLower: boolean;
includeUpper: boolean;
}
/** A term containing `*` and/or `?` wildcards. */
interface WildcardNode extends QueryNode {
type: 'WILDCARD';
// The raw token, wildcards included.
value: string;
}
/** A fuzzy term written as `term~` or `term~N`. */
interface FuzzyNode extends QueryNode {
type: 'FUZZY';
value: string;
// The N from `term~N`; the parser defaults it to 2 when no number follows the tilde.
maxEdits: number;
}
/** A parenthesised sub-expression. */
interface GroupNode extends QueryNode {
type: 'GROUP';
value: AnyQueryNode;
}
/** Union of every concrete node type; switch on `type` to narrow. */
type AnyQueryNode = TermNode | PhraseNode | FieldNode | BooleanNode | RangeNode | WildcardNode | FuzzyNode | GroupNode;
/**
 * Lucene query parser.
 *
 * Turns a Lucene-style query string into an AST of query nodes. Supported
 * syntax: bare terms, "quoted phrases" (with optional `~N` proximity),
 * `field:value`, parenthesised groups, `[a TO b]` / `{a TO b}` ranges
 * (brackets may be mixed), `*` / `?` wildcards, `term~N` fuzzy terms,
 * `^N` boosts, and the binary operators AND, OR and NOT (or `-`).
 *
 * All binary operators share a single precedence level and associate to the
 * right: `a AND b OR c` parses as `a AND (b OR c)`.
 */
export class LuceneParser {
  /** Index of the next unread entry in `tokens`. */
  private pos: number = 0;
  /** Trimmed query string of the most recent `parse` call. */
  private input: string = '';
  /** Token stream for the query currently being parsed. */
  private tokens: string[] = [];

  constructor() {}

  /**
   * Parse a Lucene query string into an AST.
   *
   * NOTE(review): tokens that remain after the first complete expression
   * (for example the second word of `foo bar`, or a stray `)`) are not
   * consumed and are silently ignored. Changing that requires choosing a
   * default operator, so it is left as-is here.
   *
   * @param query - The raw query string.
   * @returns The root node of the parsed expression.
   * @throws Error if the query is empty, a quote or group is left open, or a
   *   range / proximity clause is malformed.
   */
  parse(query: string): AnyQueryNode {
    this.input = query.trim();
    this.pos = 0;
    this.tokens = this.tokenize(this.input);
    return this.parseQuery();
  }

  /**
   * Split the input into tokens.
   *
   * Whitespace separates tokens, each of `( ) [ ] { } ~ ^ :` is a token of
   * its own, and a double-quoted run becomes a single token that keeps its
   * surrounding quotes. Operators (AND, OR, NOT, TO) need no special casing:
   * they are ordinary whitespace-delimited words.
   *
   * @throws Error if a double quote is opened but never closed.
   */
  private tokenize(input: string): string[] {
    const specialChars = /[()\[\]{}"~^:]/;
    const whitespace = /\s/;
    const tokens: string[] = [];
    let current = '';
    let inQuote = false;

    // Emit the word accumulated so far, if any.
    const flush = (): void => {
      if (current) {
        tokens.push(current);
        current = '';
      }
    };

    for (const char of input) {
      if (char === '"') {
        if (inQuote) {
          // Closing quote: the phrase token keeps both quotes.
          tokens.push(current + char);
          current = '';
        } else {
          flush();
          current = char;
        }
        inQuote = !inQuote;
        continue;
      }
      if (inQuote) {
        current += char;
        continue;
      }
      if (whitespace.test(char)) {
        flush();
        continue;
      }
      if (specialChars.test(char)) {
        flush();
        tokens.push(char);
        continue;
      }
      current += char;
    }

    if (inQuote) {
      throw new Error('Unterminated quoted phrase');
    }
    flush();
    return tokens;
  }

  /**
   * Parse `operand [ (AND | OR | NOT | -) query ]`.
   *
   * The right-hand side recurses into `parseQuery`, which is what makes the
   * operators right-associative.
   */
  private parseQuery(): AnyQueryNode {
    const left = this.parseBooleanOperand();
    if (this.pos < this.tokens.length) {
      const token = this.tokens[this.pos];
      if (token === 'AND' || token === 'OR') {
        this.pos++;
        const right = this.parseQuery();
        return { type: token, left, right };
      }
      if (token === 'NOT' || token === '-') {
        this.pos++;
        const right = this.parseQuery();
        return { type: 'NOT', left, right };
      }
    }
    return left;
  }

  /**
   * Parse a single operand: group, field clause, range, phrase, wildcard,
   * fuzzy term or plain term. A trailing `^N` boost is consumed after any
   * operand so that it does not cut the rest of the query short; only TERM
   * nodes record the boost value.
   *
   * @throws Error on end of input, an unclosed group, a field without a
   *   value, or a proximity operator without a number.
   */
  private parseBooleanOperand(): AnyQueryNode {
    if (this.pos >= this.tokens.length) {
      throw new Error('Unexpected end of input');
    }
    const token = this.tokens[this.pos];

    // Grouping with parentheses.
    if (token === '(') {
      this.pos++;
      const group = this.parseQuery();
      if (this.pos < this.tokens.length && this.tokens[this.pos] === ')') {
        this.pos++;
        this.parseBoost();
        return { type: 'GROUP', value: group } as GroupNode;
      }
      throw new Error('Unclosed group');
    }

    // field:value
    if (this.pos + 1 < this.tokens.length && this.tokens[this.pos + 1] === ':') {
      const field = token;
      this.pos += 2; // Skip field and colon
      if (this.pos < this.tokens.length) {
        const value = this.parseBooleanOperand();
        return { type: 'FIELD', field, value } as FieldNode;
      }
      throw new Error('Expected value after field');
    }

    // Range queries.
    if (token === '[' || token === '{') {
      const range = this.parseRange();
      this.parseBoost();
      return range;
    }

    // Phrases ("term term"), optionally followed by ~N.
    if (token.startsWith('"') && token.endsWith('"')) {
      const phrase = token.slice(1, -1);
      this.pos++;
      let proximity: number | undefined;
      if (this.pos < this.tokens.length && this.tokens[this.pos] === '~') {
        this.pos++;
        if (this.pos < this.tokens.length && /^\d+$/.test(this.tokens[this.pos])) {
          proximity = parseInt(this.tokens[this.pos], 10);
          this.pos++;
        } else {
          throw new Error('Expected number after proximity operator');
        }
      }
      this.parseBoost();
      return { type: 'PHRASE', value: phrase, proximity } as PhraseNode;
    }

    // Wildcards.
    if (token.includes('*') || token.includes('?')) {
      this.pos++;
      this.parseBoost();
      return { type: 'WILDCARD', value: token } as WildcardNode;
    }

    // Fuzzy searches: term~ or term~N.
    if (this.pos + 1 < this.tokens.length && this.tokens[this.pos + 1] === '~') {
      this.pos += 2; // Skip term and tilde
      let maxEdits = 2; // Default
      if (this.pos < this.tokens.length && /^\d+$/.test(this.tokens[this.pos])) {
        maxEdits = parseInt(this.tokens[this.pos], 10);
        this.pos++;
      }
      this.parseBoost();
      return { type: 'FUZZY', value: token, maxEdits } as FuzzyNode;
    }

    // Simple term.
    this.pos++;
    const termNode: TermNode = { type: 'TERM', value: token };
    const boost = this.parseBoost();
    if (boost !== undefined) {
      termNode.boost = boost;
    }
    return termNode;
  }

  /**
   * Consume an optional `^N` boost at the current position.
   *
   * A `^` that is not followed by a number is left in place, so malformed
   * input behaves as it did before boosts were recognised.
   *
   * @returns The boost value, or undefined when no boost is present.
   */
  private parseBoost(): number | undefined {
    if (this.tokens[this.pos] !== '^') {
      return undefined;
    }
    const raw = this.tokens[this.pos + 1];
    if (raw === undefined || !/^\d+(\.\d+)?$/.test(raw)) {
      return undefined;
    }
    this.pos += 2;
    return parseFloat(raw);
  }

  /**
   * Parse a range clause: `[lower TO upper]`, `{lower TO upper}` or a mix of
   * both bracket styles. `[` / `]` make a bound inclusive, `{` / `}` exclusive.
   *
   * The returned node has an empty `field`; a range is expected to sit inside
   * a FIELD node, which supplies the field name at transform time.
   *
   * @throws Error if fewer than five tokens remain, `TO` is missing, or the
   *   closing token is neither `]` nor `}`.
   */
  private parseRange(): RangeNode {
    // A range is exactly five tokens: open, lower, TO, upper, close.
    if (this.pos + 4 >= this.tokens.length) {
      throw new Error('Invalid range query syntax');
    }
    const [open, lower, separator, upper, close] = this.tokens.slice(this.pos, this.pos + 5);
    if (separator !== 'TO') {
      throw new Error('Expected TO in range query');
    }
    if (close !== ']' && close !== '}') {
      throw new Error('Invalid range query closing bracket');
    }
    this.pos += 5;
    return {
      type: 'RANGE',
      field: '', // Filled in from the enclosing field node
      lower,
      upper,
      includeLower: open === '[',
      includeUpper: close === ']'
    };
  }
}
/**
 * Transformer for Lucene AST to MongoDB query.
 *
 * Unfielded TERM / PHRASE / WILDCARD / FUZZY nodes are matched against
 * `searchFields` with case-insensitive regexes when fields are supplied;
 * without fields, TERM and PHRASE fall back to `$text` search.
 */
export class LuceneToMongoTransformer {
  constructor() {}

  /**
   * Transform a Lucene AST node to a MongoDB query.
   *
   * @param node - Root of the (sub)tree to transform.
   * @param searchFields - Fields to match unfielded clauses against. They are
   *   passed down through GROUP, AND, OR and NOT nodes.
   * @returns A MongoDB filter document.
   * @throws Error if the node has an unknown `type`.
   */
  transform(node: AnyQueryNode, searchFields?: string[]): any {
    switch (node.type) {
      case 'TERM':
        return this.transformTerm(node, searchFields);
      case 'PHRASE':
        return this.transformPhrase(node, searchFields);
      case 'FIELD':
        return this.transformField(node);
      case 'AND':
        return this.transformAnd(node, searchFields);
      case 'OR':
        return this.transformOr(node, searchFields);
      case 'NOT':
        return this.transformNot(node, searchFields);
      case 'RANGE':
        return this.transformRange(node);
      case 'WILDCARD':
        return this.transformWildcard(node, searchFields);
      case 'FUZZY':
        return this.transformFuzzy(node, searchFields);
      case 'GROUP':
        return this.transform(node.value, searchFields);
      default:
        throw new Error(`Unsupported node type: ${(node as any).type}`);
    }
  }

  /** True when at least one search field was supplied. */
  private hasFields(searchFields?: string[]): searchFields is string[] {
    return searchFields !== undefined && searchFields !== null && searchFields.length > 0;
  }

  /** Build `{ $or: [...] }` applying a case-insensitive regex to every field. */
  private regexAcrossFields(searchFields: string[], pattern: string): any {
    return {
      $or: searchFields.map(field => ({
        [field]: { $regex: pattern, $options: 'i' }
      }))
    };
  }

  /** Escape every regex metacharacter so the text matches literally. */
  private escapeRegex(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Transform a term: a literal, case-insensitive substring match across
   * `searchFields`, or a `$text` search (which needs a text index) when no
   * fields are given.
   */
  private transformTerm(node: TermNode, searchFields?: string[]): any {
    if (this.hasFields(searchFields)) {
      return this.regexAcrossFields(searchFields, this.escapeRegex(node.value));
    }
    return { $text: { $search: node.value } };
  }

  /**
   * Transform a phrase: the words must appear in order, separated by any
   * amount of whitespace, in one of `searchFields`; without fields a quoted
   * `$text` search is used. `proximity` is not used.
   */
  private transformPhrase(node: PhraseNode, searchFields?: string[]): any {
    if (this.hasFields(searchFields)) {
      // Escape each word on its own so the \s+ joiner stays a real regex token.
      const pattern = node.value
        .split(/\s+/)
        .map(word => this.escapeRegex(word))
        .join('\\s+');
      return this.regexAcrossFields(searchFields, pattern);
    }
    return { $text: { $search: `"${node.value}"` } };
  }

  /**
   * Transform a `field:value` clause.
   *
   * - RANGE: comparison operators on the field (the input node is not mutated).
   * - WILDCARD / FUZZY: case-insensitive regex on the field.
   * - TERM: exact equality with the term string.
   * - PHRASE: case-insensitive whole-value match.
   * - GROUP / AND / OR / NOT: the field is distributed over the operands, so
   *   `f:(a OR b)` becomes `{ $or: [{ f: 'a' }, { f: 'b' }] }`.
   * - Anything else (a nested FIELD): the transformed value is nested under
   *   the field.
   */
  private transformField(node: FieldNode): any {
    const { field, value } = node;
    switch (value.type) {
      case 'RANGE':
        return this.transformRange({ ...value, field });
      case 'WILDCARD':
        return {
          [field]: { $regex: this.luceneWildcardToRegex(value.value), $options: 'i' }
        };
      case 'FUZZY':
        return {
          [field]: { $regex: this.createFuzzyRegex(value.value), $options: 'i' }
        };
      case 'TERM':
        return { [field]: value.value };
      case 'PHRASE':
        return {
          [field]: { $regex: `^${this.escapeRegex(value.value)}$`, $options: 'i' }
        };
      case 'GROUP':
        return this.transformField({ type: 'FIELD', field, value: value.value });
      case 'AND':
      case 'OR':
      case 'NOT':
        // Push the field down to both operands, then combine as usual.
        return this.transform({
          type: value.type,
          left: { type: 'FIELD', field, value: value.left },
          right: { type: 'FIELD', field, value: value.right }
        });
      default:
        return { [field]: this.transform(value) };
    }
  }

  /** Transform AND: both operands must match. */
  private transformAnd(node: BooleanNode, searchFields?: string[]): any {
    return {
      $and: [this.transform(node.left, searchFields), this.transform(node.right, searchFields)]
    };
  }

  /** Transform OR: either operand may match. */
  private transformOr(node: BooleanNode, searchFields?: string[]): any {
    return {
      $or: [this.transform(node.left, searchFields), this.transform(node.right, searchFields)]
    };
  }

  /**
   * Transform `left NOT right`: match `left` and exclude `right`.
   *
   * `$not` only works inside a field expression, so the exclusion is
   * expressed with the top-level `$nor` operator.
   *
   * NOTE(review): MongoDB documents that `$text` cannot appear inside `$nor`,
   * so negating an unfielded term still needs `searchFields`.
   */
  private transformNot(node: BooleanNode, searchFields?: string[]): any {
    const leftQuery = this.transform(node.left, searchFields);
    const rightQuery = this.transform(node.right, searchFields);
    return { $and: [leftQuery, { $nor: [rightQuery] }] };
  }

  /**
   * Transform a range into `$gt(e)` / `$lt(e)` on `node.field`.
   * A `*` bound is unbounded; when both bounds are `*` the range becomes an
   * existence check rather than an equality match against `{}`.
   */
  private transformRange(node: RangeNode): any {
    const range: any = {};
    if (node.lower !== '*') {
      range[node.includeLower ? '$gte' : '$gt'] = this.parseValue(node.lower);
    }
    if (node.upper !== '*') {
      range[node.includeUpper ? '$lte' : '$lt'] = this.parseValue(node.upper);
    }
    if (Object.keys(range).length === 0) {
      return { [node.field]: { $exists: true } };
    }
    return { [node.field]: range };
  }

  /**
   * Transform a wildcard term. With `searchFields` the regex is applied to
   * each field; without them the bare `{ $regex, $options }` operator
   * document is returned, which is only valid once nested under a field.
   */
  private transformWildcard(node: WildcardNode, searchFields?: string[]): any {
    const regex = this.luceneWildcardToRegex(node.value);
    if (this.hasFields(searchFields)) {
      return this.regexAcrossFields(searchFields, regex);
    }
    return { $regex: regex, $options: 'i' };
  }

  /**
   * Transform a fuzzy term using the approximation from `createFuzzyRegex`.
   * Returns the bare operator document when no `searchFields` are given,
   * like `transformWildcard`.
   */
  private transformFuzzy(node: FuzzyNode, searchFields?: string[]): any {
    const regex = this.createFuzzyRegex(node.value);
    if (this.hasFields(searchFields)) {
      return this.regexAcrossFields(searchFields, regex);
    }
    return { $regex: regex, $options: 'i' };
  }

  /**
   * Convert Lucene wildcards to a regex pattern: `*` becomes `.*`, `?`
   * becomes `.`, and every other regex metacharacter is escaped. The result
   * is not anchored.
   */
  private luceneWildcardToRegex(wildcardPattern: string): string {
    return wildcardPattern
      .replace(/[.+^${}()|[\]\\]/g, '\\$&') // Escape everything except * and ?
      .replace(/\*/g, '.*')
      .replace(/\?/g, '.');
  }

  /**
   * Create a simplified fuzzy regex: every second character is optional and
   * all characters match literally. This is an approximation only;
   * `maxEdits` is not taken into account.
   */
  private createFuzzyRegex(term: string): string {
    return Array.from(term)
      .map((char, index) => {
        const literal = this.escapeRegex(char);
        return index % 2 === 1 ? `${literal}?` : literal;
      })
      .join('');
  }

  /**
   * Convert a range bound to a typed value: integer, then decimal, then
   * anything `Date` can parse, otherwise the original string.
   */
  private parseValue(value: string): any {
    if (/^-?\d+$/.test(value)) {
      return parseInt(value, 10);
    }
    if (/^-?\d+\.\d+$/.test(value)) {
      return parseFloat(value);
    }
    const date = new Date(value);
    if (!isNaN(date.getTime())) {
      return date;
    }
    return value;
  }
}
/**
 * Main adapter class.
 *
 * Combines the parser and the transformer: `convert` takes a Lucene query
 * string and returns the corresponding MongoDB filter document.
 */
export class SmartdataLuceneAdapter {
  private parser: LuceneParser;
  private transformer: LuceneToMongoTransformer;
  /** Fields searched by unfielded clauses when `convert` is given none. */
  private defaultSearchFields: string[] = [];

  /**
   * @param defaultSearchFields - Optional array of field names to search across when no field is specified
   */
  constructor(defaultSearchFields?: string[]) {
    this.parser = new LuceneParser();
    this.transformer = new LuceneToMongoTransformer();
    if (defaultSearchFields) {
      this.defaultSearchFields = defaultSearchFields;
    }
  }

  /**
   * Convert a Lucene query string to a MongoDB query object.
   *
   * @param luceneQuery - The Lucene query string to convert
   * @param searchFields - Optional array of field names to search across (overrides defaultSearchFields)
   * @returns The MongoDB filter produced by the transformer.
   * @throws Error prefixed with "Failed to convert Lucene query:" when
   *   parsing or transforming fails.
   */
  convert(luceneQuery: string, searchFields?: string[]): any {
    try {
      const ast = this.parser.parse(luceneQuery);
      // Explicit fields win; otherwise fall back to the constructor defaults.
      const fieldsToSearch = searchFields || this.defaultSearchFields;
      return this.transformWithFields(ast, fieldsToSearch);
    } catch (error) {
      throw new Error(`Failed to convert Lucene query: ${error}`);
    }
  }

  /**
   * Transform the AST, always handing the search fields to the transformer.
   *
   * The fields are passed for every root node type, not only TERM, so that
   * unfielded phrases, wildcards, fuzzy terms and groups are matched against
   * them as well.
   */
  private transformWithFields(node: AnyQueryNode, searchFields: string[]): any {
    return this.transformer.transform(node, searchFields);
  }
}