"""
QryParser is an embarrassingly simplistic query parser.
"""

# Copyright (c) 2025, Carnegie Mellon University.  All Rights Reserved.

import sys
import PyLu

from Idx import Idx
from QryIopSyn import QryIopSyn
from QryIopTerm import QryIopTerm
from QrySopOr import QrySopOr
from QrySopScore import QrySopScore
from QrySopSum import QrySopSum
from QrySopWsum import QrySopWsum


class QryParser:
    """
    QryParser is an embarrassingly simplistic query parser.  It has
    two primary methods:  getQuery and tokenizeString.  getQuery
    converts a query string into an optimized Qry tree.  tokenizeString
    converts a flat (unstructured) query string into a string array; it
    is used for creating learning-to-rank feature vectors.

    Add new operators to the query parser by modifying the following
    methods:

        createOperator: Use a string (e.g., #and) to create a node
        (e.g., QrySopAnd).

        parseString:  If the operator supports term weights
        (e.g., #wsum (0.5 apple 1 pie)), you must modify this method.
        For these operators, two substrings (weight and term) are
        popped from the query string at each step, instead of one.

    Add new document fields to the parser by modifying createTerms.
    """

    # -------------- Constants and variables --------------- #

    __ANALYZER = PyLu.LEnglishAnalyzerConfigurable()
    __initialized = False

    # -------------- Methods (alphabetical) ---------------- #

    @staticmethod
    def __init():
        """Initialize the Lucene analyzer."""
        if not QryParser.__initialized:
            QryParser.__ANALYZER.setLowercase(PyLu.JTrue)
            QryParser.__ANALYZER.setStopwordRemoval(PyLu.JTrue) 
            QryParser.__ANALYZER.setStemmer(PyLu.LEnglishAnalyzerConfigurableStemmerType.KSTEM)
            QryParser.__initialized = True


    @staticmethod
    def __createOperator(operatorName):
        """Create and return the specified query operator."""

        operator = None
        operatorNameLowerCase = operatorName.lower()

        # STUDENTS:
        # Add new query operators below.

        # Create the query operator.
        if operatorNameLowerCase == '#or':
            operator = QrySopOr()
        elif operatorNameLowerCase == '#syn':
            operator = QryIopSyn()
        else:
            QryParser.__syntaxError('Unknown query operator ' + operatorName)

        operator.setDisplayName(operatorName.upper())

        return operator

  
    @staticmethod
    def __createTerms(token):
        """
        Create one or more terms from a token.  The token may contain
        dashes or other punctuation b(e.g., near-death) and/or a field
        name (e.g., apple.title).

        token: The token consumed from the query string.

        Returns a list of one or more tokens.

        Throws IOException: Error accessing the Lucene index.
        """

        # Split the token into a term and a field.
        if '.' in token:
            term, field = token.split('.', 1)
            field = field.lower()
        else:
            term = token
            field = 'body'

        # Confirm that the field is a known field.
        if not (field == 'body' or
                field == 'title' or
                field == 'url' or 
                field == 'keywords' or
                field == 'inlink'):
            QryParser.__syntaxError('Unknown field ' + token)

        # Lexical processing, stopwords, stemming.  A loop is used
        # just in case a term (e.g., "near-death") gets tokenized into
        # multiple terms (e.g., "near" and "death").

        t = QryParser.tokenizeString(term)
        terms = [ None ] * len(t)
    
        for j in range(0, len(t)):
            terms[j] = QryIopTerm(t[j], field)
    
        return(terms)


    @staticmethod
    def _duplicateQry(q1, q2):
        """
        Indicate whether two query trees are duplicates. This simple
        function requires query arguments to be in the same order.
        """
        if type(q1) != type(q2):
            return(False)

        if type(q1) is QryIopTerm:
            return(q1._term == q2._term and q1._field == q2._field)

        if len(q1._args) != len(q2._args):
            return(False)

        for i in range(len(q1._args)):
            if not QryParser._duplicateQry(q1._args[i], q2._args[i]):
                return(False)

        return(True)

        
    @staticmethod
    def getQuery(queryString):
        """
        Parse a query string into a query tree.

        queryString: The query string, in an Indri-style query language.

        Returns:  The query tree for the parsed query.

        throws IOException: Error accessing the Lucene index.
        throws IllegalArgumentException: Query syntax error.
        """

        QryParser.__init()
        q = QryParser.parseString(queryString)	# An exact parse
        q = QryParser.optimizeQuery(q)		# An optimized parse
        return(q)


    @staticmethod
    def __indexOfBalancingParen(s):
        """
        Get the index of the right parenenthesis that balances the
        left-most parenthesis.  Return -1 if it doesn't exist.

        s: A string containing a query.
        """

        depth = 0

        for i in range(0, len(s)):
            if s[ i ] == '(':
                depth += 1
            elif s[ i ] == ')':
                if depth == 0:
                    QryParser.__syntaxError(
                        'Unbalanced or missing parentheses')
                depth -= 1
                if depth == 0:
                    return i

        return(-1)


    @staticmethod
    def optimizeQuery(q):
        """
        Optimize the query by removing degenerate nodes produced during
        query parsing, for example '#NEAR/1 (of the)' which turns into
        '#NEAR/1 ()' after stopwords are removed; and unnecessary nodes
        or subtrees, such as #AND (#AND (a)), which can be replaced by
        'a'.
        """

        # Term operators don't benefit from optimization.
        if isinstance(q, QryIopTerm):
            return q

        # Optimization is a depth-first task, so recurse on query
        # arguments.  This is done in reverse to simplify deleting
        # query arguments that become None.
        for i in range(len(q._args) - 1, -1, -1):
            q_i_before = q._args[ i ]
            q_i_after = QryParser.optimizeQuery(q_i_before)

            if q_i_after == None:
                q.delArg(i)			# optimization deleted the arg
            else:
                if q_i_before != q_i_after:
                    q._args[ i ] = q_i_after	# optimization changed the arg

        # If the operator now has no arguments, it is deleted.
        if len(q._args) == 0:
            return None

        # Convert #SUM to WSUM where appropriate.
        q = QryParser._optimizeSumToWsum(q)

        # Only SCORE operators can have a single argument.  Other
        # query operators that have just one argument are deleted.
        if len(q._args)  == 1 and not isinstance(q, QrySopScore):
            q = q._args[ 0 ]

        return(q)

    @staticmethod
    def _optimizeSumToWsum(q):
        """
        Anywhere in this query, convert a #SUM with duplicate arguments
        into a #WSUM.
        """

        # This code should really be in the Qry class, because it uses
        # knowledge of Qry class variables that should be private.

        # Depth-first processing.
        if type(q) is QryIopTerm:
            return(q)
        
        for i in range(len(q._args)):
            q._args[i] = QryParser._optimizeSumToWsum(q._args[i])

        # Check whether any #SUM arguments can have weights greater
        # than 1.
        if type(q) is not QrySopSum:
            return(q)
        
        weights = [1] * len(q._args)
        for i in range(len(q._args)):
            if weights[i] == 0:
                continue		# Ignore a known duplicate

            for j in range(i+1, len(q._args)):
                if weights[j] == 0:
                    continue		# Ignore a known duplicate

                if QryParser._duplicateQry(q._args[i], q._args[j]):
                    weights[j] = 0
                    weights[i] += 1

        # If all weights are still 1, there are no duplicates, so
        # the original #SUM is fine.  Otherwise, generate a #WSUM.
        if 0 not in weights:
            return(q)
            
        q_wsum = QrySopWsum()
        q_wsum.setDisplayName('#WSUM')
        for i in range(len(q._args)):
            if weights[i] != 0:
                q_wsum.appendArg(q._args[i], weights[i])
        return(q_wsum)

        
    @staticmethod
    def parseString(queryString):
        """
        Parse a query string into a query tree.

        queryString: The query string, in an Indri-style query language.

        Returns The query tree for the parsed query.

        throws IOException: Error accessing the Lucene index.
        throws IllegalArgumentException: Query syntax error.
        """

        # This simple parser is sensitive to parenthensis placement,
        # so check for basic errors first.

        queryString = queryString.strip()	# The last char should be ')'

        if (queryString.count('(') == 0 or
            queryString.count('(') != queryString.count(')') or
            QryParser.__indexOfBalancingParen(queryString) !=
            (len(queryString) - 1)):
            QryParser.__syntaxError(
                'Missing, unbalanced, or misplaced parentheses ')

        # The query language is prefix-oriented, so the query string can
        # be processed left to right.  At each step, a substring is
        # popped from the head (left) of the string, and is converted to
        # a Qry object that is added to the query tree.  Subqueries are
        # handled via recursion.

        # Find the left-most query operator and start the query tree.
        substrings = queryString.split('(', 1)
        queryTree = QryParser.__createOperator(substrings[0].strip())

        # Start consuming queryString by removing the query operator and
        # its terminating ')'.  queryString is always the part of the
        # query that hasn't been processed yet.
        queryString = substrings[ 1 ]
        queryString = queryString[ 0 :
                                   queryString.rindex(')') ].strip()
        
        # Each pass below handles one argument to the query operator.
        # Note: An argument can be a token that produces multiple terms
        # (e.g., "near-death") or a subquery (e.g., "#and (a b c)").
        # Recurse on subqueries.
        while len(queryString) > 0:

	    # STUDENTS:
            # In HW2, you will need to modify this to handle query
            # operators that have weights, e.g., #SUM(0.5 a 1.5 b)

            qargs = []

            if queryString[ 0 ] == '#':		# Subquery
                subquery, queryString = QryParser.__popSubquery (queryString);
                qargs = [ QryParser.parseString (subquery) ]
            else:				# Term
                term, queryString = QryParser.__popTerm (queryString);
                qargs = QryParser.__createTerms(term)

            queryString = queryString.strip()

            # Add the argument(s) to the query tree.
            for q in qargs:
                queryTree.appendArg(q)

        return queryTree;

    
    @staticmethod 
    def __popSubquery(argString):
        """
        Remove a subQuery from an argument string.  Return the subquery
        and the modified argument string.
        
        argString: A partial query argument string, e.g., "#and(a b) c d".
        
        Returns a tuple of the subquery string and the modified argString
        (e.g., (#and(a b)" and "c d).
        """
	
        i = QryParser.__indexOfBalancingParen(argString)
	  
        if i < 0:			# Query syntax error. The parser
            return(argString, '')	# handles it. Here, just don't fail.
    
        return(argString[ 0 : i+1 ], argString[ i+1 : ])

    
    @staticmethod
    def __popTerm(argString):
        """
        Remove a term from an argument string.  Return the term and
        the modified argument string.

        argString: A partial query argument string, e.g., "a b c d".

        Returns a tuple of the term string and the modified argString
        (e.g., "a" and "b c d".
        """
	
        substrings = argString.split(maxsplit=1)

        if len(substrings) < 2:	# Is this the last argument?
            argString = ''
        else:
            argString = substrings[1]

        return((substrings[ 0 ], argString))

    
    @staticmethod
    def __popWeight(argString):
        """
        Remove a weight from an argument string.  Return the weight and
        the modified argument string.

        argString: A partial query argument string, e.g., '3.0 fu 2.0 bar'.
        Returns the weight and the modified argString
        (e.g., 3.0 and 'fu 2.0 bar'.
        """

        substrings = argString.split(maxsplit=1)

        if len(substrings) < 2:
            syntaxError('Missing weight or query argument')

        return(float(substrings[ 0 ]), substrings[ 1 ])

    
    @staticmethod
    def __syntaxError(errorString):
        """
        Throw an error specialized for query parsing syntax errors.

        errorString: The error string.

        throws Exception: The query contained a syntax error
        """

        raise Exception('Query syntax error: ' + errorString)


    @staticmethod
    def tokenizeString(query):
        """                      
        Given part of a query string, returns an array of terms with
        stopwords removed and the terms stemmed using the Krovetz
        stemmer.  Use this method to process raw query terms.

        query: String containing query. 

        Returns a list of query tokens

        throws IOException: Error accessing the Lucene index.
        """

        QryParser.__init()

        tokenStream = QryParser.__ANALYZER.tokenStream(PyLu.JString('dummyField'),
                                                        PyLu.JString(query))
        charTermAttribute = tokenStream.addAttribute(PyLu.LCharTermAttribute)
        tokenStream.reset()

        tokens = []

        while tokenStream.incrementToken():
            term = charTermAttribute.toString()
            tokens.append(str(term))

        tokenStream.close()

        return(tokens)