Source code for heritage.heritage

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Python Interface to The Sanskrit Heritage Site

Use The Sanskrit Heritage Platform using,

* Web mirror
  - no installation required
  - makes HTTP requests
* Local installation
  - faster
  - uses console
  - no HTTP requests required

Using Local Installation
------------------------
- Heritage_Platform/ML/ contains the scripts
- export QUERY_STRING as shell variable
  (referred to as OPTION_STRING in this code, along with the '&text=TEXT' part)
- execute various scripts, such as ./reader
- still produces HTML output that needs to be parsed

# Default input needs to be in the devanagari format
# utils.devanagari_to_velthuis() function will convert this to VH
"""

###############################################################################

import os
import re
import time
import random
import logging
import functools
import subprocess
import urllib.parse
from dataclasses import dataclass, field
from typing import Dict, Optional

import requests
import bs4

from .constants import HERITAGE_COLOURS
from .models import (
    AnalysisCandidate,
    ConjugationCell,
    ConjugationTable,
    DeclensionTable,
    DictionaryEntry,
    SearchResult,
    SolutionAnalysis,
    WordAnalysis,
    WordRole,
)
from .utils import build_query_string, devanagari_to_velthuis


###############################################################################

###############################################################################

# Default timeout (in seconds) for HTTP requests; used by HeritagePlatform
# when the `request_timeout` keyword is not supplied.
DEFAULT_REQUEST_TIMEOUT = 10
# Default number of HTTP attempts before giving up; used by HeritagePlatform
# when the `request_attempts` keyword is not supplied.
DEFAULT_REQUEST_ATTEMPTS = 3
# Alternative spellings of the Latin-1 charset name.
# NOTE(review): presumably used to detect a mis-declared response encoding
# before falling back to another one -- the consumer is not visible here.
_LATIN_FALLBACK_ENCODINGS = {"iso-8859-1", "latin1", "latin-1"}

###############################################################################
# TODO: Do we need to use python-frozendict (PyPI)?


class frozendict(dict):
    """
    Dictionary that can be hashed

    The hash is derived from the current items of the dictionary.
    The mapping itself is not made read-only.
    """

    def __hash__(self):
        items = frozenset(self.items())
        return hash(items)


def freezeargs(func):
    """
    Convert mutable dictionary arguments into hashable frozen ones

    Dictionaries cannot be hashed, and therefore cannot be used as
    arguments of a cached function. This decorator should be added on top
    of @cache. The `cache_info` and `cache_clear` helpers of the decorated
    callable, when present, are made available on the returned wrapper.
    """

    def _freeze(value):
        # Only dictionaries are converted, everything else passes through
        return frozendict(value) if isinstance(value, dict) else value

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        frozen_args = tuple(_freeze(arg) for arg in args)
        frozen_kwargs = {
            name: _freeze(value) for name, value in kwargs.items()
        }
        return func(*frozen_args, **frozen_kwargs)

    for name in ("cache_info", "cache_clear"):
        cache_helper = getattr(func, name, None)
        if callable(cache_helper):
            setattr(wrapper, name, cache_helper)

    return wrapper


###############################################################################


@dataclass
class HeritageAnalysis:
    """
    Morphological features of a single analysis

    Every feature is optional and defaults to None.
    """

    # The fields default to None, so they are annotated as Optional
    case: Optional[str] = None
    number: Optional[str] = None
    gender: Optional[str] = None
    tense: Optional[str] = None


@dataclass
class Token:
    """Placeholder dataclass for a token; it declares no fields yet."""


###############################################################################


[docs]class HeritageOutput:
    """
    Heritage Output Parser

    Parse output generated by various utilities from Heritage Platform
    """

    CLASSES = {"footer": ["enpied"]}

    def __init__(self, html: str):
        """
        Parse the HTML and extract the basic information from it

        Parameters
        ----------
        html : str
            HTML output to be parsed
        """
        self.html = html
        self.logger = logging.getLogger(__name__)
        self.soup = bs4.BeautifulSoup(self.html, "html.parser")
        self.process()

[docs]    def process(self, html: str = None):
        """Process the html and extract basic information"""
        # Allow re-using of the class
        if html is not None:
            self.html = html
            self.soup = bs4.BeautifulSoup(html, "html.parser")

        self.body = self.soup.find("body")
        if self.body is None:
            self.logger.error("No <body> tag found in HTML.")
            self.footer = None
            self.title = self.soup.find("title")
            self.inner_title = None
            self.blocks = []
            return

        self.footer = self.body.find("div", class_=self.CLASSES["footer"])

        # Extract Meta Information
        self.meta = {}
        for meta in self.soup.find_all("meta"):
            if meta.get("name", ""):
                self.meta[meta.get("name")] = meta.get("content", "")
            if meta.get("property", ""):
                self.meta[meta.get("property")] = meta.get("content", "")

        # Extract Title
        self.title = self.soup.find("title")
        self.inner_title = self.body.find("h1", class_="title")

        # Find Relevant Body Children
        self.blocks = self.body.find_all()

[docs]    def extract_analysis(
        self, meta: bool = False, structured: bool = False
    ):
        """
        Extract analysis from HTML

        Parameters
        ----------
        meta : bool
            If True, include meta information, i.e, parse options, classes
            The default is False.
        structured : bool
            If True, return dataclass-based representations.
            The default is False (legacy dictionaries).
        """
        if not self.title or not self.title.text:
            self.logger.error("Missing or empty <title> tag.")
            return None
        if self.title.text != "Sanskrit Reader Companion":
            self.logger.error("Invalid output page.")
            return None

        hr_blocks = self.html.split("<hr>")
        if len(hr_blocks) < 2:
            self.logger.error("No solutions found.")
            return None

        solutions = {}
        for block in hr_blocks[2:]:
            if "Solution" not in block:
                break

            solution = {}

            soup = bs4.BeautifulSoup(block, "html.parser")
            first_span = soup.find("span")
            solution_id = int(first_span.text.split()[1])

            solution["id"] = solution_id
            solution["words"] = []

            if meta:
                parser_url = first_span.find("a")["href"]
                # TODO: Better parsing of options
                parser_options = dict(
                    [
                        e.split("=")
                        for e in re.split(
                            r"&amp;|&|;", parser_url.split("?")[1]
                        )
                    ]
                )
                solution["parser_options"] = parser_options

            tables = soup.find_all("table")
            current_text = None
            for table in tables:
                if table.find("table"):
                    prev = table.previous_sibling
                    current_text = (
                        prev.get_text()
                        if isinstance(prev, bs4.element.Tag)
                        else str(prev).strip()
                    )
                else:
                    # Inner table contains analysis and it occurs after
                    # the original word
                    analyses = self.parse_analysis(
                        table, structured=structured
                    )
                    css_classes = table.get("class", [])
                    if meta:
                        word_classes = css_classes
                    categories = [
                        HERITAGE_COLOURS.get(css_class.split("_back")[0], None)
                        for css_class in css_classes
                    ]
                    if structured:
                        solution["words"].append(
                            WordAnalysis(
                                text=current_text or "",
                                category=categories,
                                classes=css_classes,
                                candidates=analyses,
                            )
                        )
                    else:
                        word = {"text": current_text or ""}
                        if meta:
                            word["classes"] = word_classes
                        word["category"] = categories
                        word_analyses = []
                        for analysis in analyses:
                            word_copy = word.copy()
                            word_copy.update(analysis)
                            word_analyses.append(word_copy)
                        solution["words"].append(word_analyses)

            solutions[solution_id] = solution

        if structured:
            structured_solutions: Dict[int, SolutionAnalysis] = {}
            for solution_id, raw_solution in solutions.items():
                parser_options = (
                    raw_solution.get("parser_options")
                    if meta
                    else None
                )
                structured_solutions[solution_id] = SolutionAnalysis(
                    id=solution_id,
                    words=raw_solution["words"],
                    parser_options=parser_options,
                )
            return structured_solutions
        return solutions

[docs]    def extract_parse(self, structured: bool = False):
        """Extract parse from HTML"""
        if not self.title or not self.title.text:
            self.logger.error("Missing or empty <title> tag.")
            return None
        if self.title.text != "Sanskrit Reader Assistant":
            self.logger.error("Invalid output page.")
            return None

        word_nodes = self.soup.find_all("table", class_="yellow_back")
        roles = []
        for word_node in word_nodes:
            word_text = word_node.get_text().strip()
            word_row = word_node.find_parent("tr")
            tables = word_row.find_all("table")
            # analysis_table = tables[1]
            # word_id_table = tables[2]
            semantic_table = tables[3]
            semantic_rows = semantic_table.find_all("tr")
            word_roles = [row.get_text() for row in semantic_rows]
            if structured:
                roles.append(WordRole(text=word_text, roles=word_roles))
            else:
                roles.append({"text": word_text, "roles": word_roles})
        return roles

[docs]    def extract_declensions(
        self, headers: bool = True, structured: bool = False
    ):
        """
        Extract declension tables from HTML.

        When ``structured`` is True, returns a :class:`DeclensionTable`
        instance; otherwise returns a nested list of header/body cells.
        """
        if not self.title or not self.title.text:
            self.logger.error("Missing or empty <title> tag.")
            return None
        if self.title.text != "Sanskrit Grammarian Declension Engine":
            self.logger.error("Invalid output page.")
            return None
        table = self.soup.find("table", class_="inflexion")
        if table is None:
            self.logger.error("Declension table not found in HTML.")
            return None
        rows = table.find_all("tr")
        output = []
        for row in rows:
            cols = [col.get_text(" ").split() for col in row.find_all("th")]
            output.append(cols)
        output = output[:2] + output[3:] + [output[2]]
        if not headers:
            output = [row[1:] for row in output[1:]]
        if structured:
            flattened = [
                [" ".join(cell).strip() for cell in row] for row in output
            ]
            if headers:
                return DeclensionTable(
                    headers=flattened[0],
                    rows=flattened[1:],
                )
            return DeclensionTable(headers=[], rows=flattened)
        return output

[docs]    def extract_conjugations(
        self, headers: bool = True, structured: bool = False
    ):
        """
        Extract conjugation tables from HTML.

        When ``structured`` is True, returns a list of
        :class:`ConjugationTable` objects; otherwise a nested dictionary
        keyed by table headings.
        """
        if not self.title or not self.title.text:
            self.logger.error("Missing or empty <title> tag.")
            return None
        if self.title.text != "Sanskrit Grammarian Conjugation Engine":
            self.logger.error("Invalid output page.")
            return None
        tables = self.soup.find_all("table", class_="gris_cent")
        forms = {} if not structured else []
        for table in tables:
            header = table.find("span").get_text()
            if not structured:
                forms[header] = {}
            inner_tables = table.find_all("table", class_="inflexion")

            structured_cells = []
            for inner_table in inner_tables:
                rows = inner_table.find_all("tr")
                output = []
                for row in rows:
                    cols = [
                        col.get_text(" ").split() for col in row.find_all("th")
                    ]
                    output.append(cols)
                if structured:
                    heading = " ".join(output[0][0]).strip()
                    flattened_rows = [
                        [" ".join(cell).strip() for cell in row]
                        for row in (output[1:] if headers else output)
                    ]
                    structured_cells.append(
                        ConjugationCell(heading=heading, rows=flattened_rows)
                    )
                else:
                    forms[header][output[0][0][0]] = output

            if structured:
                forms.append(ConjugationTable(title=header, cells=structured_cells))

        return forms

[docs]    def extract_sandhi(self):
        """Extract Sandhi from HTML"""
        if not self.title or not self.title.text:
            self.logger.error("Missing or empty <title> tag.")
            return None
        if self.title.text != "Sanskrit Sandhi Engine":
            self.logger.error("Invalid output page.")
            return None
        pattern = r"\s*([^\s\|]*)\s*\|\s*([^\s=]*)\s*=\s*([^\s]*)\s*"
        for span in self.body.find_all("span"):
            match = re.match(pattern, span.get_text(" "), flags=re.DOTALL)
            if match:
                return match.group(3)

[docs]    def extract_lexicon_entry(self, word_id: str):
        """Extract entry from a lexicon"""
        if not self.title or not self.title.text:
            self.logger.error("Missing or empty <title> tag.")
            return None
        if "Monier-Williams Sanskrit-English" not in self.title.text:
            self.logger.error("Invalid dictionary page.")
            return None
        marker = self.soup.find("a", attrs={"name": word_id})
        if marker is None:
            self.logger.error(
                "Dictionary entry with id '%s' not found.", word_id
            )
            return None
        parent = marker.find_parent()
        container = marker.find_parent("span")
        if container is None:
            container = parent
        lemma = marker.get_text(strip=True)
        if not lemma:
            italic = container.find("i")
            lemma = italic.get_text(strip=True) if italic else word_id
        entry_html = str(container)
        entry_text = container.get_text(" ", strip=True)
        return DictionaryEntry(lemma=lemma, html=entry_html, text=entry_text)

[docs]    def extract_search_results(self, structured: bool = True):
        """Extract dictionary search results."""
        result_table = self.soup.find("table")
        if result_table is None:
            self.logger.error("Could not locate results table.")
            return None
        results = []
        for row in result_table.find_all("tr"):
            cols = row.find_all(["td", "th"])
            if not cols:
                continue
            link_tag = cols[0].find("a")
            entry = (
                link_tag.get_text(strip=True)
                if link_tag
                else cols[0].get_text(strip=True)
            )
            if not entry:
                continue
            link = link_tag["href"] if link_tag else None
            summary_parts = [
                col.get_text(" ", strip=True) for col in cols[1:]
            ]
            summary = " ".join(part for part in summary_parts if part)
            if structured:
                results.append(
                    SearchResult(entry=entry, link=link, summary=summary)
                )
            else:
                results.append(
                    {
                        "entry": entry,
                        "link": link,
                        "summary": summary,
                    }
                )
        return results

[docs]    @staticmethod
    def parse_analysis(
        table: bs4.element.Tag, structured: bool = False
    ):
        """
        Parse analysis of a single word
        Analysis Format is: [root]{analysis_1 | analysis_2 | ..}

        Parameters
        ----------
        table : bs4.element.Tag
            Valid `table` element

        Returns
        -------
        analysies : list
        """
        # pattern = r'\[([^\]]*)\]\{([^\}]*)\}'
        pattern = r"\[(.*?)\]\{([^\}]*)\}"
        rows = table.find_all("tr")
        analyses = []
        logger = logging.getLogger(__name__)
        for row in rows:
            if row is None:
                continue

            link = row.find("a")
            if link is not None:
                link_parts = link["href"].split("/")[-1].split("#")
                file_name, word_id = link_parts[0], link_parts[1]
            else:
                file_name, word_id = None, None
            row_text = row.get_text().strip()
            match = re.match(pattern, row_text, flags=re.DOTALL)
            if match is None:
                logger.debug("Unable to parse analysis row: %s", row_text)
                continue
            parsed_analyses = [
                [abbrev.replace(".", "") for abbrev in an.split()]
                for an in match.group(2).split("|")
            ]
            if structured:
                analyses.append(
                    AnalysisCandidate(
                        root=match.group(1).split()[0].strip(),
                        analyses=parsed_analyses,
                        lexicon_reference=(file_name, word_id),
                    )
                )
            else:
                analyses.append(
                    {
                        "lexicon": (file_name, word_id),
                        "root": match.group(1).split()[0].strip(),
                        "analyses": parsed_analyses,
                    }
                )
        return analyses

    def __repr__(self):
        return repr(self.soup)


###############################################################################


[docs]class HeritagePlatform:
    """
    The Sanskrit Heritage Platform

    Access various utilities from The Sanskrit Heritage Platform
    """

    INRIA_URL = "https://sanskrit.inria.fr/cgi-bin/SKT/"
    ACTIONS = {
        "reader": {"shell": "reader", "web": "sktreader.cgi"},
        "parser": {"shell": "parser", "web": "sktparser.cgi"},
        "search": {"shell": "indexer", "web": "sktindex.cgi"},
        "search_easy": {"shell": "indexerd", "web": "sktsearch.cgi"},
        "declension": {"shell": "declension", "web": "sktdeclin.cgi"},
        "conjugation": {"shell": "conjugation", "web": "sktconjug.cgi"},
        "lemma": {"shell": "lemmatizer", "web": "sktlemmatizer.cgi"},
        "sandhi": {"shell": "sandhier", "web": "sktsandhier.cgi"},
        "user": {"shell": "user_aid", "web": "sktuser.cgi"},
        "interface": {"shell": "interface", "web": "sktgraph.cgi"},
        "dictionary": {"shell": "../MW/", "web": "../../MW/"},
    }

    OPTIONS = {
        "lex": {
            "description": "Lexicon",
            "values": {
                "MW": "Monier-Williams Dictionary (English)",
                "SH": "Sanskrit Heritage Dictionary (French)",
            },
            "default": "MW",
        },
        "font": {
            "description": "Font for Sanskrit output",
            "values": {"deva": "Devanagari", "roma": "Roman (IAST)"},
            "default": "deva",
        },
        "t": {
            "description": "Internal Transliteration Scheme",
            "values": {"VH": "Velthuis"},
            "default": "VH",
        },
    }

    METHODS = ["shell", "web"]
    DEFAULT_METHOD = "shell"

[docs]    def __init__(
        self,
        base_dir: str = "",
        base_url: str = None,
        method: str = "shell",
        **kwargs,
    ):
        """
        Initialize Heritage Class

        Parameters
        ----------
        base_dir : str
            Path to the Heritage_Platform repository.
            The directory should contain 'ML' sub-directory,
            which further contains the scripts
        base_url : str, optional
            URL for the Heritage Platform Mirror.
            If None, the official INRIA website will be used.
            The default is None.
        method : str, optional
            Method used to obtain results. Results can be obtained either using
            the web installation or using UNIX shell.

            Possible values are, 'shell' and 'web'
            The default is 'shell'.
        **kwargs :
            Additional configuration keywords. Supported values are:

            * ``request_timeout`` (int): timeout for HTTP requests in seconds.
            * ``request_attempts`` (int): number of HTTP retries before giving up.
        """
        self.logger = logging.getLogger(__name__)
        self.base_url = self.INRIA_URL if base_url is None else base_url
        self.base_dir = base_dir
        self.scripts_dir = os.path.join(self.base_dir, "ML")
        self.request_timeout = kwargs.pop(
            "request_timeout", DEFAULT_REQUEST_TIMEOUT
        )
        self.request_attempts = kwargs.pop(
            "request_attempts", DEFAULT_REQUEST_ATTEMPTS
        )

        self.method = None
        self.set_method(method)

        if not self.valid_installation():
            self.logger.warning(
                "Heritage Platform installation not found. "
                "Falling back to `method=\"web\"`."
            )
            self.base_dir = ""
            self.scripts_dir = ""
            self.set_method("web")

        self.options = {}
        for option in self.OPTIONS:
            self.options[option] = self.OPTIONS[option]["default"]

    ###########################################################################
    # Utilities (Actions)

[docs]    def get_analysis(
        self,
        input_text: str,
        sentence: bool = True,
        unsandhied: bool = False,
        meta: bool = False,
        structured: bool = True,
    ):
        """
        Obtain morphological analyses using The Sanskrit Reader Companion

        Parameters
        ----------
        input_text : str
            Input text to analyse
        sentence : bool, optional
            The input is treated as a sentence, if true, otherwise as a word.
            The default is True.
        unsandhied : bool, optional
            If True, the input text is assumed to not contain sandhi.
            The default is False.
        meta : bool, optional
            The option is passed to HeritageOutput.extract_analysis().
            The default is False.
        structured : bool, optional
            Return dataclass objects if True, otherwise legacy dictionaries.
            The default is True.

        Returns
        -------
        dict[int, SolutionAnalysis] | dict
            Dictionary of valid morphological analyses with solution_id as keys
        """

        opt_st = "t" if sentence else "f"
        opt_us = "t" if unsandhied else "f"

        options = {
            "lex": self.get_lexicon(),
            "cache": "t",  # Use Cache (t)rue, (f)alse
            "st": opt_st,  # Sentence (t)rue, Word (f)alse
            "us": opt_us,  # Unsandhied (t)rue, (f)alse
            # if 'us' is 'f', "ca eva" is parsed as "ca_eva",
            # "tathā eva" as "tathā_eva" etc.
            "cp": "t",  # Full Parser Strength (t)rue, (f)alse
            "t": self.get_option("t"),
            "mode": "p",  # Parse Mode (p)arsing, (t)agging
            # Tagging does not prune any solutions
            "font": self.get_font(),
            # Output Display Font (deva)nagari (roma)n
            "topic": "",
            "corpmode": "",
            "corpdir": "",
            "sentno": "",
            "text": self.prepare_input(input_text),
        }
        result = self.get_result("reader", options)
        if result is None:
            return None

        output = HeritageOutput(result)
        # return output
        return output.extract_analysis(meta=meta, structured=structured)

    # ----------------------------------------------------------------------- #

[docs]    def get_parse(
        self,
        input_text: str,
        solution_id: int = None,
        sentence: bool = True,
        unsandhied: bool = False,
    ):
        """
        Obtain parse of a sentence using The Sanskrit Reader Companion

        Parameters
        ----------
        input_text : str
            Input text to analyse
        solution_id : int, optional
            Solution ID to parse.
            If None, the first solution ID is used.
            The default is None.
        sentence : bool, optional
            The input is treated as a sentence, if true, otherwise as a word.
            The option is passed to HeritagePlatform.get_analysis().
            The default is True.
        unsandhied : bool, optional
            If True, the input text is assumed to not contain sandhi.
            The option is passed to HeritagePlatform.get_analysis().
            The default is False.

        Returns
        -------
        SolutionAnalysis | dict
            Parse of the sentence. By default a
            :class:`heritage.models.SolutionAnalysis` instance is returned,
            but legacy dictionary outputs are still supported when using the
            non-structured APIs.
        """
        solutions = self.get_analysis(
            input_text,
            sentence=sentence,
            unsandhied=unsandhied,
            meta=True,
            structured=True,
        )

        # If solution ID not provided, use the first solution
        if solution_id is None:
            if not solutions:
                return None  # TODO: Change this to something ?

            solution_id = next(iter(solutions))

        # No need to manually give options again, since it does it for us
        # Internally parser is a re-run of reader until a specific solution
        # Remove following block in later versions

        # opt_st = 't' if sentence else 'f'
        # opt_us = 't' if unsandhied else 'f'

        # options = {
        #     'lex': self.get_lexicon(),
        #     'cache': 't',  # Use Cache (t)rue, (f)alse
        #     'st': opt_st,  # Sentence (t)rue, Word (f)alse
        #     'us': opt_us,  # Unsandhied (t)rue, (f)alse
        #                    # if 'us' is 'f', "ca eva" is parsed as "ca_eva",
        #                    # "tathā eva" as "tathā_eva" etc.
        #     'cp': 't',     # Full Parser Strength (t)rue, (f)alse
        #     't': self.get_option('t'),
        #     'mode': 'p',   # Parse Mode (p)arse, (g)raph, (s)ummary
        #     'font': self.get_font(),
        #                    # Output Display Font (deva)nagari (roma)n
        #     'topic': '',
        #     'n': solution_id,
        #     'abs': 'f',     # TODO: Find out what this does
        #     'text': self.prepare_input(input_text)
        # }

        solution = solutions[solution_id]
        parser_options = solution.parser_options if isinstance(
            solution, SolutionAnalysis
        ) else solution["parser_options"]
        options = dict(parser_options or {})
        result = self.get_result("parser", options)
        if result is None:
            return None
        output = HeritageOutput(result)
        roles = output.extract_parse(structured=isinstance(solution, SolutionAnalysis))
        if isinstance(solution, SolutionAnalysis):
            solution.roles = roles
            return solution
        solution["roles"] = roles
        return solution

    # ----------------------------------------------------------------------- #

[docs]    def sandhi(self, word_1: str, word_2: str, mode: str = "internal"):
        """
        Join two words by forming a Sandhi

        Parameters
        ----------
        word_1 : str
            The first (left) word in the Sandhi
        word_2 : str
            The second (right) word in the Sandhi
        mode : str, optional
            Indicates whether the words join to form a single word or not
            Possible values are,
            * internal
            * external
            The default is 'internal'.

        Returns
        -------
        sandhi : str
            String obtained by forming the Sandhi
        """
        if mode not in ["internal", "external"]:
            self.logger.warning(f"Invalid mode: '{mode}'")

        options = {
            "lex": self.get_lexicon(),
            "l": self.prepare_input(word_1),
            "r": self.prepare_input(word_2),
            "t": self.get_option("t"),
            "k": mode,
            "font": self.get_font(),
        }
        result = self.get_result("sandhi", options)
        if result is None:
            return None
        output = HeritageOutput(result)

        return output.extract_sandhi()

    # ----------------------------------------------------------------------- #

[docs]    def search_inflected_form(self, word: str, category: str):
        """
        Search an inflected form

        Parameters
        ----------
        word : str
            Sanskrit Word to search (in Devanagari)
        category : str
            Type of the word
                * Noun: Noun
                * Pron: Pronoun
                * Part: Participle
                * Inde: Indeclinible
                * Absya, Abstvaa, Voca, Iic, Ifc, Iiv, Piic etc.
        Returns
        -------
        matches : list
            List of matches.
        """
        options = {
            "t": self.get_option("t"),
            "q": self.prepare_input(word),
            "c": category,
            "font": self.get_font(),
        }
        result = self.get_result("lemma", options)
        if result is None:
            return None
        output = HeritageOutput(result)

        # TODO: Output Parsing
        return output

    # ----------------------------------------------------------------------- #

[docs]    def get_declensions(
        self,
        word: str,
        gender: str,
        headers: bool = True,
        lexicon: str = None,
        structured: bool = True,
    ):
        """
        Retrieve declension tables from the Grammarian.

        Parameters
        ----------
        word : str
            Input word in Devanagari.
        gender : str
            Gender hint. Accepted values include short forms (``m``, ``f``,
            ``n``) and Sanskrit labels (e.g. ``पु``, ``स्त्री``).
        headers : bool, optional
            If ``True``, include header row information. The default is True.
        lexicon : str, optional
            Reserved for future use. Currently ignored.
        structured : bool, optional
            When ``True`` (the default), returns a
            :class:`heritage.models.DeclensionTable` instance. When ``False``,
            returns the raw nested list produced by
            :meth:`HeritageOutput.extract_declensions`.

        Returns
        -------
        DeclensionTable | list | None
            Structured table, legacy list-of-lists, or ``None`` when no table
            can be extracted.
        """
        options = {
            "lex": self.get_lexicon(),
            "t": self.get_option("t"),
            "q": self.prepare_input(word),
            "g": self.identify_gender(gender),
            "font": self.get_font(),
        }
        result = self.get_result("declension", options)
        if result is None:
            return None
        output = HeritageOutput(result)

        return output.extract_declensions(
            headers=headers, structured=structured
        )

    # ----------------------------------------------------------------------- #

[docs]    def get_conjugations(
        self,
        word: str,
        gana: str,
        lexicon: str = None,
        headers: bool = True,
        structured: bool = True,
    ):
        """
        Retrieve conjugation paradigms from the Grammarian.

        Parameters
        ----------
        word : str
            Verbal root in Devanagari.
        gana : str
            Verbal class (gaṇa) identifier expected by the backend.
        lexicon : str, optional
            Reserved for future use. Currently ignored.
        headers : bool, optional
            If ``True``, treat the first row of each table as a heading.
        structured : bool, optional
            When ``True`` (the default), returns a list of
            :class:`heritage.models.ConjugationTable` objects. When ``False``,
            returns the legacy dictionary-of-tables output.

        Returns
        -------
        list[ConjugationTable] | dict | None
            Structured tables, legacy mapping, or ``None`` on failure.
        """
        options = {
            "lex": self.get_lexicon(),
            "t": self.get_option("t"),
            "q": self.prepare_input(word),
            "c": gana,
            "font": self.get_font(),
        }
        result = self.get_result("conjugation", options)
        if result is None:
            return None
        output = HeritageOutput(result)

        return output.extract_conjugations(
            headers=headers, structured=structured
        )

    # ----------------------------------------------------------------------- #

[docs]    def search_lexicon(
        self, word: str, lexicon: str = None, structured: bool = True
    ):
        """Search a word in the dictionary.

        Parameters
        ----------
        word : str
            Sanskrit Word to search (in Devanagari)
        lexicon : str, optional
            Lexicon to search the word in.
            Possible values are,

              - MW: Monier-Williams Dictionary
              - SH: Heritage Dictionary

            The default is 'MW'.

        Returns
        -------
        list[SearchResult] | list[dict] | None
            Parsed search results (the default), legacy dictionaries when
            ``structured`` is False, or ``None`` when the backend response
            cannot be parsed.
        """
        options = {
            "lex": self.get_lexicon(),
            "t": self.get_option("t"),
            "q": self.prepare_input(word),
            "font": self.get_font(),
        }
        result = self.get_result("search", options)
        if result is None:
            return None
        output = HeritageOutput(result)

        # TODO: Currently not using the lexicon keyword argument
        # Is there any use for that argument? For this function?
        return output.extract_search_results(structured=structured)

    ###########################################################################

[docs]    @functools.lru_cache(maxsize=None)
    def get_lexicon_entry(self, file_name: str, word_id: str):
        """
        Fetch a single dictionary entry by its file and anchor identifier.

        The implementation reuses the same HTML parser used for direct search
        results and returns a :class:`heritage.models.DictionaryEntry`
        instance.

        Parameters
        ----------
        file_name : str
            Name of the HTML file containing the entry.
        word_id : str
            Anchor identifier within the dictionary page.

        Returns
        -------
        DictionaryEntry | None
            Parsed entry when available, otherwise ``None``.
        """
        if self.method == "shell":
            path = self.get_path("dictionary")
            file_path = os.path.join(path, file_name)
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
        elif self.method == "web":
            url = self.get_url("dictionary")
            query_url = f"{url}{file_name}#{word_id}"
            content = self.__get(
                query_url, self.request_attempts, self.request_timeout
            )
        else:
            self.logger.error(f"Invalid method: '{self.method}'.")
            return

        if content is None:
            return None

        output = HeritageOutput(content)
        return output.extract_lexicon_entry(word_id)

    ###########################################################################
    # Fetch Result through Web or Shell

[docs]    def get_result_from_web(
        self,
        url: str,
        options: dict,
        attempts: int = None,
        timeout: int = None,
    ):
        """
        Get results from the Heritage Platform web mirror
        Exponential backoff is used in case there are network errors

        Parameters
        ----------
        url : str
            URL of the CGI script to call
            HeritagePlatform.get_url() can be used to generate supported URLs
        options : dict
            Dictionary containing valid options for the script
        attempts : int, optional
            Number of attempts for the exponential backoff
            The default is `self.request_attempts`.
        timeout : int, optional
            Timeout for the HTTP request in seconds.
            The default is `self.request_timeout`.

        Returns
        -------
        str
            Result (HTML) obtained. Returns ``None`` when every attempt fails.
        """

        attempts = attempts or self.request_attempts
        timeout = timeout or self.request_timeout
        query_string = build_query_string(options)
        query_url = f"{url}?{query_string}"
        return self.__get(query_url, attempts, timeout)

    def __get(self, query_url: str, attempts: int, timeout: int):
        """
        Query web with exponential-backoff, memoising successful responses

        Responses are cached per instance, keyed by
        ``(query_url, attempts, timeout)``. Only successful fetches are
        stored: a ``None`` result (every attempt failed) is returned but not
        remembered, so the next call retries instead of replaying a
        transient failure forever.

        Parameters
        ----------
        query_url : str
            URL to query
        attempts : int
            Number of attempts for the exponential backoff
        timeout : int
            Timeout for the HTTP request in seconds.

        Returns
        -------
        str
            Result (HTML) obtained, or ``None`` when every attempt failed
        """
        # Per-instance dict instead of functools.lru_cache: lru_cache on a
        # method holds a strong reference to ``self`` and caches failures.
        try:
            cache = self.__response_cache
        except AttributeError:
            cache = self.__response_cache = {}

        key = (query_url, attempts, timeout)
        if key in cache:
            return cache[key]

        content = self._query_with_backoff(query_url, attempts, timeout)
        if content is not None:
            cache[key] = content
        return content

[docs]    def _query_with_backoff(
        self, query_url: str, attempts: int, timeout: int
    ):
        """
        Fetch a URL with exponential backoff and robust decoding.

        Returns decoded response text on success, otherwise ``None``.
        """
        attempts = max(1, attempts)
        last_error = None
        response = None

        for attempt in range(attempts):
            try:
                response = requests.get(query_url, timeout=timeout)
            except requests.RequestException as exc:
                last_error = exc
                self.logger.warning(
                    "Attempt %s/%s failed for %s: %s",
                    attempt + 1,
                    attempts,
                    query_url,
                    exc,
                )
            else:
                if response.status_code == requests.codes.ok:
                    return self._response_text(response)

                self.logger.warning(
                    "Status code %s on attempt %s/%s for %s",
                    response.status_code,
                    attempt + 1,
                    attempts,
                    query_url,
                )

            if attempt < attempts - 1:
                backoff = (2 ** attempt) + random.random()
                time.sleep(backoff)

        if last_error is not None:
            self.logger.error(
                "Unable to fetch %s after %s attempts due to network errors.",
                query_url,
                attempts,
                exc_info=last_error,
            )
        elif response is not None:
            self.logger.error(
                "Unable to fetch %s after %s attempts. Last status: %s",
                query_url,
                attempts,
                response.status_code,
            )
        return None

[docs]    @staticmethod
    def _response_text(response: requests.Response) -> str:
        """Return response body decoded as UTF-8, avoiding mojibake."""
        encoding = response.encoding
        if not encoding or encoding.lower() in _LATIN_FALLBACK_ENCODINGS:
            encoding = response.apparent_encoding or "utf-8"

        try:
            return response.content.decode(encoding or "utf-8")
        except UnicodeDecodeError:
            return response.content.decode("utf-8", errors="replace")

    # ----------------------------------------------------------------------- #

[docs]    def get_result_from_shell(
        self, path: str, options: dict, timeout: int = 30
    ):
        """
        Get results from the Heritage Platform's local installation via shell

        Parameters
        ----------
        path : str
            Path to the executable script
            HeritagePlatform.get_path() can be used to generate supported paths
        options : dict
            Valid options for the script
        timeout : int, optional
            Timeout in seconds, after which the function will abort.
            The default is 30.

        Returns
        -------
        result : str
            Result (HTML) obtained
        """
        query_string = build_query_string(options)
        env = os.environ.copy()
        env["QUERY_STRING"] = query_string
        environment = frozendict(env)
        return self.__run(path, environment, timeout=timeout)

    @functools.lru_cache(maxsize=None)
    def __run(self, path, environment: dict, timeout: int = 30):
        """
        Get results from shell through a subprocess call

        Parameters
        ----------
        path : str
            Path to the executable script
        environment : dict
            Environment variables to set
        timeout : int, optional
            Timeout in seconds, after which the function will abort.
            The default is 30.

        Returns
        -------
        result : str
            Result (HTML) obtained
        """
        try:
            result_header = "Content-Type: text/html\n\n"
            result = subprocess.check_output(
                path, env=environment, timeout=timeout
            ).decode("utf-8")
            result = result[len(result_header) :]
        except subprocess.TimeoutExpired:
            self.logger.error("Timeout while executing '%s'.", path)
            return None
        except subprocess.SubprocessError as exc:
            self.logger.error("Subprocess error while executing '%s': %s", path, exc)
            return None
        except OSError as exc:
            self.logger.error("OS error while executing '%s': %s", path, exc)
            return None
        return result

    # ----------------------------------------------------------------------- #

[docs]    def get_result(self, action: str, options: dict, *args, **kwargs):
        """
        High-level function to obtain result for various actions

        Avoids the hassle of generating the URL or PATH.
        Utilizes the HeritagePlatform.method attribute to determine
        whether to fetch through shell or web.

        Parameters
        ----------
        action : str
            Action value corresponding to the utility to be used.
            Refer to HeritagePlatform.ACTIONS
        options : dict
            Valid options for the specified action

        Returns
        -------
        str
            Result (HTML) obtained
        """
        if self.method == "shell":
            path = self.get_path(action)
            return self.get_result_from_shell(path, options, *args, **kwargs)
        if self.method == "web":
            url = self.get_url(action)
            return self.get_result_from_web(url, options, *args, **kwargs)
        self.logger.error(f"Invalid method: '{self.method}'.")

    ###########################################################################

[docs]    def get_method(self):
        """Get the current method"""
        return self.method

[docs]    def set_method(self, method: str):
        """
        Set method for fetching the output

        Valid methods are listed in HeritagePlatform.METHODS
        """
        if method.lower() in self.METHODS:
            self.method = method.lower()
            return True
        self.logger.warning(f"Invalid method: '{method}'")
        if self.method is None:
            self.method = self.DEFAULT_METHOD
        return False

    # ----------------------------------------------------------------------- #

[docs]    def get_option(self, opt_name: str):
        """Get the value of global options"""
        if opt_name not in self.OPTIONS:
            self.logger.warning(f"Invalid option: '{opt_name}'")
            return None
        return self.options.get(opt_name, None)

[docs]    def set_option(self, opt_name: str, opt_value: str):
        """Set global options

        Any of these options, if expected by a particular utility from the
        Heritage Platform, will be directly used in the QUERY_STRING while
        fetching the output from that utility

        class variable OPTIONS stores the default values for options

        Each option contains,
        - a 'description' of the option
        - 'values' it can take (and descriptions of those values)
        - 'default' value

        """

        opt_name = opt_name.lower()
        if opt_name not in self.OPTIONS:
            self.logger.warning(f"Invalid option: '{opt_name}'")
            return False

        if opt_value in self.OPTIONS[opt_name]["values"]:
            self.options[opt_name] = opt_value
            return True

        self.logger.warning(
            f"Invalid value for option '{opt_name}': '{opt_value}'"
        )
        return False

    # ----------------------------------------------------------------------- #

[docs]    def get_font(self):
        """Get current font for Sanskrit Output"""
        return self.get_option("font")

[docs]    def set_font(self, font: str):
        """Set font for Sanskrit output"""
        return self.set_option("font", font.lower())

    # ----------------------------------------------------------------------- #

[docs]    def get_lexicon(self):
        """Get current lexicon"""
        return self.get_option("lex")

[docs]    def set_lexicon(self, lexicon: str):
        """Set lexicon"""
        return self.set_option("lex", lexicon.upper())

    ###########################################################################
    # URL or Path Builders

[docs]    def get_url(self, action: str):
        """URL Builder"""
        return urllib.parse.urljoin(self.base_url, self.ACTIONS[action]["web"])

[docs]    def get_path(self, action: str):
        """Path Builder"""
        return os.path.join(self.scripts_dir, self.ACTIONS[action]["shell"])

    ###########################################################################

[docs]    def valid_installation(self):
        """Check if the Heritage Platform installation exists"""
        # TODO: A better check may be checking for the required executables
        # * If the file exists
        # * If the file is executable
        return os.path.isdir(self.scripts_dir)

    ###########################################################################

    def __repr__(self):
        params = {
            "repository": self.base_dir,
            "url": self.base_url,
            "method": self.method,
        }
        repr_params = ", ".join([f'{k}="{v}"' for k, v in params.items()])
        return f"{self.__class__.__name__}({repr_params})"

    ###########################################################################
    # TODO: Move these to utils.py ??

[docs]    @staticmethod
    def prepare_input(input_text: str):
        """
        Prepare Input
            * Convert Devanagari to Velthuis
            * Join words by '+' instead of by whitespaces
        """
        return "+".join(devanagari_to_velthuis(input_text).split())

[docs]    @staticmethod
    def identify_gender(gender: str):
        genders = {
            "Mas": ["पु", "m"],
            "Fem": ["स्त्री", "f"],
            "Neu": ["नपु", "n"],
            "Any": ["*", "त्रि", "a"],
        }
        for gender_key, gender_list in genders.items():
            for g in gender_list:
                if gender.lower().startswith(g):
                    return gender_key


###############################################################################