Source code for heritage.heritage

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Python Interface to The Sanskrit Heritage Site

Use The Sanskrit Heritage Platform using,

* Web mirror
  - no installation required
  - makes HTTP requests
* Local installation
  - faster
  - uses console
  - no HTTP requests required

Using Local Installation
------------------------
- Heritage_Platform/ML/ contains the scripts
- export QUERY_STRING as shell variable
  (referred to as OPTION_STRING in this code, along with the '&text=TEXT' part)
- execute various scripts, such as ./reader
- still produces HTML output that needs to be parsed

# Default input needs to be in the devanagari format
# utils.devanagari_to_velthuis() function will convert this to VH
"""

###############################################################################

import os
import re
import time
import random
import logging
import functools
import subprocess
import urllib.parse
from dataclasses import dataclass, field
from typing import Dict, Optional

import requests
import bs4

from .constants import HERITAGE_COLOURS
from .models import (
    AnalysisCandidate,
    ConjugationCell,
    ConjugationTable,
    DeclensionTable,
    DictionaryEntry,
    SearchResult,
    SolutionAnalysis,
    WordAnalysis,
    WordRole,
)
from .utils import build_query_string, devanagari_to_velthuis


###############################################################################

###############################################################################

DEFAULT_REQUEST_TIMEOUT = 10
DEFAULT_REQUEST_ATTEMPTS = 3
_LATIN_FALLBACK_ENCODINGS = {"iso-8859-1", "latin1", "latin-1"}

###############################################################################
# TODO: Do we need to use python-frozendict (PyPI)?


class frozendict(dict):
    """
    Dictionary subclass that can be hashed.

    The hash is derived from the current set of ``(key, value)`` pairs, so
    every value must itself be hashable. The class does not block mutation;
    callers are expected not to modify an instance after it has been used
    as a cache key.
    """

    def __hash__(self):
        return hash(frozenset(self.items()))
def freezeargs(func):
    """
    Transform mutable dictionary arguments into hashable frozen ones.

    Useful to make a function compatible with ``functools`` caches, which
    require hashable arguments. Should be applied on top of the cache
    decorator.

    Parameters
    ----------
    func : callable
        Function to wrap (typically already wrapped by a cache decorator).

    Returns
    -------
    callable
        Wrapper that replaces every ``dict`` positional or keyword argument
        with a ``frozendict`` before calling ``func``.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        frozen_args = tuple(
            frozendict(arg) if isinstance(arg, dict) else arg for arg in args
        )
        frozen_kwargs = {
            key: frozendict(value) if isinstance(value, dict) else value
            for key, value in kwargs.items()
        }
        return func(*frozen_args, **frozen_kwargs)

    # Re-expose the cache management helpers of the wrapped function, if any
    for method in ("cache_info", "cache_clear"):
        if callable(getattr(func, method, None)):
            setattr(wrapper, method, getattr(func, method))

    return wrapper
###############################################################################
@dataclass
class HeritageAnalysis:
    """Container for the grammatical features of a single analysis."""

    # Every feature is optional and defaults to None when not specified
    case: Optional[str] = None
    number: Optional[str] = None
    gender: Optional[str] = None
    tense: Optional[str] = None
@dataclass
class Token:
    """Placeholder dataclass; it currently declares no fields."""
###############################################################################
class HeritageOutput:
    """
    Heritage Output Parser

    Parse output generated by various utilities from Heritage Platform
    """

    CLASSES = {"footer": ["enpied"]}

    def __init__(self, html: str):
        """
        Parse the HTML and extract the basic page information.

        Parameters
        ----------
        html : str
            HTML produced by one of the Heritage Platform utilities.
        """
        self.logger = logging.getLogger(__name__)
        self.html = html
        self.soup = bs4.BeautifulSoup(html, "html.parser")

        # Make sure every parsed attribute exists, whatever the page holds
        self.body = None
        self.footer = None
        self.meta = {}
        self.title = None
        self.inner_title = None
        self.blocks = []

        self.process()
def process(self, html: Optional[str] = None):
    """
    Process the HTML and extract basic information.

    Sets the attributes ``body``, ``footer``, ``meta``, ``title``,
    ``inner_title`` and ``blocks``. All of them are reset on every call, so
    that re-using the instance never exposes values of a previous page.

    Parameters
    ----------
    html : str, optional
        New HTML to parse. If None, the HTML already stored on the instance
        is processed. The default is None.
    """
    # Allow re-using of the class
    if html is not None:
        self.html = html
        self.soup = bs4.BeautifulSoup(html, "html.parser")

    self.meta = {}
    self.footer = None
    self.inner_title = None
    self.blocks = []
    self.title = self.soup.find("title")

    self.body = self.soup.find("body")
    if self.body is None:
        self.logger.error("No <body> tag found in HTML.")
        return

    self.footer = self.body.find("div", class_=self.CLASSES["footer"])

    # Extract Meta Information (keyed by both `name` and `property`)
    for meta in self.soup.find_all("meta"):
        content = meta.get("content", "")
        for attribute in ("name", "property"):
            key = meta.get(attribute, "")
            if key:
                self.meta[key] = content

    # Extract Title
    self.inner_title = self.body.find("h1", class_="title")

    # Find Relevant Body Children
    self.blocks = self.body.find_all()
def extract_analysis(self, meta: bool = False, structured: bool = False):
    """
    Extract analysis from HTML

    Parameters
    ----------
    meta : bool
        If True, include meta information, i.e, parse options, classes
        The default is False.
    structured : bool
        If True, return dataclass-based representations.
        The default is False (legacy dictionaries).

    Returns
    -------
    dict or None
        Mapping from solution id to the solution (``SolutionAnalysis`` when
        ``structured`` is True, a plain dictionary otherwise), or None when
        the page is not a valid Reader Companion output.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Reader Companion":
        self.logger.error("Invalid output page.")
        return None

    hr_blocks = self.html.split("<hr>")
    if len(hr_blocks) < 2:
        self.logger.error("No solutions found.")
        return None

    solutions = {}
    # Solutions are looked for from the third <hr>-separated chunk onwards
    for block in hr_blocks[2:]:
        if "Solution" not in block:
            break

        soup = bs4.BeautifulSoup(block, "html.parser")
        first_span = soup.find("span")
        try:
            solution_id = int(first_span.text.split()[1])
        except (AttributeError, IndexError, ValueError):
            self.logger.warning("Unable to identify solution id, skipping.")
            continue

        solution = {"id": solution_id, "words": []}

        if meta:
            # TODO: Better parsing of options
            link = first_span.find("a")
            parser_url = link.get("href", "") if link is not None else ""
            query = parser_url.partition("?")[2]
            parser_options = {}
            for item in re.split(r"&amp;|&|;", query):
                if not item:
                    continue
                # partition keeps any further '=' inside the value
                key, _, value = item.partition("=")
                parser_options[key] = value
            solution["parser_options"] = parser_options

        current_text = None
        for table in soup.find_all("table"):
            if table.find("table"):
                # Outer table: the original word is its previous sibling
                prev = table.previous_sibling
                if prev is None:
                    current_text = None
                elif isinstance(prev, bs4.element.Tag):
                    current_text = prev.get_text()
                else:
                    current_text = str(prev).strip()
                continue

            # Inner table contains analysis and it occurs after
            # the original word
            analyses = self.parse_analysis(table, structured=structured)
            css_classes = table.get("class", [])
            # Computed unconditionally: the structured output needs the
            # categories even when `meta` is False
            categories = [
                HERITAGE_COLOURS.get(css_class.split("_back")[0], None)
                for css_class in css_classes
            ]

            if structured:
                solution["words"].append(
                    WordAnalysis(
                        text=current_text or "",
                        category=categories,
                        classes=css_classes,
                        candidates=analyses,
                    )
                )
                continue

            word = {"text": current_text or ""}
            if meta:
                word["classes"] = css_classes
                word["category"] = categories
            solution["words"].append(
                [{**word, **analysis} for analysis in analyses]
            )

        solutions[solution_id] = solution

    if not structured:
        return solutions

    structured_solutions: Dict[int, SolutionAnalysis] = {}
    for solution_id, raw_solution in solutions.items():
        structured_solutions[solution_id] = SolutionAnalysis(
            id=solution_id,
            words=raw_solution["words"],
            parser_options=(
                raw_solution.get("parser_options") if meta else None
            ),
        )
    return structured_solutions
def extract_parse(self, structured: bool = False):
    """
    Extract parse from HTML

    Parameters
    ----------
    structured : bool
        If True, return ``WordRole`` objects, otherwise dictionaries with
        the keys ``text`` and ``roles``. The default is False.

    Returns
    -------
    list or None
        One entry per word found, or None when the page is not a valid
        Reader Assistant output. A word whose row does not have the expected
        layout is kept with an empty list of roles.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Reader Assistant":
        self.logger.error("Invalid output page.")
        return None

    roles = []
    for word_node in self.soup.find_all("table", class_="yellow_back"):
        word_text = word_node.get_text().strip()
        word_row = word_node.find_parent("tr")
        tables = word_row.find_all("table") if word_row is not None else []

        # tables[1] is the analysis table, tables[2] the word id table and
        # tables[3] the semantic table
        if len(tables) > 3:
            word_roles = [row.get_text() for row in tables[3].find_all("tr")]
        else:
            self.logger.warning(
                "Semantic table not found for word '%s'.", word_text
            )
            word_roles = []

        if structured:
            roles.append(WordRole(text=word_text, roles=word_roles))
        else:
            roles.append({"text": word_text, "roles": word_roles})

    return roles
def extract_declensions(self, headers: bool = True, structured: bool = False):
    """
    Extract declension tables from HTML.

    Parameters
    ----------
    headers : bool
        If True, keep the first row and the first column of the table.
        The default is True.
    structured : bool
        If True, return a :class:`DeclensionTable` instance, otherwise a
        nested list of header/body cells. The default is False.

    Returns
    -------
    DeclensionTable or list or None
        None when the page is not a valid Declension Engine output or does
        not contain the declension table.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Grammarian Declension Engine":
        self.logger.error("Invalid output page.")
        return None

    table = self.soup.find("table", class_="inflexion")
    if table is None:
        self.logger.error("Declension table not found in HTML.")
        return None

    output = [
        [col.get_text(" ").split() for col in row.find_all("th")]
        for row in table.find_all("tr")
    ]

    # Move the third row to the end; tables that are too short are left
    # untouched instead of raising IndexError
    if len(output) > 2:
        output = output[:2] + output[3:] + [output[2]]

    if not headers:
        output = [row[1:] for row in output[1:]]

    if not structured:
        return output

    flattened = [[" ".join(cell).strip() for cell in row] for row in output]
    if headers and flattened:
        return DeclensionTable(headers=flattened[0], rows=flattened[1:])
    return DeclensionTable(headers=[], rows=flattened)
def extract_conjugations(self, headers: bool = True, structured: bool = False):
    """
    Extract conjugation tables from HTML.

    Parameters
    ----------
    headers : bool
        Only used when ``structured`` is True: if True, the first row of
        every inner table is left out of the rows of its cell.
        The default is True.
    structured : bool
        If True, return a list of :class:`ConjugationTable` objects,
        otherwise a nested dictionary keyed by table headings.
        The default is False.

    Returns
    -------
    list or dict or None
        None when the page is not a valid Conjugation Engine output.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Grammarian Conjugation Engine":
        self.logger.error("Invalid output page.")
        return None

    forms = [] if structured else {}
    for table in self.soup.find_all("table", class_="gris_cent"):
        span = table.find("span")
        if span is None:
            self.logger.warning("Conjugation table without heading skipped.")
            continue
        header = span.get_text()
        if not structured:
            forms[header] = {}

        structured_cells = []
        for inner_table in table.find_all("table", class_="inflexion"):
            output = [
                [col.get_text(" ").split() for col in row.find_all("th")]
                for row in inner_table.find_all("tr")
            ]
            # The first cell of the first row names the inner table
            if not output or not output[0]:
                continue
            heading_tokens = output[0][0]

            if structured:
                data_rows = output[1:] if headers else output
                structured_cells.append(
                    ConjugationCell(
                        heading=" ".join(heading_tokens).strip(),
                        rows=[
                            [" ".join(cell).strip() for cell in row]
                            for row in data_rows
                        ],
                    )
                )
            elif heading_tokens:
                forms[header][heading_tokens[0]] = output

        if structured:
            forms.append(
                ConjugationTable(title=header, cells=structured_cells)
            )

    return forms
def extract_sandhi(self):
    """
    Extract Sandhi from HTML

    Returns
    -------
    str or None
        Third field of the first ``<span>`` whose text has the form
        ``left | right = result``. None when the page is not a valid Sandhi
        Engine output or no such span exists.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Sandhi Engine":
        self.logger.error("Invalid output page.")
        return None

    if self.body is None:
        self.logger.error("No <body> tag found in HTML.")
        return None

    pattern = re.compile(
        r"\s*([^\s\|]*)\s*\|\s*([^\s=]*)\s*=\s*([^\s]*)\s*", flags=re.DOTALL
    )
    for span in self.body.find_all("span"):
        match = pattern.match(span.get_text(" "))
        if match:
            return match.group(3)
    return None
def extract_lexicon_entry(self, word_id: str):
    """
    Extract entry from a lexicon

    Parameters
    ----------
    word_id : str
        Value of the ``name`` attribute of the anchor marking the entry.

    Returns
    -------
    DictionaryEntry or None
        None when the page is not a Monier-Williams dictionary page or does
        not contain the anchor.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if "Monier-Williams Sanskrit-English" not in self.title.text:
        self.logger.error("Invalid dictionary page.")
        return None

    marker = self.soup.find("a", attrs={"name": word_id})
    if marker is None:
        self.logger.error("Dictionary entry with id '%s' not found.", word_id)
        return None

    # Prefer the enclosing <span>, then the direct parent, and finally the
    # anchor itself, so that a container is always available
    container = marker.find_parent("span")
    if container is None:
        container = marker.find_parent()
    if container is None:
        container = marker

    lemma = marker.get_text(strip=True)
    if not lemma:
        italic = container.find("i")
        lemma = italic.get_text(strip=True) if italic else word_id

    return DictionaryEntry(
        lemma=lemma,
        html=str(container),
        text=container.get_text(" ", strip=True),
    )
def extract_search_results(self, structured: bool = True):
    """
    Extract dictionary search results.

    Parameters
    ----------
    structured : bool
        If True, return ``SearchResult`` objects, otherwise dictionaries
        with the keys ``entry``, ``link`` and ``summary``.
        The default is True.

    Returns
    -------
    list or None
        One item per table row that has a non-empty first cell, or None
        when the page contains no table.
    """
    result_table = self.soup.find("table")
    if result_table is None:
        self.logger.error("Could not locate results table.")
        return None

    results = []
    for row in result_table.find_all("tr"):
        cols = row.find_all(["td", "th"])
        if not cols:
            continue

        link_tag = cols[0].find("a")
        source = link_tag if link_tag is not None else cols[0]
        entry = source.get_text(strip=True)
        if not entry:
            continue

        # An anchor without `href` yields None instead of raising KeyError
        link = link_tag.get("href") if link_tag is not None else None
        summary = " ".join(
            part
            for part in (col.get_text(" ", strip=True) for col in cols[1:])
            if part
        )

        if structured:
            results.append(
                SearchResult(entry=entry, link=link, summary=summary)
            )
        else:
            results.append({"entry": entry, "link": link, "summary": summary})

    return results
@staticmethod
def parse_analysis(table: bs4.element.Tag, structured: bool = False):
    """
    Parse analysis of a single word

    Analysis Format is:
        [root]{analysis_1 | analysis_2 | ..}

    Parameters
    ----------
    table : bs4.element.Tag
        Valid `table` element
    structured : bool
        If True, return ``AnalysisCandidate`` objects, otherwise
        dictionaries with the keys ``lexicon``, ``root`` and ``analyses``.
        The default is False.

    Returns
    -------
    analyses : list
        One item per table row that matches the analysis format. Rows that
        cannot be parsed are logged at debug level and skipped.
    """
    pattern = re.compile(r"\[(.*?)\]\{([^\}]*)\}", flags=re.DOTALL)
    logger = logging.getLogger(__name__)

    analyses = []
    for row in table.find_all("tr"):
        # Lexicon reference: ".../<file_name>#<word_id>"; each part is None
        # when the link does not provide it
        file_name, word_id = None, None
        link = row.find("a")
        if link is not None:
            target = link.get("href", "").split("/")[-1]
            name, _, anchor = target.partition("#")
            file_name, word_id = name or None, anchor or None

        row_text = row.get_text().strip()
        match = pattern.match(row_text)
        root_tokens = match.group(1).split() if match else []
        if not root_tokens:
            logger.debug("Unable to parse analysis row: %s", row_text)
            continue

        root = root_tokens[0]
        parsed_analyses = [
            [abbrev.replace(".", "") for abbrev in analysis.split()]
            for analysis in match.group(2).split("|")
        ]

        if structured:
            analyses.append(
                AnalysisCandidate(
                    root=root,
                    analyses=parsed_analyses,
                    lexicon_reference=(file_name, word_id),
                )
            )
        else:
            analyses.append(
                {
                    "lexicon": (file_name, word_id),
                    "root": root,
                    "analyses": parsed_analyses,
                }
            )

    return analyses
def __repr__(self):
    """Return the ``repr`` of the parsed document stored in ``self.soup``."""
    document = self.soup
    return repr(document)
###############################################################################
class HeritagePlatform:
    """
    The Sanskrit Heritage Platform

    Access various utilities from The Sanskrit Heritage Platform
    """

    INRIA_URL = "https://sanskrit.inria.fr/cgi-bin/SKT/"

    # Name of the shell script and of the CGI script for every action
    ACTIONS = {
        "reader": {"shell": "reader", "web": "sktreader.cgi"},
        "parser": {"shell": "parser", "web": "sktparser.cgi"},
        "search": {"shell": "indexer", "web": "sktindex.cgi"},
        "search_easy": {"shell": "indexerd", "web": "sktsearch.cgi"},
        "declension": {"shell": "declension", "web": "sktdeclin.cgi"},
        "conjugation": {"shell": "conjugation", "web": "sktconjug.cgi"},
        "lemma": {"shell": "lemmatizer", "web": "sktlemmatizer.cgi"},
        "sandhi": {"shell": "sandhier", "web": "sktsandhier.cgi"},
        "user": {"shell": "user_aid", "web": "sktuser.cgi"},
        "interface": {"shell": "interface", "web": "sktgraph.cgi"},
        "dictionary": {"shell": "../MW/", "web": "../../MW/"},
    }

    # Configurable options with their allowed values and defaults
    OPTIONS = {
        "lex": {
            "description": "Lexicon",
            "values": {
                "MW": "Monier-Williams Dictionary (English)",
                "SH": "Sanskrit Heritage Dictionary (French)",
            },
            "default": "MW",
        },
        "font": {
            "description": "Font for Sanskrit output",
            "values": {"deva": "Devanagari", "roma": "Roman (IAST)"},
            "default": "deva",
        },
        "t": {
            "description": "Internal Transliteration Scheme",
            "values": {"VH": "Velthuis"},
            "default": "VH",
        },
    }

    METHODS = ["shell", "web"]
    DEFAULT_METHOD = "shell"
def __init__(
    self,
    base_dir: str = "",
    base_url: Optional[str] = None,
    method: str = "shell",
    **kwargs,
):
    """
    Initialize Heritage Class

    Parameters
    ----------
    base_dir : str
        Path to the Heritage_Platform repository.
        The directory should contain 'ML' sub-directory,
        which further contains the scripts
    base_url : str, optional
        URL for the Heritage Platform Mirror.
        If None, the official INRIA website will be used.
        The default is None.
    method : str, optional
        Method used to obtain results.
        Results can be obtained either using the web installation
        or using UNIX shell.
        Possible values are, 'shell' and 'web'
        The default is 'shell'.
    **kwargs :
        Additional configuration keywords. Supported values are:

        * ``request_timeout`` (int): timeout for HTTP requests in seconds.
        * ``request_attempts`` (int): number of HTTP retries before
          giving up.

        Any other keyword is ignored with a warning.
    """
    self.logger = logging.getLogger(__name__)
    self.base_url = self.INRIA_URL if base_url is None else base_url
    self.base_dir = base_dir
    self.scripts_dir = os.path.join(self.base_dir, "ML")
    self.request_timeout = kwargs.pop(
        "request_timeout", DEFAULT_REQUEST_TIMEOUT
    )
    self.request_attempts = kwargs.pop(
        "request_attempts", DEFAULT_REQUEST_ATTEMPTS
    )
    if kwargs:
        self.logger.warning(
            "Ignoring unsupported keyword arguments: %s",
            ", ".join(sorted(kwargs)),
        )

    self.method = None
    self.set_method(method)

    if not self.valid_installation():
        # Warn only if the caller actually wanted the local installation
        if self.method == "shell":
            self.logger.warning(
                "Heritage Platform installation not found. "
                "Falling back to `method=\"web\"`."
            )
        self.base_dir = ""
        self.scripts_dir = ""
        self.set_method("web")

    self.options = {
        option: spec["default"] for option, spec in self.OPTIONS.items()
    }
########################################################################### # Utilities (Actions)
def get_analysis(
    self,
    input_text: str,
    sentence: bool = True,
    unsandhied: bool = False,
    meta: bool = False,
    structured: bool = True,
):
    """
    Obtain morphological analyses using The Sanskrit Reader Companion

    Parameters
    ----------
    input_text : str
        Input text to analyse
    sentence : bool, optional
        The input is treated as a sentence, if true, otherwise as a word.
        The default is True.
    unsandhied : bool, optional
        If True, the input text is assumed to not contain sandhi.
        The default is False.
    meta : bool, optional
        The option is passed to HeritageOutput.extract_analysis().
        The default is False.
    structured : bool, optional
        Return dataclass objects if True, otherwise legacy dictionaries.
        The default is True.

    Returns
    -------
    dict[int, SolutionAnalysis] | dict | None
        Dictionary of valid morphological analyses with solution_id as
        keys, or None when no result could be obtained.
    """
    options = {
        "lex": self.get_lexicon(),
        "cache": "t",  # Use Cache (t)rue, (f)alse
        "st": "t" if sentence else "f",  # Sentence (t)rue, Word (f)alse
        "us": "t" if unsandhied else "f",  # Unsandhied (t)rue, (f)alse
        # if 'us' is 'f', "ca eva" is parsed as "ca_eva",
        # "tathā eva" as "tathā_eva" etc.
        "cp": "t",  # Full Parser Strength (t)rue, (f)alse
        "t": self.get_option("t"),
        "mode": "p",  # Parse Mode (p)arsing, (t)agging
        # Tagging does not prune any solutions
        "font": self.get_font(),  # Output Font (deva)nagari (roma)n
        "topic": "",
        "corpmode": "",
        "corpdir": "",
        "sentno": "",
        "text": self.prepare_input(input_text),
    }

    result = self.get_result("reader", options)
    if result is None:
        return None

    output = HeritageOutput(result)
    return output.extract_analysis(meta=meta, structured=structured)
# ----------------------------------------------------------------------- #
def get_parse(
    self,
    input_text: str,
    solution_id: Optional[int] = None,
    sentence: bool = True,
    unsandhied: bool = False,
):
    """
    Obtain parse of a sentence using The Sanskrit Reader Companion

    Parameters
    ----------
    input_text : str
        Input text to analyse
    solution_id : int, optional
        Solution ID to parse.
        If None, the first solution ID is used.
        The default is None.
    sentence : bool, optional
        The input is treated as a sentence, if true, otherwise as a word.
        The option is passed to HeritagePlatform.get_analysis().
        The default is True.
    unsandhied : bool, optional
        If True, the input text is assumed to not contain sandhi.
        The option is passed to HeritagePlatform.get_analysis().
        The default is False.

    Returns
    -------
    SolutionAnalysis | dict | None
        The selected solution with its ``roles`` filled in. None when the
        analysis yields no solutions, the requested solution does not exist
        or the parser result cannot be obtained.
    """
    solutions = self.get_analysis(
        input_text,
        sentence=sentence,
        unsandhied=unsandhied,
        meta=True,
        structured=True,
    )
    if not solutions:
        return None

    if solution_id is None:
        # If solution ID not provided, use the first solution
        solution_id = next(iter(solutions))
    elif solution_id not in solutions:
        self.logger.error("Solution with id '%s' not found.", solution_id)
        return None

    solution = solutions[solution_id]
    is_structured = isinstance(solution, SolutionAnalysis)

    # No need to manually build the options again: the parser is a re-run
    # of the reader for one solution, using the options found in its link
    parser_options = (
        solution.parser_options
        if is_structured
        else solution["parser_options"]
    )
    result = self.get_result("parser", dict(parser_options or {}))
    if result is None:
        return None

    roles = HeritageOutput(result).extract_parse(structured=is_structured)
    if is_structured:
        solution.roles = roles
    else:
        solution["roles"] = roles
    return solution
# ----------------------------------------------------------------------- #
def sandhi(self, word_1: str, word_2: str, mode: str = "internal"):
    """
    Join two words by forming a Sandhi

    Parameters
    ----------
    word_1 : str
        The first (left) word in the Sandhi
    word_2 : str
        The second (right) word in the Sandhi
    mode : str, optional
        Indicates whether the words join to form a single word or not
        Possible values are,
        * internal
        * external
        The default is 'internal'.
        Any other value is logged as a warning and passed on unchanged.

    Returns
    -------
    sandhi : str
        String obtained by forming the Sandhi, or None when no result
        could be obtained.
    """
    if mode not in ("internal", "external"):
        self.logger.warning(f"Invalid mode: '{mode}'")

    options = {
        "lex": self.get_lexicon(),
        "l": self.prepare_input(word_1),
        "r": self.prepare_input(word_2),
        "t": self.get_option("t"),
        "k": mode,
        "font": self.get_font(),
    }

    result = self.get_result("sandhi", options)
    if result is None:
        return None

    return HeritageOutput(result).extract_sandhi()
# ----------------------------------------------------------------------- #
def search_inflected_form(self, word: str, category: str):
    """
    Search an inflected form

    Parameters
    ----------
    word : str
        Sanskrit Word to search (in Devanagari)
    category : str
        Type of the word
        * Noun: Noun
        * Pron: Pronoun
        * Part: Participle
        * Inde: Indeclinable
        * Absya, Abstvaa, Voca, Iic, Ifc, Iiv, Piic etc.

    Returns
    -------
    HeritageOutput or None
        Parsed page returned by the lemmatizer (the matches are not yet
        extracted from it), or None when no result could be obtained.
    """
    options = {
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "c": category,
        "font": self.get_font(),
    }

    result = self.get_result("lemma", options)
    if result is None:
        return None

    # TODO: Output Parsing
    return HeritageOutput(result)
# ----------------------------------------------------------------------- #
def get_declensions(
    self,
    word: str,
    gender: str,
    headers: bool = True,
    lexicon: str = None,
    structured: bool = True,
):
    """
    Retrieve declension tables from the Grammarian.

    Parameters
    ----------
    word : str
        Input word in Devanagari.
    gender : str
        Gender hint. Accepted values include short forms (``m``, ``f``,
        ``n``) and Sanskrit labels (e.g. ``पु``, ``स्त्री``).
    headers : bool, optional
        If ``True``, include header row information. The default is True.
    lexicon : str, optional
        Reserved for future use. Currently ignored.
    structured : bool, optional
        When ``True`` (the default), returns a
        :class:`heritage.models.DeclensionTable` instance. When ``False``,
        returns the raw nested list produced by
        :meth:`HeritageOutput.extract_declensions`.

    Returns
    -------
    DeclensionTable | list | None
        Structured table, legacy list-of-lists, or ``None`` when no table
        can be extracted.
    """
    options = {
        "lex": self.get_lexicon(),
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "g": self.identify_gender(gender),
        "font": self.get_font(),
    }

    result = self.get_result("declension", options)
    if result is None:
        return None

    return HeritageOutput(result).extract_declensions(
        headers=headers, structured=structured
    )
# ----------------------------------------------------------------------- #
def get_conjugations(
    self,
    word: str,
    gana: str,
    lexicon: str = None,
    headers: bool = True,
    structured: bool = True,
):
    """
    Retrieve conjugation paradigms from the Grammarian.

    Parameters
    ----------
    word : str
        Verbal root in Devanagari.
    gana : str
        Verbal class (gaṇa) identifier expected by the backend.
    lexicon : str, optional
        Reserved for future use. Currently ignored.
    headers : bool, optional
        If ``True``, treat the first row of each table as a heading.
    structured : bool, optional
        When ``True`` (the default), returns a list of
        :class:`heritage.models.ConjugationTable` objects. When ``False``,
        returns the legacy dictionary-of-tables output.

    Returns
    -------
    list[ConjugationTable] | dict | None
        Structured tables, legacy mapping, or ``None`` on failure.
    """
    options = {
        "lex": self.get_lexicon(),
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "c": gana,
        "font": self.get_font(),
    }

    result = self.get_result("conjugation", options)
    if result is None:
        return None

    return HeritageOutput(result).extract_conjugations(
        headers=headers, structured=structured
    )
# ----------------------------------------------------------------------- #
def search_lexicon(
    self, word: str, lexicon: str = None, structured: bool = True
):
    """
    Search a word in the dictionary.

    Parameters
    ----------
    word : str
        Sanskrit Word to search (in Devanagari)
    lexicon : str, optional
        Currently ignored: the search always uses the lexicon returned by
        ``get_lexicon()``.
    structured : bool, optional
        When ``True`` (the default), returns ``SearchResult`` objects,
        otherwise legacy dictionaries.

    Returns
    -------
    list[SearchResult] | list[dict] | None
        Parsed search results (the default), legacy dictionaries when
        ``structured`` is False, or ``None`` when the backend response
        cannot be obtained or parsed.
    """
    options = {
        "lex": self.get_lexicon(),
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "font": self.get_font(),
    }

    result = self.get_result("search", options)
    if result is None:
        return None

    # TODO: Currently not using the lexicon keyword argument
    # Is there any use for that argument? For this function?
    return HeritageOutput(result).extract_search_results(
        structured=structured
    )
###########################################################################
def get_lexicon_entry(self, file_name: str, word_id: str):
    """
    Fetch a single dictionary entry by its file and anchor identifier.

    Successfully parsed entries are cached per instance. Failures are never
    cached, so that a later call can retry.

    Parameters
    ----------
    file_name : str
        Name of the HTML file containing the entry.
    word_id : str
        Anchor identifier within the dictionary page.

    Returns
    -------
    DictionaryEntry | None
        Parsed entry when available, otherwise ``None``.
    """
    # Per-instance cache: a cache decorator on the method would keep every
    # instance alive and would also remember failed lookups
    cache = vars(self).setdefault("_lexicon_entry_cache", {})
    key = (file_name, word_id)
    if key in cache:
        return cache[key]

    if self.method == "shell":
        file_path = os.path.join(self.get_path("dictionary"), file_name)
        try:
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
        except OSError as exc:
            self.logger.error(
                "Unable to read dictionary file '%s': %s", file_path, exc
            )
            return None
    elif self.method == "web":
        # The '#word_id' fragment is never sent to the server; leaving it
        # out lets all entries of one page share a single fetch
        query_url = f"{self.get_url('dictionary')}{file_name}"
        content = self.__get(
            query_url, self.request_attempts, self.request_timeout
        )
    else:
        self.logger.error(f"Invalid method: '{self.method}'.")
        return None

    if content is None:
        return None

    entry = HeritageOutput(content).extract_lexicon_entry(word_id)
    if entry is not None:
        cache[key] = entry
    return entry
########################################################################### # Fetch Result through Web or Shell
def get_result_from_web(
    self,
    url: str,
    options: dict,
    attempts: int = None,
    timeout: int = None,
):
    """
    Get results from the Heritage Platform web mirror

    Exponential backoff is used in case there are network errors

    Parameters
    ----------
    url : str
        URL of the CGI script to call
        HeritagePlatform.get_url() can be used to generate supported URLs
    options : dict
        Dictionary containing valid options for the script
    attempts : int, optional
        Number of attempts for the exponential backoff
        The default is `self.request_attempts`.
    timeout : int, optional
        Timeout for the HTTP request in seconds.
        The default is `self.request_timeout`.

    Returns
    -------
    str
        Result (HTML) obtained. Returns ``None`` when every attempt fails.
    """
    # Falsy values (None, 0) fall back to the instance-wide settings
    attempts = attempts or self.request_attempts
    timeout = timeout or self.request_timeout

    query_url = f"{url}?{build_query_string(options)}"
    return self.__get(query_url, attempts, timeout)
def __get(self, query_url: str, attempts: int, timeout: int):
    """
    Query web with exponential-backoff

    Successful responses are cached per instance, keyed by URL. A failed
    query is not cached, so that the same URL can be retried later.

    Parameters
    ----------
    query_url : str
        URL to query
    attempts : int
        Number of attempts for the exponential backoff
    timeout : int
        Timeout for the HTTP request in seconds.

    Returns
    -------
    str
        Result (HTML) obtained, or None when every attempt fails.
    """
    # Per-instance cache: a cache decorator on the method would keep every
    # instance alive and would also remember failures (None)
    cache = vars(self).setdefault("_web_cache", {})
    if query_url in cache:
        return cache[query_url]

    result = self._query_with_backoff(query_url, attempts, timeout)
    if result is not None:
        cache[query_url] = result
    return result
def _query_with_backoff(self, query_url: str, attempts: int, timeout: int):
    """
    Fetch a URL with exponential backoff and robust decoding.

    Parameters
    ----------
    query_url : str
        URL to query
    attempts : int
        Number of attempts; values below 1 are treated as 1.
    timeout : int
        Timeout for every HTTP request in seconds.

    Returns
    -------
    str or None
        Decoded response text on success, otherwise ``None``.
    """
    attempts = max(1, attempts)
    # Outcome of the most recent attempt only, so that the final error
    # message never reports a stale exception or status code
    last_error = None
    last_status = None

    for attempt in range(attempts):
        try:
            response = requests.get(query_url, timeout=timeout)
        except requests.RequestException as exc:
            last_error, last_status = exc, None
            self.logger.warning(
                "Attempt %s/%s failed for %s: %s",
                attempt + 1,
                attempts,
                query_url,
                exc,
            )
        else:
            if response.status_code == requests.codes.ok:
                return self._response_text(response)
            last_error, last_status = None, response.status_code
            self.logger.warning(
                "Status code %s on attempt %s/%s for %s",
                response.status_code,
                attempt + 1,
                attempts,
                query_url,
            )

        # Wait 1s, 2s, 4s, ... plus jitter before the next attempt
        if attempt < attempts - 1:
            time.sleep((2 ** attempt) + random.random())

    if last_error is not None:
        self.logger.error(
            "Unable to fetch %s after %s attempts due to network errors.",
            query_url,
            attempts,
            exc_info=last_error,
        )
    else:
        self.logger.error(
            "Unable to fetch %s after %s attempts. Last status: %s",
            query_url,
            attempts,
            last_status,
        )
    return None
@staticmethod
def _response_text(response: requests.Response) -> str:
    """
    Return the decoded response body, avoiding mojibake.

    When the response declares no encoding, or one of the Latin-1 fallback
    encodings, the detected (apparent) encoding is used instead. If the
    chosen encoding is unknown or cannot decode the body, the body is
    decoded as UTF-8 with undecodable bytes replaced.
    """
    encoding = response.encoding
    if not encoding or encoding.lower() in _LATIN_FALLBACK_ENCODINGS:
        encoding = response.apparent_encoding or "utf-8"

    try:
        return response.content.decode(encoding or "utf-8")
    except (UnicodeDecodeError, LookupError):
        # LookupError: the server announced a charset Python does not know
        return response.content.decode("utf-8", errors="replace")
# ----------------------------------------------------------------------- #
def get_result_from_shell(self, path: str, options: dict, timeout: int = 30):
    """
    Get results from the Heritage Platform's local installation via shell

    Parameters
    ----------
    path : str
        Path to the executable script
        HeritagePlatform.get_path() can be used to generate supported paths
    options : dict
        Valid options for the script
    timeout : int, optional
        Timeout in seconds, after which the function will abort.
        The default is 30.

    Returns
    -------
    result : str
        Result (HTML) obtained
    """
    # The script reads its options from the QUERY_STRING variable
    env = os.environ.copy()
    env["QUERY_STRING"] = build_query_string(options)

    # Frozen (hashable) copy, so that the call can be cached
    environment = frozendict(env)
    return self.__run(path, environment, timeout=timeout)
@functools.lru_cache(maxsize=None)
def __run(self, path, environment: dict, timeout: int = 30):
    """
    Get results from shell through a subprocess call

    Results are memoized by ``functools.lru_cache``; every argument
    (including ``environment``) must therefore be hashable.

    Parameters
    ----------
    path : str
        Path to the executable script
    environment : dict
        Environment variables to set
    timeout : int, optional
        Timeout in seconds, after which the function will abort.
        The default is 30.

    Returns
    -------
    result : str or None
        Result (HTML) obtained, with the leading CGI header removed when
        present. ``None`` if the script timed out or could not be run.
    """
    # NOTE(review): failures return None and are memoized like any other
    # result, so a transient timeout keeps being returned for identical
    # calls until the cache is cleared.
    result_header = "Content-Type: text/html\n\n"
    try:
        result = subprocess.check_output(
            path, env=environment, timeout=timeout
        ).decode("utf-8")
    except subprocess.TimeoutExpired:
        self.logger.error("Timeout while executing '%s'.", path)
        return None
    except subprocess.SubprocessError as exc:
        self.logger.error(
            "Subprocess error while executing '%s': %s", path, exc
        )
        return None
    except OSError as exc:
        self.logger.error("OS error while executing '%s': %s", path, exc)
        return None

    # Strip the header only when it is really there; slicing blindly by
    # its length would cut off real output whenever the script prints no
    # header or a differently formatted one.
    if result.startswith(result_header):
        result = result[len(result_header):]
    return result

# ----------------------------------------------------------------------- #
def get_result(self, action: str, options: dict, *args, **kwargs):
    """
    Fetch the result of an action through the configured method

    Resolves the action to a script path (method "shell") or a URL
    (method "web") and delegates to the matching fetcher, forwarding any
    extra positional and keyword arguments.

    Parameters
    ----------
    action : str
        Action value corresponding to the utility to be used.
        Refer to HeritagePlatform.ACTIONS
    options : dict
        Valid options for the specified action

    Returns
    -------
    str
        Result (HTML) obtained.
        ``None`` if the configured method is neither "shell" nor "web".
    """
    fetch_method = self.method

    if fetch_method == "shell":
        script_path = self.get_path(action)
        return self.get_result_from_shell(
            script_path, options, *args, **kwargs
        )

    if fetch_method == "web":
        action_url = self.get_url(action)
        return self.get_result_from_web(
            action_url, options, *args, **kwargs
        )

    self.logger.error(f"Invalid method: '{fetch_method}'.")
    return None
###########################################################################
def get_method(self):
    """Return the method currently used for fetching the output"""
    current_method = self.method
    return current_method
def set_method(self, method: str):
    """
    Choose how the output is fetched

    The name is matched case-insensitively against
    HeritagePlatform.METHODS and stored in lowercase.

    An invalid name is logged as a warning and leaves the current method
    untouched, unless no method has been set yet (``None``), in which case
    DEFAULT_METHOD is applied.

    Returns
    -------
    bool
        True if the method was valid and has been set, False otherwise
    """
    normalized = method.lower()

    if normalized not in self.METHODS:
        self.logger.warning(f"Invalid method: '{method}'")
        if self.method is None:
            self.method = self.DEFAULT_METHOD
        return False

    self.method = normalized
    return True
# ----------------------------------------------------------------------- #
def get_option(self, opt_name: str):
    """
    Get the value of a global option

    The name is looked up as given first; if it is unknown, its lowercase
    form is tried, so that any spelling accepted by set_option() (which
    lowercases the name) can also be read back.

    Parameters
    ----------
    opt_name : str
        Name of the option, one of the keys of OPTIONS

    Returns
    -------
    Current value of the option.
    ``None`` if the option name is invalid (a warning is logged) or if
    no value is stored for it.
    """
    requested_name = opt_name
    if opt_name not in self.OPTIONS:
        opt_name = opt_name.lower()

    if opt_name not in self.OPTIONS:
        self.logger.warning(f"Invalid option: '{requested_name}'")
        return None

    return self.options.get(opt_name, None)
def set_option(self, opt_name: str, opt_value: str):
    """Set a global option

    Global options are used directly in the QUERY_STRING whenever a
    utility from the Heritage Platform expects them.

    The class variable OPTIONS describes every option, with
        - a 'description' of the option
        - 'values' it can take (and descriptions of those values)
        - 'default' value

    The option name is lowercased before validation; the value must be
    one of the option's 'values'.

    Returns
    -------
    bool
        True if the option was set, False (with a logged warning) if
        either the name or the value is invalid
    """
    key = opt_name.lower()

    if key not in self.OPTIONS:
        self.logger.warning(f"Invalid option: '{key}'")
        return False

    if opt_value not in self.OPTIONS[key]["values"]:
        self.logger.warning(
            f"Invalid value for option '{key}': '{opt_value}'"
        )
        return False

    self.options[key] = opt_value
    return True
# ----------------------------------------------------------------------- #
def get_font(self):
    """Return the value of the "font" option (font for Sanskrit output)"""
    current_font = self.get_option("font")
    return current_font
def set_font(self, font: str):
    """
    Set the "font" option (font for Sanskrit output)

    The value is lowercased before being passed to set_option(), whose
    result is returned.
    """
    font_value = font.lower()
    return self.set_option("font", font_value)
# ----------------------------------------------------------------------- #
def get_lexicon(self):
    """Return the value of the "lex" option (current lexicon)"""
    current_lexicon = self.get_option("lex")
    return current_lexicon
def set_lexicon(self, lexicon: str):
    """
    Set the "lex" option (lexicon)

    The value is uppercased before being passed to set_option(), whose
    result is returned.
    """
    lexicon_value = lexicon.upper()
    return self.set_option("lex", lexicon_value)
########################################################################### # URL or Path Builders
def get_url(self, action: str):
    """
    URL Builder

    Joins the base URL with the "web" entry of the given action.
    Raises ``KeyError`` if the action is not present in ACTIONS.
    """
    base = self.base_url
    endpoint = self.ACTIONS[action]["web"]
    return urllib.parse.urljoin(base, endpoint)
def get_path(self, action: str):
    """
    Path Builder

    Joins the scripts directory with the "shell" entry of the given action.
    Raises ``KeyError`` if the action is not present in ACTIONS.
    """
    directory = self.scripts_dir
    script_name = self.ACTIONS[action]["shell"]
    return os.path.join(directory, script_name)
###########################################################################
def valid_installation(self):
    """
    Check if the Heritage Platform installation exists

    Only the existence of the scripts directory is checked.
    """
    # TODO: A better check may be checking for the required executables
    # * If the file exists
    # * If the file is executable
    scripts_location = self.scripts_dir
    return os.path.isdir(scripts_location)
###########################################################################

def __repr__(self):
    """Return ``ClassName(repository="...", url="...", method="...")``"""
    shown_fields = (
        ("repository", self.base_dir),
        ("url", self.base_url),
        ("method", self.method),
    )
    rendered = ", ".join(
        f'{name}="{value}"' for name, value in shown_fields
    )
    return f"{self.__class__.__name__}({rendered})"

###########################################################################
# TODO: Move these to utils.py ??
@staticmethod
def prepare_input(input_text: str):
    """
    Prepare Input

    * Convert the text with devanagari_to_velthuis()
    * Split the converted text on whitespace and join the words by '+'
    """
    velthuis_text = devanagari_to_velthuis(input_text)
    words = velthuis_text.split()
    return "+".join(words)
[docs] @staticmethod def identify_gender(gender: str): genders = { "Mas": ["पु", "m"], "Fem": ["स्त्री", "f"], "Neu": ["नपु", "n"], "Any": ["*", "त्रि", "a"], } for gender_key, gender_list in genders.items(): for g in gender_list: if gender.lower().startswith(g): return gender_key
###############################################################################