Source code for heritage.heritage
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Python Interface to The Sanskrit Heritage Site
Use The Sanskrit Heritage Platform using,
* Web mirror
- no installation required
- makes HTTP requests
* Local installation
- faster
- uses console
- no HTTP requests required
Using Local Installation
------------------------
- Heritage_Platform/ML/ contains the scripts
- export QUERY_STRING as shell variable
(referred to as OPTION_STRING in this code, along with the '&text=TEXT' part)
- execute various scripts, such as ./reader
- still produces HTML output that needs to be parsed
# Default input needs to be in the devanagari format
# utils.devanagari_to_velthuis() function will convert this to VH
"""
###############################################################################
import os
import re
import time
import random
import logging
import functools
import subprocess
import urllib.parse
from dataclasses import dataclass, field
from typing import Dict, Optional
import requests
import bs4
from .constants import HERITAGE_COLOURS
from .models import (
AnalysisCandidate,
ConjugationCell,
ConjugationTable,
DeclensionTable,
DictionaryEntry,
SearchResult,
SolutionAnalysis,
WordAnalysis,
WordRole,
)
from .utils import build_query_string, devanagari_to_velthuis
###############################################################################
###############################################################################
# Default timeout (in seconds) for HTTP requests; used by HeritagePlatform
# when no `request_timeout` keyword is supplied
DEFAULT_REQUEST_TIMEOUT = 10
# Default number of HTTP attempts before giving up; used by HeritagePlatform
# when no `request_attempts` keyword is supplied
DEFAULT_REQUEST_ATTEMPTS = 3
# Declared response encodings that `_response_text` does not trust: when a
# response reports one of these, the detected (apparent) encoding is used
_LATIN_FALLBACK_ENCODINGS = {"iso-8859-1", "latin1", "latin-1"}
###############################################################################
# TODO: Do we need to use python-frozendict (PyPI)?
def freezeargs(func):
    """
    Make dictionary arguments hashable before they reach a cached function

    Every positional or keyword argument that is a ``dict`` is wrapped in
    ``frozendict`` so that the call can be memoised by the ``functools``
    caches.  The decorator should be applied on top of ``@cache``.

    The ``cache_info`` and ``cache_clear`` helpers of the wrapped function,
    when present and callable, are re-exported on the returned wrapper.
    """

    def _freeze(value):
        # Only dictionaries are converted; everything else passes through
        return frozendict(value) if isinstance(value, dict) else value

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        frozen_args = tuple(_freeze(arg) for arg in args)
        frozen_kwargs = {key: _freeze(val) for key, val in kwargs.items()}
        return func(*frozen_args, **frozen_kwargs)

    for helper_name in ("cache_info", "cache_clear"):
        helper = getattr(func, helper_name, None)
        if callable(helper):
            setattr(wrapper, helper_name, helper)

    return wrapper
###############################################################################
@dataclass
class HeritageAnalysis:
    """
    Container for the grammatical features of one analysis

    All four fields are optional and default to ``None``.
    """

    # Each field may legitimately be absent, hence Optional rather than str
    case: Optional[str] = None
    number: Optional[str] = None
    gender: Optional[str] = None
    tense: Optional[str] = None
###############################################################################
class HeritageOutput:
    """
    Heritage Output Parser

    Wraps the HTML generated by the utilities of the Heritage Platform and
    offers ``extract_*`` helpers that pull information out of it.
    """

    # CSS classes used to locate known regions of a page
    CLASSES = {"footer": ["enpied"]}

    def __init__(self, html: str):
        """Parse ``html`` and immediately run :meth:`process` on it"""
        self.logger = logging.getLogger(__name__)
        self.html = html
        parsed = bs4.BeautifulSoup(html, "html.parser")
        self.soup = parsed
        self.process()
def process(self, html: str = None):
    """
    Process the html and extract basic information

    Populates ``title``, ``meta``, ``body``, ``footer``, ``inner_title``
    and ``blocks`` from the parsed document.

    Parameters
    ----------
    html : str, optional
        New HTML to parse.  When given, it replaces the stored ``html`` and
        ``soup`` so that the instance can be re-used.
        The default is None (process the stored document).
    """
    # Allow re-using of the class
    if html is not None:
        self.html = html
        self.soup = bs4.BeautifulSoup(html, "html.parser")

    soup = self.soup

    # Extract Title
    self.title = soup.find("title")

    # Extract Meta Information
    # Collected before looking for <body>, so that `meta` always exists and
    # never carries over from a previously processed document
    self.meta = {}
    for tag in soup.find_all("meta"):
        content = tag.get("content", "")
        for attribute in ("name", "property"):
            key = tag.get(attribute, "")
            if key:
                self.meta[key] = content

    self.body = soup.find("body")
    if self.body is None:
        self.logger.error("No <body> tag found in HTML.")
        self.footer = None
        self.inner_title = None
        self.blocks = []
        return

    self.footer = self.body.find("div", class_=self.CLASSES["footer"])
    self.inner_title = self.body.find("h1", class_="title")

    # Find Relevant Body Children
    self.blocks = self.body.find_all()
def extract_analysis(
    self, meta: bool = False, structured: bool = False
):
    """
    Extract analysis from HTML

    The page is split on ``<hr>``; every block from the third one onwards
    that mentions "Solution" is parsed as one solution.

    Parameters
    ----------
    meta : bool
        If True, include meta information, i.e, parse options, classes
        The default is False.
    structured : bool
        If True, return dataclass-based representations.
        The default is False (legacy dictionaries).

    Returns
    -------
    dict or None
        Solutions keyed by solution id (``SolutionAnalysis`` values when
        ``structured`` is True, dictionaries otherwise).
        ``None`` when the page is not a valid reader output.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Reader Companion":
        self.logger.error("Invalid output page.")
        return None

    hr_blocks = self.html.split("<hr>")
    if len(hr_blocks) < 2:
        self.logger.error("No solutions found.")
        return None

    solutions = {}
    for block in hr_blocks[2:]:
        if "Solution" not in block:
            break

        soup = bs4.BeautifulSoup(block, "html.parser")
        first_span = soup.find("span")
        try:
            # The span text is expected to look like "Solution <id> ..."
            solution_id = int(first_span.text.split()[1])
        except (AttributeError, IndexError, ValueError):
            self.logger.warning("Skipping malformed solution block.")
            continue

        solution = {"id": solution_id, "words": []}

        if meta:
            link = first_span.find("a")
            parser_url = link.get("href", "") if link is not None else ""
            query = parser_url.partition("?")[2]
            parser_options = {}
            # `&amp;` covers hrefs whose entities were not decoded.
            # Values are kept verbatim (no URL-decoding), as they are fed
            # back to the parser as-is.
            for pair in re.split(r"&amp;|&|;", query):
                if not pair:
                    continue
                # Split on the first "=" only: values may contain "="
                key, _, value = pair.partition("=")
                parser_options[key] = value
            solution["parser_options"] = parser_options

        current_text = None
        for table in soup.find_all("table"):
            if table.find("table"):
                # Outer table: the original word precedes it
                prev = table.previous_sibling
                if prev is None:
                    current_text = ""
                elif isinstance(prev, bs4.element.Tag):
                    current_text = prev.get_text()
                else:
                    current_text = str(prev).strip()
                continue

            # Inner table contains analysis and it occurs after
            # the original word
            analyses = self.parse_analysis(table, structured=structured)
            css_classes = table.get("class", [])
            categories = [
                HERITAGE_COLOURS.get(css_class.split("_back")[0], None)
                for css_class in css_classes
            ]

            if structured:
                solution["words"].append(
                    WordAnalysis(
                        text=current_text or "",
                        category=categories,
                        classes=css_classes,
                        candidates=analyses,
                    )
                )
                continue

            word = {"text": current_text or ""}
            if meta:
                word["classes"] = css_classes
                word["category"] = categories
            # One dictionary per candidate analysis of the word
            solution["words"].append(
                [{**word, **analysis} for analysis in analyses]
            )

        solutions[solution_id] = solution

    if not structured:
        return solutions

    return {
        solution_id: SolutionAnalysis(
            id=solution_id,
            words=raw_solution["words"],
            parser_options=(
                raw_solution.get("parser_options") if meta else None
            ),
        )
        for solution_id, raw_solution in solutions.items()
    }
def extract_parse(self, structured: bool = False):
    """
    Extract parse from HTML

    Parameters
    ----------
    structured : bool
        If True, every word is returned as a ``WordRole`` instance,
        otherwise as a ``{"text": ..., "roles": ...}`` dictionary.
        The default is False.

    Returns
    -------
    list or None
        One entry per word, or ``None`` for an invalid page.
    """
    page_title = self.title.text if self.title else None
    if not page_title:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if page_title != "Sanskrit Reader Assistant":
        self.logger.error("Invalid output page.")
        return None

    parsed = []
    for node in self.soup.find_all("table", class_="yellow_back"):
        text = node.get_text().strip()
        row_tables = node.find_parent("tr").find_all("table")
        # Of the tables in the word's row, index 1 holds the analysis,
        # index 2 the word id and index 3 the semantic roles
        role_rows = row_tables[3].find_all("tr")
        roles = [role_row.get_text() for role_row in role_rows]
        if structured:
            entry = WordRole(text=text, roles=roles)
        else:
            entry = {"text": text, "roles": roles}
        parsed.append(entry)

    return parsed
def extract_declensions(
    self, headers: bool = True, structured: bool = False
):
    """
    Extract declension tables from HTML.

    Parameters
    ----------
    headers : bool
        If False, the first row and the first column are dropped.
        The default is True.
    structured : bool
        When True, returns a :class:`DeclensionTable` instance; otherwise
        returns a nested list of header/body cells, where every cell is
        the list of whitespace-separated tokens found in it.
        The default is False.

    Returns
    -------
    DeclensionTable or list or None
        ``None`` when the page is invalid or holds no declension table.
    """
    if not (self.title and self.title.text):
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Grammarian Declension Engine":
        self.logger.error("Invalid output page.")
        return None

    table = self.soup.find("table", class_="inflexion")
    if table is None:
        self.logger.error("Declension table not found in HTML.")
        return None

    grid = [
        [cell.get_text(" ").split() for cell in row.find_all("th")]
        for row in table.find_all("tr")
    ]
    # The third row is moved to the very end
    # NOTE(review): presumably this is the vocative row -- confirm
    grid = [*grid[:2], *grid[3:], grid[2]]

    if not headers:
        grid = [row[1:] for row in grid[1:]]

    if not structured:
        return grid

    flat = [[" ".join(cell).strip() for cell in row] for row in grid]
    if headers:
        return DeclensionTable(headers=flat[0], rows=flat[1:])
    return DeclensionTable(headers=[], rows=flat)
def extract_conjugations(
    self, headers: bool = True, structured: bool = False
):
    """
    Extract conjugation tables from HTML.

    Parameters
    ----------
    headers : bool
        Only used when ``structured`` is True: the first row of every
        inner table is then left out of the cell rows.
        The default is True.
    structured : bool
        When True, returns a list of :class:`ConjugationTable` objects;
        otherwise a nested dictionary keyed by table headings.
        The default is False.

    Returns
    -------
    list or dict or None
        ``None`` when the page is not a valid conjugation page.
    """
    if not (self.title and self.title.text):
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Grammarian Conjugation Engine":
        self.logger.error("Invalid output page.")
        return None

    forms = [] if structured else {}
    for table in self.soup.find_all("table", class_="gris_cent"):
        title = table.find("span").get_text()
        if not structured:
            forms[title] = {}

        cells = []
        for paradigm in table.find_all("table", class_="inflexion"):
            # Every cell becomes the list of its whitespace-separated tokens
            grid = [
                [col.get_text(" ").split() for col in row.find_all("th")]
                for row in paradigm.find_all("tr")
            ]
            if not structured:
                # Keyed by the first token of the top-left cell
                forms[title][grid[0][0][0]] = grid
                continue

            heading = " ".join(grid[0][0]).strip()
            body = grid[1:] if headers else grid
            cells.append(
                ConjugationCell(
                    heading=heading,
                    rows=[
                        [" ".join(cell).strip() for cell in row]
                        for row in body
                    ],
                )
            )

        if structured:
            forms.append(ConjugationTable(title=title, cells=cells))

    return forms
def extract_sandhi(self):
    """
    Extract Sandhi from HTML

    The result is read from the first ``<span>`` whose text has the form
    ``left | right = result``.

    Returns
    -------
    str or None
        The ``result`` part, or ``None`` when the page is invalid, has no
        ``<body>`` or contains no such span.
    """
    if not self.title or not self.title.text:
        self.logger.error("Missing or empty <title> tag.")
        return None

    if self.title.text != "Sanskrit Sandhi Engine":
        self.logger.error("Invalid output page.")
        return None

    # process() leaves `body` as None when the page has no <body> tag
    if self.body is None:
        self.logger.error("No <body> tag found in HTML.")
        return None

    pattern = re.compile(
        r"\s*([^\s\|]*)\s*\|\s*([^\s=]*)\s*=\s*([^\s]*)\s*", flags=re.DOTALL
    )
    for span in self.body.find_all("span"):
        match = pattern.match(span.get_text(" "))
        if match:
            return match.group(3)

    self.logger.error("Sandhi result not found in HTML.")
    return None
def extract_lexicon_entry(self, word_id: str):
    """
    Extract entry from a lexicon

    Parameters
    ----------
    word_id : str
        Value of the ``name`` attribute of the anchor marking the entry.

    Returns
    -------
    DictionaryEntry or None
        The entry built from the anchor's enclosing ``<span>`` (or, when
        there is none, its direct parent).  ``None`` when the page is not
        a Monier-Williams page or the anchor does not exist.
    """
    if not (self.title and self.title.text):
        self.logger.error("Missing or empty <title> tag.")
        return None

    if "Monier-Williams Sanskrit-English" not in self.title.text:
        self.logger.error("Invalid dictionary page.")
        return None

    anchor = self.soup.find("a", attrs={"name": word_id})
    if anchor is None:
        self.logger.error(
            "Dictionary entry with id '%s' not found.", word_id
        )
        return None

    direct_parent = anchor.find_parent()
    enclosing_span = anchor.find_parent("span")
    container = direct_parent if enclosing_span is None else enclosing_span

    # Lemma: anchor text, else the first italic text, else the id itself
    lemma = anchor.get_text(strip=True)
    if not lemma:
        italic = container.find("i")
        lemma = word_id if not italic else italic.get_text(strip=True)

    return DictionaryEntry(
        lemma=lemma,
        html=str(container),
        text=container.get_text(" ", strip=True),
    )
def extract_search_results(self, structured: bool = True):
    """
    Extract dictionary search results.

    Results are read from the rows of the first ``<table>`` of the page:
    the first cell gives the entry (and its link, if any) and the
    remaining cells are joined into the summary.  Rows without cells or
    with an empty entry are skipped.

    Parameters
    ----------
    structured : bool
        If True, return ``SearchResult`` instances, otherwise dictionaries
        with the keys ``entry``, ``link`` and ``summary``.
        The default is True.

    Returns
    -------
    list or None
        ``None`` when the page contains no table.
    """
    result_table = self.soup.find("table")
    if result_table is None:
        self.logger.error("Could not locate results table.")
        return None

    results = []
    for row in result_table.find_all("tr"):
        cols = row.find_all(["td", "th"])
        if not cols:
            continue

        first_col = cols[0]
        link_tag = first_col.find("a")
        source = first_col if link_tag is None else link_tag
        entry = source.get_text(strip=True)
        if not entry:
            continue

        # Anchors are not guaranteed to carry `href` (e.g. named anchors)
        link = None if link_tag is None else link_tag.get("href")

        summary = " ".join(
            part
            for part in (col.get_text(" ", strip=True) for col in cols[1:])
            if part
        )

        if structured:
            results.append(
                SearchResult(entry=entry, link=link, summary=summary)
            )
        else:
            results.append(
                {"entry": entry, "link": link, "summary": summary}
            )

    return results
@staticmethod
def parse_analysis(
    table: bs4.element.Tag, structured: bool = False
):
    """
    Parse analysis of a single word

    Analysis Format is: [root]{analysis_1 | analysis_2 | ..}

    Rows whose text does not follow this format are skipped (and reported
    at debug level).

    Parameters
    ----------
    table : bs4.element.Tag
        Valid `table` element
    structured : bool
        If True, return ``AnalysisCandidate`` instances, otherwise
        dictionaries with the keys ``lexicon``, ``root`` and ``analyses``.
        The default is False.

    Returns
    -------
    analyses : list
        One item per parsed row of the table.
    """
    pattern = re.compile(r"\[(.*?)\]\{([^\}]*)\}", flags=re.DOTALL)
    logger = logging.getLogger(__name__)

    analyses = []
    for row in table.find_all("tr"):
        # Lexicon reference: "<file name>#<word id>" at the end of the link
        link = row.find("a")
        href = link.get("href") if link is not None else None
        if href:
            file_name, _, fragment = href.split("/")[-1].partition("#")
            # A link without "#fragment" carries no word id
            word_id = fragment or None
        else:
            file_name, word_id = None, None

        row_text = row.get_text().strip()
        match = pattern.match(row_text)
        if match is None:
            logger.debug("Unable to parse analysis row: %s", row_text)
            continue

        # The root is the first token inside the square brackets
        root_tokens = match.group(1).split()
        root = root_tokens[0] if root_tokens else ""

        # Alternatives are separated by "|"; dots of abbreviations dropped
        parsed_analyses = [
            [abbrev.replace(".", "") for abbrev in an.split()]
            for an in match.group(2).split("|")
        ]

        if structured:
            analyses.append(
                AnalysisCandidate(
                    root=root,
                    analyses=parsed_analyses,
                    lexicon_reference=(file_name, word_id),
                )
            )
        else:
            analyses.append(
                {
                    "lexicon": (file_name, word_id),
                    "root": root,
                    "analyses": parsed_analyses,
                }
            )

    return analyses
def __repr__(self):
    """Represent the output by the ``repr`` of its parsed document"""
    document = self.soup
    return repr(document)
###############################################################################
class HeritagePlatform:
    """
    The Sanskrit Heritage Platform

    Access various utilities from The Sanskrit Heritage Platform, either
    through a local installation (``shell``) or a web mirror (``web``).
    """

    # Official mirror, used when no `base_url` is supplied
    INRIA_URL = "https://sanskrit.inria.fr/cgi-bin/SKT/"

    # action -> name of the local script (shell) / remote script (web)
    ACTIONS = {
        "reader": {"shell": "reader", "web": "sktreader.cgi"},
        "parser": {"shell": "parser", "web": "sktparser.cgi"},
        "search": {"shell": "indexer", "web": "sktindex.cgi"},
        "search_easy": {"shell": "indexerd", "web": "sktsearch.cgi"},
        "declension": {"shell": "declension", "web": "sktdeclin.cgi"},
        "conjugation": {"shell": "conjugation", "web": "sktconjug.cgi"},
        "lemma": {"shell": "lemmatizer", "web": "sktlemmatizer.cgi"},
        "sandhi": {"shell": "sandhier", "web": "sktsandhier.cgi"},
        "user": {"shell": "user_aid", "web": "sktuser.cgi"},
        "interface": {"shell": "interface", "web": "sktgraph.cgi"},
        "dictionary": {"shell": "../MW/", "web": "../../MW/"},
    }

    # Global options: description, accepted values and default value
    OPTIONS = {
        "lex": {
            "description": "Lexicon",
            "values": {
                "MW": "Monier-Williams Dictionary (English)",
                "SH": "Sanskrit Heritage Dictionary (French)",
            },
            "default": "MW",
        },
        "font": {
            "description": "Font for Sanskrit output",
            "values": {"deva": "Devanagari", "roma": "Roman (IAST)"},
            "default": "deva",
        },
        "t": {
            "description": "Internal Transliteration Scheme",
            "values": {"VH": "Velthuis"},
            "default": "VH",
        },
    }

    METHODS = ["shell", "web"]
    DEFAULT_METHOD = "shell"

    def __init__(
        self,
        base_dir: str = "",
        base_url: str = None,
        method: str = "shell",
        **kwargs,
    ):
        """
        Initialize Heritage Class

        Parameters
        ----------
        base_dir : str
            Path to the Heritage_Platform repository.
            The directory should contain 'ML' sub-directory,
            which further contains the scripts
        base_url : str, optional
            URL for the Heritage Platform Mirror.
            If None, the official INRIA website will be used.
            The default is None.
        method : str, optional
            Method used to obtain results. Results can be obtained either
            using the web installation or using UNIX shell.
            Possible values are, 'shell' and 'web'
            The default is 'shell'.
        **kwargs :
            Additional configuration keywords. Supported values are:

            * ``request_timeout`` (int): timeout for HTTP requests in
              seconds.
            * ``request_attempts`` (int): number of HTTP retries before
              giving up.

            Any other keyword is ignored (with a warning).
        """
        self.logger = logging.getLogger(__name__)
        self.base_url = self.INRIA_URL if base_url is None else base_url
        self.base_dir = base_dir
        self.scripts_dir = os.path.join(self.base_dir, "ML")

        self.request_timeout = kwargs.pop(
            "request_timeout", DEFAULT_REQUEST_TIMEOUT
        )
        self.request_attempts = kwargs.pop(
            "request_attempts", DEFAULT_REQUEST_ATTEMPTS
        )
        if kwargs:
            # Most likely a misspelt keyword: make it visible
            self.logger.warning(
                "Ignoring unknown keyword arguments: %s",
                ", ".join(sorted(kwargs)),
            )

        self.method = None
        self.set_method(method)

        if not self.valid_installation():
            # The fallback is only worth a warning when the caller was
            # actually going to use the local installation
            if self.method == "shell":
                self.logger.warning(
                    "Heritage Platform installation not found. "
                    "Falling back to `method=\"web\"`."
                )
            self.base_dir = ""
            self.scripts_dir = ""
            self.set_method("web")

        # Every global option starts at its default value
        self.options = {
            option: spec["default"] for option, spec in self.OPTIONS.items()
        }
###########################################################################
# Utilities (Actions)
def get_analysis(
    self,
    input_text: str,
    sentence: bool = True,
    unsandhied: bool = False,
    meta: bool = False,
    structured: bool = True,
):
    """
    Obtain morphological analyses using The Sanskrit Reader Companion

    Parameters
    ----------
    input_text : str
        Input text to analyse
    sentence : bool, optional
        The input is treated as a sentence, if true, otherwise as a word.
        The default is True.
    unsandhied : bool, optional
        If True, the input text is assumed to not contain sandhi.
        The default is False.
    meta : bool, optional
        The option is passed to HeritageOutput.extract_analysis().
        The default is False.
    structured : bool, optional
        Return dataclass objects if True, otherwise legacy dictionaries.
        The default is True.

    Returns
    -------
    dict[int, SolutionAnalysis] | dict | None
        Dictionary of valid morphological analyses with solution_id as
        keys, or ``None`` when no result could be obtained.
    """
    flag = {True: "t", False: "f"}
    options = {
        "lex": self.get_lexicon(),
        # Use Cache (t)rue, (f)alse
        "cache": "t",
        # Sentence (t)rue, Word (f)alse
        "st": flag[bool(sentence)],
        # Unsandhied (t)rue, (f)alse
        # if 'us' is 'f', "ca eva" is parsed as "ca_eva",
        # "tathā eva" as "tathā_eva" etc.
        "us": flag[bool(unsandhied)],
        # Full Parser Strength (t)rue, (f)alse
        "cp": "t",
        "t": self.get_option("t"),
        # Parse Mode (p)arsing, (t)agging
        # Tagging does not prune any solutions
        "mode": "p",
        # Output Display Font (deva)nagari (roma)n
        "font": self.get_font(),
        "topic": "",
        "corpmode": "",
        "corpdir": "",
        "sentno": "",
        "text": self.prepare_input(input_text),
    }

    html = self.get_result("reader", options)
    if html is None:
        return None

    parser = HeritageOutput(html)
    return parser.extract_analysis(meta=meta, structured=structured)
# ----------------------------------------------------------------------- #
def get_parse(
    self,
    input_text: str,
    solution_id: int = None,
    sentence: bool = True,
    unsandhied: bool = False,
):
    """
    Obtain parse of a sentence using The Sanskrit Reader Companion

    Parameters
    ----------
    input_text : str
        Input text to analyse
    solution_id : int, optional
        Solution ID to parse.
        If None, the first solution ID is used.
        The default is None.
    sentence : bool, optional
        The input is treated as a sentence, if true, otherwise as a word.
        The option is passed to HeritagePlatform.get_analysis().
        The default is True.
    unsandhied : bool, optional
        If True, the input text is assumed to not contain sandhi.
        The option is passed to HeritagePlatform.get_analysis().
        The default is False.

    Returns
    -------
    SolutionAnalysis | dict | None
        The chosen solution with the extracted roles attached as ``roles``.
        ``None`` when the analysis yields no solutions, when the requested
        ``solution_id`` does not exist or when the parser gives no result.
    """
    solutions = self.get_analysis(
        input_text,
        sentence=sentence,
        unsandhied=unsandhied,
        meta=True,
        structured=True,
    )
    # Analysis failed or produced nothing: there is nothing to parse
    if not solutions:
        return None

    if solution_id is None:
        # If solution ID not provided, use the first solution
        solution_id = next(iter(solutions))
    elif solution_id not in solutions:
        self.logger.error("Solution with id '%s' not found.", solution_id)
        return None

    solution = solutions[solution_id]
    is_structured = isinstance(solution, SolutionAnalysis)

    # No need to manually give options again: internally the parser is a
    # re-run of the reader until a specific solution, and the options for
    # that re-run were extracted along with the solution
    if is_structured:
        parser_options = solution.parser_options
    else:
        parser_options = solution["parser_options"]

    result = self.get_result("parser", dict(parser_options or {}))
    if result is None:
        return None

    roles = HeritageOutput(result).extract_parse(structured=is_structured)
    if is_structured:
        solution.roles = roles
    else:
        solution["roles"] = roles
    return solution
# ----------------------------------------------------------------------- #
def sandhi(self, word_1: str, word_2: str, mode: str = "internal"):
    """
    Join two words by forming a Sandhi

    Parameters
    ----------
    word_1 : str
        The first (left) word in the Sandhi
    word_2 : str
        The second (right) word in the Sandhi
    mode : str, optional
        Indicates whether the words join to form a single word or not
        Possible values are,
        * internal
        * external
        Any other value is reported with a warning but still sent along.
        The default is 'internal'.

    Returns
    -------
    sandhi : str
        String obtained by forming the Sandhi
        (``None`` when no result could be obtained)
    """
    if mode not in ("internal", "external"):
        self.logger.warning(f"Invalid mode: '{mode}'")

    options = {
        "lex": self.get_lexicon(),
        "l": self.prepare_input(word_1),
        "r": self.prepare_input(word_2),
        "t": self.get_option("t"),
        "k": mode,
        "font": self.get_font(),
    }

    html = self.get_result("sandhi", options)
    if html is None:
        return None
    return HeritageOutput(html).extract_sandhi()
# ----------------------------------------------------------------------- #
def search_inflected_form(self, word: str, category: str):
    """
    Search an inflected form

    Parameters
    ----------
    word : str
        Sanskrit Word to search (in Devanagari)
    category : str
        Type of the word
        * Noun: Noun
        * Pron: Pronoun
        * Part: Participle
        * Inde: Indeclinible
        * Absya, Abstvaa, Voca, Iic, Ifc, Iiv, Piic etc.

    Returns
    -------
    HeritageOutput | None
        Parser wrapping the raw result page (the matches are not extracted
        yet), or ``None`` when no result could be obtained.
    """
    query = {
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "c": category,
        "font": self.get_font(),
    }
    html = self.get_result("lemma", query)
    # TODO: Output Parsing
    return None if html is None else HeritageOutput(html)
# ----------------------------------------------------------------------- #
def get_declensions(
    self,
    word: str,
    gender: str,
    headers: bool = True,
    lexicon: str = None,
    structured: bool = True,
):
    """
    Retrieve declension tables from the Grammarian.

    Parameters
    ----------
    word : str
        Input word in Devanagari.
    gender : str
        Gender hint. Accepted values include short forms (``m``, ``f``,
        ``n``) and Sanskrit labels (e.g. ``पु``, ``स्त्री``).
    headers : bool, optional
        If ``True``, include header row information. The default is True.
    lexicon : str, optional
        Reserved for future use. Currently ignored.
    structured : bool, optional
        When ``True`` (the default), returns a
        :class:`heritage.models.DeclensionTable` instance. When ``False``,
        returns the raw nested list produced by
        :meth:`HeritageOutput.extract_declensions`.

    Returns
    -------
    DeclensionTable | list | None
        Structured table, legacy list-of-lists, or ``None`` when no table
        can be extracted.
    """
    query = {
        "lex": self.get_lexicon(),
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "g": self.identify_gender(gender),
        "font": self.get_font(),
    }
    html = self.get_result("declension", query)
    if html is None:
        return None

    parser = HeritageOutput(html)
    return parser.extract_declensions(headers=headers, structured=structured)
# ----------------------------------------------------------------------- #
def get_conjugations(
    self,
    word: str,
    gana: str,
    lexicon: str = None,
    headers: bool = True,
    structured: bool = True,
):
    """
    Retrieve conjugation paradigms from the Grammarian.

    Parameters
    ----------
    word : str
        Verbal root in Devanagari.
    gana : str
        Verbal class (gaṇa) identifier expected by the backend.
    lexicon : str, optional
        Reserved for future use. Currently ignored.
    headers : bool, optional
        If ``True``, treat the first row of each table as a heading.
    structured : bool, optional
        When ``True`` (the default), returns a list of
        :class:`heritage.models.ConjugationTable` objects. When ``False``,
        returns the legacy dictionary-of-tables output.

    Returns
    -------
    list[ConjugationTable] | dict | None
        Structured tables, legacy mapping, or ``None`` on failure.
    """
    query = {
        "lex": self.get_lexicon(),
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "c": gana,
        "font": self.get_font(),
    }
    html = self.get_result("conjugation", query)
    if html is None:
        return None

    parser = HeritageOutput(html)
    return parser.extract_conjugations(headers=headers, structured=structured)
# ----------------------------------------------------------------------- #
def search_lexicon(
    self, word: str, lexicon: str = None, structured: bool = True
):
    """Search a word in the dictionary.

    Parameters
    ----------
    word : str
        Sanskrit Word to search (in Devanagari)
    lexicon : str, optional
        Lexicon to search the word in.
        Possible values are,
        - MW: Monier-Williams Dictionary
        - SH: Heritage Dictionary
        Currently not used: the globally configured lexicon is sent.
    structured : bool, optional
        Passed to HeritageOutput.extract_search_results().
        The default is True.

    Returns
    -------
    list[SearchResult] | list[dict] | None
        Parsed search results (the default), legacy dictionaries when
        ``structured`` is False, or ``None`` when the backend response
        cannot be parsed.
    """
    query = {
        "lex": self.get_lexicon(),
        "t": self.get_option("t"),
        "q": self.prepare_input(word),
        "font": self.get_font(),
    }
    html = self.get_result("search", query)
    if html is None:
        return None

    # TODO: Currently not using the lexicon keyword argument
    # Is there any use for that argument? For this function?
    return HeritageOutput(html).extract_search_results(structured=structured)
###########################################################################
@functools.lru_cache(maxsize=None)
def get_lexicon_entry(self, file_name: str, word_id: str):
    """
    Fetch a single dictionary entry by its file and anchor identifier.

    With the ``shell`` method the dictionary page is read from the local
    installation, with the ``web`` method it is downloaded.

    Parameters
    ----------
    file_name : str
        Name of the HTML file containing the entry.
    word_id : str
        Anchor identifier within the dictionary page.

    Returns
    -------
    DictionaryEntry | None
        Parsed entry when available, otherwise ``None`` (page unavailable,
        invalid method or entry not found).
    """
    if self.method == "shell":
        file_path = os.path.join(self.get_path("dictionary"), file_name)
        try:
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Unreadable page: report it and give no entry, as documented
            self.logger.error(
                "Unable to read dictionary file '%s': %s", file_path, exc
            )
            return None
    elif self.method == "web":
        url = self.get_url("dictionary")
        query_url = f"{url}{file_name}#{word_id}"
        content = self.__get(
            query_url, self.request_attempts, self.request_timeout
        )
    else:
        self.logger.error(f"Invalid method: '{self.method}'.")
        return None

    if content is None:
        return None

    output = HeritageOutput(content)
    return output.extract_lexicon_entry(word_id)
###########################################################################
# Fetch Result through Web or Shell
def get_result_from_web(
    self,
    url: str,
    options: dict,
    attempts: int = None,
    timeout: int = None,
):
    """
    Get results from the Heritage Platform web mirror

    Exponential backoff is used in case there are network errors

    Parameters
    ----------
    url : str
        URL of the CGI script to call
        HeritagePlatform.get_url() can be used to generate supported URLs
    options : dict
        Dictionary containing valid options for the script
    attempts : int, optional
        Number of attempts for the exponential backoff
        The default is `self.request_attempts`.
    timeout : int, optional
        Timeout for the HTTP request in seconds.
        The default is `self.request_timeout`.

    Returns
    -------
    str
        Result (HTML) obtained. Returns ``None`` when every attempt fails.
    """
    # Any falsy value (None, 0) selects the configured default
    if not attempts:
        attempts = self.request_attempts
    if not timeout:
        timeout = self.request_timeout

    query_url = "{}?{}".format(url, build_query_string(options))
    return self.__get(query_url, attempts, timeout)
def __get(self, query_url: str, attempts: int, timeout: int):
    """
    Query web with exponential-backoff, re-using earlier responses

    Successful responses are remembered per instance and per URL.
    Failures are never remembered, so that a temporary network problem
    does not make a URL permanently unavailable.

    Parameters
    ----------
    query_url : str
        URL to query
    attempts : int
        Number of attempts for the exponential backoff
    timeout : int
        Timeout for the HTTP request in seconds.

    Returns
    -------
    str
        Result (HTML) obtained, or ``None`` when the query failed
    """
    # The cache lives on the instance, so it is released along with it
    cache = vars(self).setdefault("_response_cache", {})
    if query_url in cache:
        return cache[query_url]

    result = self._query_with_backoff(query_url, attempts, timeout)
    if result is not None:
        cache[query_url] = result
    return result
def _query_with_backoff(
    self, query_url: str, attempts: int, timeout: int
):
    """
    Fetch a URL with exponential backoff and robust decoding.

    Between two attempts the function sleeps for ``2 ** attempt`` seconds
    plus a random fraction of a second.

    Parameters
    ----------
    query_url : str
        URL to fetch
    attempts : int
        Maximum number of attempts (at least one attempt is always made)
    timeout : int
        Timeout for every single HTTP request in seconds.

    Returns
    -------
    str
        Decoded response text on success, otherwise ``None``.
    """
    attempts = max(1, attempts)
    # Outcome of the most recent attempt only: exactly one of the two is
    # set, so that the final message names the actual last failure
    last_error = None
    response = None

    for attempt in range(attempts):
        try:
            response = requests.get(query_url, timeout=timeout)
        except requests.RequestException as exc:
            last_error = exc
            response = None
            self.logger.warning(
                "Attempt %s/%s failed for %s: %s",
                attempt + 1,
                attempts,
                query_url,
                exc,
            )
        else:
            last_error = None
            if response.status_code == requests.codes.ok:
                return self._response_text(response)
            self.logger.warning(
                "Status code %s on attempt %s/%s for %s",
                response.status_code,
                attempt + 1,
                attempts,
                query_url,
            )

        # No need to wait after the final attempt
        if attempt < attempts - 1:
            backoff = (2 ** attempt) + random.random()
            time.sleep(backoff)

    if last_error is not None:
        self.logger.error(
            "Unable to fetch %s after %s attempts due to network errors.",
            query_url,
            attempts,
            exc_info=last_error,
        )
    elif response is not None:
        self.logger.error(
            "Unable to fetch %s after %s attempts. Last status: %s",
            query_url,
            attempts,
            response.status_code,
        )
    return None
@staticmethod
def _response_text(response: requests.Response) -> str:
    """
    Return the decoded response body, avoiding mojibake.

    The declared encoding is used, unless it is missing or one of the
    Latin-1 names in ``_LATIN_FALLBACK_ENCODINGS``; in that case the
    detected (apparent) encoding is used, or UTF-8 if there is none.
    If decoding fails, or the encoding name is unknown to Python, the body
    is decoded as UTF-8 with undecodable bytes replaced.
    """
    encoding = response.encoding
    if not encoding or encoding.lower() in _LATIN_FALLBACK_ENCODINGS:
        encoding = response.apparent_encoding or "utf-8"

    try:
        return response.content.decode(encoding)
    except (LookupError, UnicodeDecodeError):
        # LookupError: the server announced a codec Python does not know
        return response.content.decode("utf-8", errors="replace")
# ----------------------------------------------------------------------- #
def get_result_from_shell(
    self, path: str, options: dict, timeout: int = 30
):
    """
    Get results from the Heritage Platform's local installation via shell

    The options are handed to the script through the ``QUERY_STRING``
    environment variable, on top of a copy of the current environment.

    Parameters
    ----------
    path : str
        Path to the executable script
        HeritagePlatform.get_path() can be used to generate supported paths
    options : dict
        Valid options for the script
    timeout : int, optional
        Timeout in seconds, after which the function will abort.
        The default is 30.

    Returns
    -------
    result : str
        Result (HTML) obtained
    """
    query_string = build_query_string(options)
    environment = dict(os.environ)
    environment["QUERY_STRING"] = query_string
    # Frozen before being handed to the cached runner
    return self.__run(path, frozendict(environment), timeout=timeout)
@functools.lru_cache(maxsize=None)
def __run(self, path, environment: dict, timeout: int = 30):
    """
    Get results from shell through a subprocess call

    Parameters
    ----------
    path : str
        Path to the executable script
    environment : dict
        Environment variables to set
        (must be hashable, since the results are cached)
    timeout : int, optional
        Timeout in seconds, after which the function will abort.
        The default is 30.

    Returns
    -------
    result : str
        Result (HTML) obtained, without the leading ``Content-Type``
        header.  ``None`` when the script could not be run, failed or
        timed out.
    """
    try:
        raw = subprocess.check_output(
            path, env=environment, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        self.logger.error("Timeout while executing '%s'.", path)
        return None
    except subprocess.SubprocessError as exc:
        self.logger.error("Subprocess error while executing '%s': %s", path, exc)
        return None
    except OSError as exc:
        self.logger.error("OS error while executing '%s': %s", path, exc)
        return None

    result = raw.decode("utf-8")
    # Drop the header line and the blank line after it, but only when the
    # output really starts with one; the page itself is left untouched
    header = re.match(
        r"Content-Type:[^\n]*\r?\n\r?\n", result, flags=re.IGNORECASE
    )
    if header is not None:
        result = result[header.end():]
    return result
# ----------------------------------------------------------------------- #
def get_result(self, action: str, options: dict, *args, **kwargs):
    """
    High-level function to obtain result for various actions

    Avoids the hassle of generating the URL or PATH.
    Utilizes the HeritagePlatform.method attribute to determine
    whether to fetch through shell or web.

    Parameters
    ----------
    action : str
        Action value corresponding to the utility to be used.
        Refer to HeritagePlatform.ACTIONS
    options : dict
        Valid options for the specified action
    *args, **kwargs :
        Forwarded to get_result_from_shell() or get_result_from_web().

    Returns
    -------
    str
        Result (HTML) obtained
        (``None`` when the configured method is invalid)
    """
    method = self.method

    if method == "shell":
        target = self.get_path(action)
        return self.get_result_from_shell(target, options, *args, **kwargs)

    if method == "web":
        target = self.get_url(action)
        return self.get_result_from_web(target, options, *args, **kwargs)

    self.logger.error(f"Invalid method: '{self.method}'.")
    return None
###########################################################################
def set_method(self, method: str):
    """
    Set method for fetching the output

    Valid methods are listed in HeritagePlatform.METHODS; the comparison
    ignores case.  An invalid value leaves the current method untouched,
    unless none is set yet, in which case the default method is used.

    Returns
    -------
    bool
        True if the method was valid (and has been set), otherwise False.
    """
    normalized = method.lower()
    if normalized not in self.METHODS:
        self.logger.warning(f"Invalid method: '{method}'")
        if self.method is None:
            self.method = self.DEFAULT_METHOD
        return False

    self.method = normalized
    return True
# ----------------------------------------------------------------------- #
def get_option(self, opt_name: str):
    """
    Get the value of global options

    Returns ``None`` (after a warning) for names that are not listed in
    HeritagePlatform.OPTIONS.
    """
    if opt_name in self.OPTIONS:
        return self.options.get(opt_name, None)

    self.logger.warning(f"Invalid option: '{opt_name}'")
    return None
def set_option(self, opt_name: str, opt_value: str):
    """Set global options

    Any of these options, if expected by a particular utility from the
    Heritage Platform, will be directly used in the QUERY_STRING while
    fetching the output from that utility

    class variable OPTIONS stores the default values for options
    Each option contains,
    - a 'description' of the option
    - 'values' it can take (and descriptions of those values)
    - 'default' value

    The option name is lower-cased before use.

    Returns
    -------
    bool
        True if the option was set, False for an invalid name or value.
    """
    opt_name = opt_name.lower()
    spec = self.OPTIONS.get(opt_name) if opt_name in self.OPTIONS else None
    if spec is None:
        self.logger.warning(f"Invalid option: '{opt_name}'")
        return False

    if opt_value not in spec["values"]:
        self.logger.warning(
            f"Invalid value for option '{opt_name}': '{opt_value}'"
        )
        return False

    self.options[opt_name] = opt_value
    return True
# ----------------------------------------------------------------------- #
def get_font(self):
    """Return the value of the global "font" option (Sanskrit output font)"""
    current_font = self.get_option("font")
    return current_font
def set_font(self, font: str):
    """
    Set the global "font" option (Sanskrit output font)

    The name is lowercased before being passed to set_option(),
    whose result is returned.
    """
    font_name = font.lower()
    return self.set_option("font", font_name)
# ----------------------------------------------------------------------- #
def set_lexicon(self, lexicon: str):
    """
    Set the global "lex" option (lexicon)

    The name is uppercased before being passed to set_option(),
    whose result is returned.
    """
    lexicon_name = lexicon.upper()
    return self.set_option("lex", lexicon_name)
###########################################################################
# URL or Path Builders
def get_url(self, action: str):
    """
    URL Builder

    Joins the "web" entry of ACTIONS[action] onto base_url using
    urllib.parse.urljoin and returns the resulting URL.
    An unknown action raises KeyError.
    """
    base = self.base_url
    endpoint = self.ACTIONS[action]["web"]
    return urllib.parse.urljoin(base, endpoint)
def get_path(self, action: str):
    """
    Path Builder

    Joins the "shell" entry of ACTIONS[action] onto scripts_dir using
    os.path.join and returns the resulting path.
    An unknown action raises KeyError.
    """
    directory = self.scripts_dir
    script_name = self.ACTIONS[action]["shell"]
    return os.path.join(directory, script_name)
###########################################################################
def valid_installation(self):
    """
    Check if the Heritage Platform installation exists

    Returns
    -------
    bool
        True if scripts_dir is an existing directory.
        False otherwise, including when scripts_dir is None.
    """
    # TODO: A better check may be checking for the required executables
    # * If the file exists
    # * If the file is executable
    scripts_dir = self.scripts_dir
    if scripts_dir is None:
        # os.path.isdir(None) raises TypeError; an unset directory
        # simply means there is no installation to validate
        return False
    return os.path.isdir(scripts_dir)
###########################################################################
def __repr__(self):
    """
    Return a debugging representation of the form
    ClassName(repository="...", url="...", method="...")
    built from base_dir, base_url and method respectively.
    """
    labelled_values = (
        ("repository", self.base_dir),
        ("url", self.base_url),
        ("method", self.method),
    )
    repr_params = ", ".join(f'{k}="{v}"' for k, v in labelled_values)
    return f"{self.__class__.__name__}({repr_params})"
###########################################################################
# TODO: Consider moving the static helpers below to utils.py
@staticmethod
def prepare_input(input_text: str):
    """
    Prepare Input

    * Convert Devanagari to Velthuis (via devanagari_to_velthuis)
    * Split the converted text on whitespace and re-join the pieces
      with '+'
    """
    velthuis_text = devanagari_to_velthuis(input_text)
    tokens = velthuis_text.split()
    return "+".join(tokens)
@staticmethod
def identify_gender(gender: str):
    """
    Map a gender label to one of the keys "Mas", "Fem", "Neu", "Any"

    The label is lowercased and compared by prefix against the known
    markers of each key, in the order the keys are listed below.

    Returns
    -------
    str or None
        Key of the first group having a marker that prefixes the
        label, or None if no marker matches.
    """
    genders = {
        "Mas": ["पु", "m"],
        "Fem": ["स्त्री", "f"],
        "Neu": ["नपु", "n"],
        "Any": ["*", "त्रि", "a"],
    }
    label = gender.lower()
    for gender_key, gender_list in genders.items():
        # str.startswith accepts a tuple and tests every marker at once
        if label.startswith(tuple(gender_list)):
            return gender_key
    return None
###############################################################################