Source code for segram.grammar.abc

"""Base classes from which ABCs of concrete grammar classes are derived.

Grammar classes provide building blocks for representing complex
syntactical relationships within sentences which go beyond simple
syntax tree links and can be used to perform various tasks such as
component and phrase detection.
"""
# pylint: disable=no-name-in-module
from typing import Any, Self, Callable, ClassVar, Final
from typing import MutableMapping, Container, Sequence
from abc import abstractmethod
import re
from functools import total_ordering
from catalogue import Registry
import numpy as np
from ..nlp.tokens import Doc, Span, Token
from ..utils.registries import grammars
from ..abc import SegramWithDocABC
from ..datastruct import Namespace, DataTuple
from ..utils.misc import cosine_similarity


class GrammarNamespace(Namespace):
    """Typed namespace holding grammar classes keyed by their alias.

    Instances are used as the ``types`` class attribute of grammar
    classes. The annotations below only declare which aliases are
    expected to be present; no values are assigned here.
    """
    # Base grammar class
    Grammar: type["Grammar"]
    # Components
    Component: type["Component"]
    Verb: type["Verb"]
    Noun: type["Noun"]
    Prep: type["Prep"]
    Desc: type["Desc"]
    # Phrases
    Phrase: type["Phrase"]
    VP: type["VerbPhrase"]
    NP: type["NounPhrase"]
    DP: type["DescPhrase"]
    PP: type["PrepPhrase"]
    # Sentence and document level elements
    Sent: type["Sent"]
    Doc: type["Doc"]
class Grammar(SegramWithDocABC, Container):
    """Abstract base class for grammar classes.

    All grammar classes must be defined as **slots** classes.
    This is necessary for ensuring low-memory footprint
    and better computational efficiency. Even classes with no new
    slots need to declare ``__slots__ = ()``. This requirement
    is checked during class construction.

    Other class-specific requirements of this sort as well as their
    related validation checks may be implemented on specialized grammar
    classes using the standard ``__init_subclass__`` interface.
    This allows abstract base classes further down the inheritance chain
    to check for more complex requirements as well as apply dynamic
    class customizations.

    Attributes
    ----------
    alias
        Name under which the class is stored in ``types``.
    types
        Namespace mapping aliases to grammar classes.
    roles
        Mutable mapping reserved for role definitions.
    grammars
        Registry in which grammar classes are registered
        when subclassed with the ``register`` keyword.
    """
    __slots__ = ()
    alias: ClassVar[str] = "Grammar"
    types: ClassVar[GrammarNamespace] = GrammarNamespace()
    roles: ClassVar[MutableMapping] = Namespace()
    grammars: Final[Registry] = grammars

    def __hash__(self) -> int:
        # Defining '__eq__' in a class body sets '__hash__' to 'None'
        # unless it is declared explicitly, so delegate to the parent.
        return super().__hash__()

    def __eq__(self, other: Self) -> bool:
        """Two grammar objects are equal when they share
        the very same ``doc`` object.
        """
        if isinstance(other, Grammar):
            # Identity test instead of comparing 'id()' values:
            # ids are only unique among simultaneously living objects,
            # so comparing ids of temporaries may give false positives.
            return self.doc is other.doc
        return NotImplemented

    def __init_subclass__(cls, *, register: str | None = None) -> None:
        """Register subclass in the types namespace.

        Parameters
        ----------
        register
            When given (and non-empty) the class gets fresh ``types``
            and ``roles`` namespaces and is added to the ``grammars``
            registry under this name.

        Raises
        ------
        TypeError
            If the class alias is already taken by a type
            with a different ``ppath()``.
        """
        super().__init_subclass__()
        if register:
            cls.types = GrammarNamespace()
            cls.roles = Namespace()
            cls.grammars.register(register, func=cls)
        # Add to the members list ---------------------------------------------
        alias = getattr(cls, "alias", None)
        if alias:
            existing = cls.types.get(alias)
            if existing and (tpath := existing.ppath()) != cls.ppath():
                raise TypeError(f"'{alias}' already defined by '{tpath}'")
            cls.types[alias] = cls

    # Methods -----------------------------------------------------------------

    @abstractmethod
    def to_data(self) -> dict[str, Any]:
        """Dump to data dictionary."""
        raise NotImplementedError
class GrammarElement(Grammar, Sequence):
    """Abstract base class for grammar elements.

    Elements are sequences identified by an index (``idx``)
    and carrying a word vector (``vector``). Concrete subclasses
    have to provide ``idx``, ``vector`` and ``to_str()``.
    """
    __slots__ = ()
    alias: ClassVar[str] = "GElem"
    # Placeholder value; NOTE(review): presumably replaced with
    # a concrete similarity class elsewhere - confirm.
    Similarity: ClassVar[type["GrammarSimilarity"]] = type

    def __repr__(self) -> str:
        return self.to_str(color=True)

    def __hash__(self) -> int:
        # Explicit, because defining '__eq__' would otherwise
        # disable hashing for this class.
        return super().__hash__()

    def __eq__(self, other: Self) -> bool:
        """Comparable elements are equal when their indexes are equal;
        anything else compares as not equal.
        """
        if self.is_comparable_with(other):
            return self.idx == other.idx
        return False

    def __bool__(self) -> bool:
        # Truthiness depends on having an index, not on the length.
        return self.idx is not None

    # Properties --------------------------------------------------------------

    @property
    @abstractmethod
    def idx(self) -> int | tuple[int, ...]:
        """Element index."""
        raise NotImplementedError

    @property
    @abstractmethod
    def vector(self) -> np.ndarray[tuple[int], np.floating]:
        """Word vector."""

    @property
    def tokens(self) -> DataTuple[Token]:
        """Tokens sequence of the element."""
        return DataTuple(self)

    @property
    def text(self) -> str:
        """Raw text of element."""
        return self.to_str()

    @property
    def lemma(self) -> str:
        """Lemmas of the tokens joined with their whitespace
        and stripped at both ends.
        """
        return "".join(t.lemma+t.whitespace for t in self.tokens).strip()

    # Methods -----------------------------------------------------------------

    @abstractmethod
    def to_str(self, **kwds: Any) -> str:
        """Represent as a string."""
        raise NotImplementedError

    def get_hashdata(self) -> tuple[Any, ...]:
        """Get hash data of the parent class extended with ``self.idx``."""
        return (*super().get_hashdata(), self.idx)

    def match(
        self,
        _pattern: str | None = None,
        _flag: re.RegexFlag = re.NOFLAG,
        _ignore_missing: bool = False,
        **kwds: Any | Callable[[Any], bool]
    ) -> bool:
        """Match element text against a regex pattern
        using :func:`re.search` function and test attribute values.

        Parameters
        ----------
        _pattern
            Regular expression pattern used for matching.
            No matching is done when ``None``.
        _flag
            Regex flag.
        _ignore_missing
            Should missing fields on ``self`` be ignored.
        **kwds
            Other keyword arguments can be used for testing
            values of different attributes on ``self``.
            If callables are passed as values then they are
            expected to be predicate functions returning
            boolean values. Other values are compared
            with the attribute using ``==``.

        Returns
        -------
        bool
            ``True`` only if the pattern (when given)
            and all attribute tests match.

        Raises
        ------
        AttributeError
            If a tested field is missing on ``self``
            and ``_ignore_missing`` is false.
        """
        is_match = True
        if _pattern is not None:
            is_match &= bool(re.search(_pattern, self.text, _flag))
        # All tests are evaluated (no short-circuit), so a missing
        # field is reported even when an earlier test already failed.
        for field, test in kwds.items():
            try:
                attr = getattr(self, field)
            except AttributeError:
                if _ignore_missing:
                    continue
                raise
            if callable(test):
                is_match &= bool(test(attr))
            else:
                is_match &= attr == test
        return is_match

    def similarity(self, other: Self) -> float:
        """Cosine similarity between word vectors."""
        return cosine_similarity(self.vector, other.vector)
class DocElement(GrammarElement, Sequence):
    """Grammar element based on an entire document.

    Indexing and length are delegated to the wrapped ``doc`` object.
    """
    __slots__ = ("doc",)
    alias: ClassVar[str] = "DocElem"

    def __init__(self, doc: Doc) -> None:
        super().__init__()
        self.doc = doc

    def __len__(self) -> int:
        return len(self.doc)

    def __getitem__(self, idx: int | slice) -> Token | Span:
        return self.doc[idx]

    # Properties --------------------------------------------------------------

    @property
    def id(self) -> int:
        """Slow persistent document id.

        It will be always the same for documents
        based on the same exact data.
        """
        return self.doc.id

    @property
    def idx(self) -> int:
        """Fast document id (hash of the document object).

        It is stable for an instance, and allows for hashing,
        but is not stable for different objects with the same data,
        e.g. an element initialized from the same data twice
        may have different ``.idx`` values each time.
        """
        return hash(self.doc)

    @property
    def vector(self) -> np.ndarray[tuple[int], np.floating]:
        """Word vector of the document."""
        return self.doc.vector

    # Methods -----------------------------------------------------------------

    @classmethod
    @abstractmethod
    def from_data(cls, doc: Doc, data: dict[str, Any]) -> Self:
        """Construct from NLP document and data dictionary."""
@total_ordering
class SentElement(GrammarElement):
    """Grammar element based on a sentence span.

    Elements are ordered by ``idx``, i.e. by ``(start, end)``;
    remaining comparison operators are filled in
    by :func:`functools.total_ordering`.
    """
    __slots__ = ("sent", "_doc")
    alias: ClassVar[str] = "SentElem"

    def __init__(self, sent: Span) -> None:
        """Initialize from a sentence span.

        Raises
        ------
        ValueError
            If ``sent`` is not the very sentence object
            its own root token points to.
        """
        super().__init__()
        if sent.root.sent is not sent:
            raise ValueError("'sent' has to be a proper sentence span object")
        self.sent = sent
        # Parent document element is resolved lazily in the 'doc' property.
        self._doc = None

    def __lt__(self, other: Self) -> bool:
        if self.is_comparable_with(other):
            return self.idx < other.idx
        return NotImplemented

    def __getitem__(self, idx: int | slice) -> Token | Span:
        return self.sent[idx]

    def __len__(self) -> int:
        return len(self.sent)

    # Properties --------------------------------------------------------------

    @property
    def doc(self) -> DocElement:
        """Grammar element of the parent document (cached on first use)."""
        # Explicit 'None' check so the cache does not depend
        # on the truthiness of the cached element.
        if self._doc is None:
            self._doc = self.sent.doc.grammar
        return self._doc

    @property
    def root(self) -> Token:
        """Root token of the sentence."""
        return self.sent.root

    @property
    def start(self) -> int:
        """Start index of the sentence span."""
        return self.sent.start

    @property
    def end(self) -> int:
        """End index of the sentence span."""
        return self.sent.end

    @property
    def idx(self) -> tuple[int, int]:
        """Sentence index equal to ``(self.start, self.end)``
        allowing for identification/hashing and sorting
        within the parent document.
        """
        return (self.start, self.end)

    @property
    def vector(self) -> np.ndarray[tuple[int], np.floating]:
        """Word vector of the sentence."""
        return self.sent.vector

    @property
    def is_correct(self) -> bool:
        """Indicates whether the sentence has been parsed correctly
        and has a well-defined root token.
        """
        try:
            # Only the attribute access matters here; a 'KeyError'
            # is treated as a missing root token.
            self.root
        except KeyError:
            return False
        return True

    # Methods -----------------------------------------------------------------

    @classmethod
    @abstractmethod
    def from_data(cls, doc: Doc, data: dict[str, Any]) -> Self:
        """Construct from document and data dictionary."""
@total_ordering
class TokenElement(GrammarElement):
    """Grammar element based on a token.

    Elements are ordered by ``idx`` (token index); remaining comparison
    operators are filled in by :func:`functools.total_ordering`.
    Sequence behaviour (indexing, length) is defined in terms
    of the abstract ``tokens`` property.
    """
    __slots__ = ("tok", "_sent", "_doc")
    alias: ClassVar[str] = "TokElem"

    def __init__(self, tok: Token) -> None:
        super().__init__()
        self.tok = tok
        # Parent sentence and document elements are resolved lazily
        # in the 'sent' and 'doc' properties.
        self._sent = None
        self._doc = None

    def __lt__(self, other: Self) -> bool:
        if self.is_comparable_with(other):
            return self.idx < other.idx
        return NotImplemented

    def __getitem__(self, idx: int | slice) -> Token | tuple[Token, ...]:
        return self.tokens[idx]

    def __len__(self) -> int:
        return len(self.tokens)

    # Properties --------------------------------------------------------------

    @property
    def doc(self) -> DocElement:
        """Grammar element of the parent document (cached on first use)."""
        # Explicit 'None' check so the cache does not depend
        # on the truthiness of the cached element.
        if self._doc is None:
            self._doc = self.tok.doc.grammar
        return self._doc

    @property
    def sent(self) -> SentElement:
        """Grammar element of the parent sentence (cached on first use)."""
        if self._sent is None:
            self._sent = self.tok.sent.grammar
        return self._sent

    @property
    def idx(self) -> int:
        """Token index within the parent document."""
        return self.tok.i

    @property
    @abstractmethod
    def tokens(self) -> tuple[Token, ...]:
        """Tokens making up the element."""

    @property
    def vector(self) -> np.ndarray[tuple[int], np.floating]:
        """Word vector of the token."""
        return self.tok.vector

    # Methods -----------------------------------------------------------------

    @classmethod
    @abstractmethod
    def from_data(cls, doc: Doc, data: dict[str, Any]) -> Self:
        """Construct from document and data dictionary."""