# Source code for segram.nlp.tokens.abc

"""Abstract base class for :mod:`segram`-enhanced :mod:`spacy` tokens."""
# pylint: disable=no-name-in-module
from typing import Any, Self
from abc import ABC
from functools import total_ordering
import numpy as np
from spacy.vocab import Vocab
from spacy.tokens import Doc, Span, Token
from spacy.tokens.underscore import Underscore
from ...utils.meta import init_class_attrs
from ...utils.misc import cosine_similarity
from ... import __title__


class NLP(ABC):
    """Abstract base class for NLP tokens.

    Attributes
    ----------
    tok
        Base :mod:`spacy` token object.

    Notes
    -----
    Every subclass must declare its own ``__slots__``; this is enforced
    in :meth:`__init_subclass__`.
    """
    __slots__ = ("tok",)

    def __init__(self, tok: Doc | Span | Token) -> None:
        self.tok = tok

    def __repr__(self) -> str:
        """String representation (the text of the wrapped token)."""
        return self.text

    def __hash__(self) -> int:
        """Hash derived from the wrapped :mod:`spacy` object."""
        return hash((0, self.tok))

    def __eq__(self, other: Self) -> bool:
        """Check equality with another token of the same type.

        Returns
        -------
        bool
            Result of comparing the documents of ``self`` and ``other``
            when :meth:`is_comparable_with` returns exactly ``True``.
        NotImplemented
            In every other case, so that Python can fall back
            to the reflected comparison.

        Raises
        ------
        ValueError
            Propagated from :meth:`is_comparable_with` when both objects
            are :class:`NLP` instances based on different documents.
        """
        # 'is True' is deliberate: 'NotImplemented' must not be treated
        # as a truthy "comparable" answer.
        if self.is_comparable_with(other) is True:
            return self.doc == other.doc
        return NotImplemented

    def __init_subclass__(cls) -> None:
        """Validate and finalize a newly defined subclass.

        Raises
        ------
        TypeError
            If the subclass body does not define ``__slots__`` itself.
        """
        super().__init_subclass__()
        if "__slots__" not in cls.__dict__:
            raise TypeError(f"'{cls.__name__}' does not define '__slots__'")
        # NOTE(review): presumably collects '__slots__' along the MRO into
        # a 'slot_names' class attribute -- confirm in 'utils.meta'.
        init_class_attrs(cls, {
            "__slots__": "slot_names"
        })
        # 'total_ordering' raises 'ValueError' when the class defines no
        # ordering operation at all; such classes are left as they are.
        try:
            total_ordering(cls)
        except ValueError:
            pass

    # Properties --------------------------------------------------------------

    @property
    def text(self) -> str:
        """Text of the wrapped token."""
        return self.tok.text

    @property
    def doc(self) -> "Doc":
        """:mod:`segram` namespace object of the document of ``self.tok``."""
        return self.sns(self.tok.doc)

    @property
    def alias(self) -> str:
        """Alias stored on the extension attributes of the document."""
        return getattr(self.tok.doc._, __title__ + "_alias")

    @property
    def lang(self) -> str:
        """Language of the document, as reported by :attr:`doc`."""
        return self.doc.lang

    @property
    def vocab(self) -> Vocab:
        """Vocabulary of the wrapped token."""
        return self.tok.vocab

    @property
    def vector(self) -> np.ndarray[tuple[int], np.floating]:
        """Vector of the wrapped token."""
        return self.tok.vector

    @property
    def has_vectors(self) -> bool:
        """Whether the vocabulary reports a positive vector length."""
        return self.vocab.vectors_length > 0

    @property
    def _(self) -> Underscore:
        """Extension attributes (``._``) of the wrapped token."""
        return self.tok._

    # Methods -----------------------------------------------------------------

    def is_comparable_with(self, other: Any) -> bool:
        """Check if ``self`` defines the same abstract interface as ``other``.

        Returns
        -------
        bool
            ``True`` when ``other`` is an instance of the class of ``self``.
        NotImplemented
            When ``other`` is not an :class:`NLP` instance at all, or is one
            but not an instance of the class of ``self``.

        Raises
        ------
        ValueError
            If ``other`` is an :class:`NLP` instance whose document
            is not the very same object as the document of ``self``.
        """
        if not isinstance(other, NLP):
            return NotImplemented
        if self.doc is not other.doc:
            raise ValueError("'self' and 'other' are based on different documents")
        return isinstance(other, self.__class__) or NotImplemented

    @classmethod
    def sns(cls, tok: Doc | Span | Token) -> Self:
        """Get :mod:`segram` namespace from :mod:`spacy` token.

        The alias is read from the extension attributes of ``tok.doc``
        and then used to look up the ``<alias>_sns`` extension attribute
        of ``tok`` itself.
        """
        alias = getattr(tok.doc._, __title__ + "_alias")
        return getattr(tok._, alias + "_sns")

    def similarity(self, other: Doc | Span | Token) -> float:
        """Cosine similarity between the vectors of ``self`` and ``other``."""
        return cosine_similarity(self.vector, other.vector)