# Source code for segram.grammar.components

"""Abstract base class for grammar components.

Grammar components are groups of associated tokens controlled
by a root token, e.g. a verb with its auxiliary verbs.
"""
from typing import Any, Iterable, ClassVar, Self
from .abc import TokenElement
from .conjuncts import Conjuncts
from ..nlp.tokens import Token
from ..symbols import POS, Role, Tense, Modal, Mood, Symbol
from ..utils.misc import cosine_similarity
from ..datastruct import DataTuple
from ..nlp.tokens import Doc



[docs]
class Component(TokenElement):
    """Abstract base class for grammar components.

    Components consist of a root token associated with (optional)
    additional subordinated tokens, e.g. a noun and its determiner.

    Default syntactic role assigned to the given component type
    can be defined using ``__role__`` class attribute.

    This is a base class used for implementing concrete
    components classes. Names of all possible controlled tokens
    (e.g. ``neg`` for a negation token) must be defined in
    ``__tokens__`` class attributes along the inheritance chain
    of concrete subclasses up to :class:`Component`.
    Each controlled token name should be declared only once and
    all must be present also in ``__slots__``. Component classes
    not defining any new controlled token slots have to define
    ``__tokens__ = ()``. The same rules apply to defining component
    attributes through ``__attrs__`` class attributes.

    The above requirements are checked at runtime during class creation.

    Attributes
    ----------
    sent
        Sentence the component belongs to.
    tok
        Head token object.
    role
        Syntactic role of the component head token.
    sub
        Tokens dependent on the head
        and not included in other token categories.
        They are not printed.
    qmark
        Question mark token.
    exclam
        Exclamation mark token.
    intj
        Interjection token.
    neg
        Negation token(s).
    """
    __role__ = None
    __tokens__ = ("qmark", "exclam", "intj", "neg")
    __slots__ = ("_tid", "role", "sub", *__tokens__)
    alias: ClassVar[str] = "Component"
    token_names: ClassVar[tuple[str, ...]] = ()
    attr_names: ClassVar[tuple[str, ...]] = ()

    def __init__(
        self,
        tok: Token,
        *,
        role: Role | None = None,
        sub: Iterable[Token] = (),
        qmark: Token | None = None,
        exclam: Token | None = None,
        intj: Token | None = None,
        neg: Token | None = None
    ) -> None:
        """Initialize the component.

        Parameters
        ----------
        tok
            Head token.
        role
            Syntactic role; falls back to the class-level ``__role__``
            when falsy. String values are resolved with
            :meth:`Role.from_name`.
        sub
            Dependent tokens; stored as a :class:`DataTuple`.
        qmark, exclam, intj, neg
            Optional controlled tokens.
        """
        super().__init__(tok)
        # Cached token ids; computed lazily by the ``tid`` property.
        self._tid = ()
        role = role or self.__role__
        self.role = Role.from_name(role) if isinstance(role, str) else role
        self.qmark = qmark
        self.exclam = exclam
        self.intj = intj
        self.neg = neg
        self.sub = DataTuple(sub)

    def __new__(cls, *args: Any, **kwds: Any) -> Self:
        """Create a component or reuse the one registered for the same head.

        A candidate instance is initialized first so that its sentence
        and head index can be read. If the sentence component map
        (``sent.cmap``) already holds a truthy entry under that index,
        that entry is re-initialized with the candidate's data
        (restricted to the entry's ``slot_names``) and returned instead.
        Otherwise the candidate is registered in ``sent.cmap`` and
        its phrase in ``sent.pmap``.
        """
        obj = super().__new__(cls)
        # Explicit initialization is needed here because ``sent`` and ``idx``
        # are read below, before Python's own ``__init__`` call happens.
        obj.__init__(*args, **kwds)
        # NOTE(review): this is a truthiness test, not an ``is not None``
        # test, so it depends on ``__bool__``/``__len__`` of the stored
        # component -- confirm this is intended.
        if (cur := obj.sent.cmap.get(obj.idx)):
            data = { k: v for k, v in obj.data.items() if k in cur.slot_names }
            cur.__init__(**data)
            return cur
        obj.sent.cmap[obj.idx] = obj
        obj.sent.pmap[obj.idx] = obj.phrase
        return obj

    def __getitem__(self, idx: int | slice) -> Token | tuple[Token, ...]:
        """Get component token(s) by position in the sorted token ids.

        An integer index returns a single token, a slice returns
        a tuple with the selected tokens.
        """
        if isinstance(idx, slice):
            # Apply the slice to the token ids, so only the requested
            # tokens are looked up in the document.
            return tuple(self.doc[i] for i in self.tid[idx])
        return self.doc[self.tid[idx]]

    def __len__(self) -> int:
        # NOTE(review): this reads the cached ``_tid`` directly, so it is 0
        # until the ``tid`` property has been accessed at least once.
        return len(self._tid)

    def __init_subclass__(cls) -> None:
        """Validate subclass declarations and register it in the role map.

        Raises
        ------
        TypeError
            If ``'tok'`` is listed in ``__tokens__``, if the ``__role__``
            is already assigned to a different class or if any of the POS
            tags in ``__tags__`` is already mapped to a different class.
        """
        super().__init_subclass__()
        cls.init_class_attrs({
            "__tokens__": "token_names",
            "__attrs__": "attr_names"
        })
        if "tok" in cls.__tokens__:
            raise TypeError("'tok' cannot be declared in '__tokens__'")
        tags = getattr(cls, "__tags__", None)
        # Build role map ------------------------------------------------------
        role = getattr(cls, "__role__", None)
        if role:
            if (t := cls.roles.get(role)) \
            and (tpath := t.ppath()) != cls.ppath():
                raise TypeError(f"'{role}' role already assigned to '{tpath}'")
            cls.roles[role] = cls
        if tags:
            # Combined tags are split by their '|'-separated name
            # and every single POS tag is mapped separately.
            tags = [ POS.from_name(tag) for tag in tags.name.split("|") ]
            for pos in tags:
                cur_type = cls.roles.get(pos)
                # pylint: disable=comparison-with-callable
                if cur_type and cur_type.ppath() != cls.ppath():
                    raise TypeError(
                        "cannot map multiple component "
                        f"classes to '{pos.name}' POS tag"
                    )
                cls.roles[pos] = cls

    # Properties --------------------------------------------------------------

    @property
    def idx(self) -> int:
        """Index of the component head token."""
        return self.tok.i

    @property
    def head(self) -> Token:
        """Component head token."""
        return self.tok

    @property
    def lead(self) -> Self:
        """Head component of the lead phrase."""
        return self.phrase.lead.head

    @property
    def is_lead(self) -> bool:
        """Is the controlling phrase of the component a lead phrase."""
        return self.phrase.is_lead

    @property
    def conjuncts(self) -> Conjuncts[Self]:
        """Conjuncts of the controlling phrase with members
        replaced by their head components.
        """
        return (conjs := self.phrase.conjuncts).copy(
            members=tuple(m.head for m in conjs.members)
        )

    @property
    def phrase(self) -> "Phrase":
        """Phrase controlled by the component.

        It is taken from the sentence phrase map (``sent.pmap``)
        when available and constructed from the component otherwise.
        """
        if (p := self.sent.pmap.get(self.idx)):
            return p
        return self.types.Phrase.from_component(self)

    @property
    def tid(self) -> tuple[int, ...]:
        """Sorted token ids of the component (computed lazily and cached)."""
        if not self._tid:
            self._tid = self.get_tid()
        return self._tid

    @property
    def tokens(self) -> tuple[Token, ...]:
        """Component tokens in the order of their ids."""
        return tuple(self.doc[i] for i in self.tid)

    @property
    def subtokens(self) -> tuple[Token, ...]:
        """Sorted component tokens together with ``sub`` tokens."""
        # Returned as a tuple to match the declared type and ``tokens``.
        return tuple(sorted((*self.tokens, *self.sub)))

    @property
    def pos(self) -> POS:
        """Part-of-speech tag of the head token."""
        return self.tok.pos

    @property
    def attrs(self) -> dict[str, Any]:
        """Attributes dictionary."""
        dct = {}
        for name in self.attr_names:
            attr = getattr(self, name)
            # Symbols are represented by their names.
            if isinstance(attr, Symbol):
                attr = attr.name
            dct[name] = attr
        return dct

    # Methods -----------------------------------------------------------------

    @classmethod
    def from_data(cls, doc: Doc, data: dict[str, Any]) -> Self:
        """Construct from :class:`~segram.nlp.Doc` and a data dict.

        Parameters
        ----------
        doc
            Document used for resolving token indices.
        data
            Data dictionary with ``"@class"`` key storing the alias of
            the component type. Token slots store either a single integer
            index or an iterable of indices. It is not modified.
        """
        data = data.copy()
        alias = data.pop("@class")
        typ = cls.types[alias]
        for name in ("tok", *typ.token_names, "sub"):
            if name not in data:
                continue
            idx = data[name]
            if isinstance(idx, int):
                data[name] = doc[idx]
            else:
                data[name] = [ doc[i] for i in idx ]
        return typ(**data)

    def to_data(self) -> dict[str, Any]:
        """Dump to data dictionary.

        Tokens are represented by their indices and falsy
        (e.g. empty) token slots are skipped.
        """
        slots = ("tok", "sub", *self.token_names)
        data = {}
        for name, tok in self.data.items():
            if name not in slots or not tok:
                continue
            if isinstance(tok, Token):
                data[name] = tok.i
            else:
                data[name] = [ t.i for t in tok ]
        return {
            "@class": self.alias,
            **data,
            **self.attrs,
        }

    @classmethod
    def get_comp_type(
        cls,
        role: Role | None = None,
        pos: POS | None = None
    ) -> type[Self]:
        """Get component type from role or POS tag.

        Role takes precedence over POS tag and ``cls``
        is returned when neither of them is mapped.
        """
        return cls.roles.get(role, cls.roles.get(pos, cls))

    def to_str(self, *, color: bool = False, role: Role | None = None, **kwds: Any) -> str:
        """Represent as a string.

        Parameters
        ----------
        color
            Passed to ``to_str`` method of tokens.
        role
            Overrides head token role.
        **kwds
            Accepted but not used.
        """
        # pylint: disable=unused-argument
        return " ".join(
            t.to_str(color=color, role=r)
            for t, r in self.iter_token_roles(role=role)
        )

    def get_tid(self) -> tuple[int, ...]:
        """Get token tuple id.

        It is a sorted tuple of indices of the head token
        and all non-empty controlled tokens.
        """
        def _iter():
            for name in ("tok", *self.token_names):
                if (value := getattr(self, name, None)):
                    # A slot stores either a single token
                    # or an iterable of tokens.
                    if isinstance(value, Token):
                        yield value
                    else:
                        yield from value
        return tuple(sorted(t.i for t in _iter()))

    def iter_token_roles(
        self,
        *,
        role: Role | None = None,
        bg: bool = False
    ) -> Iterable[tuple[Token, Role | None]]:
        """Iterate over token-role pairs.

        The head token is paired with the (possibly overridden)
        component role and all the other tokens with their own roles.

        Parameters
        ----------
        role
            Overrides head token role.
        bg
            Should tokens be marked as a background token
            (e.g. as a part of a subclause).
            This is used for graying out subclauses when printing.
            When true, the head token role is replaced with ``Role.BG``.
        """
        role = role or self.role
        if bg:
            role = Role.BG
        for tok in self.tokens:
            yield tok, role if tok == self.tok else tok.role

    def is_comparable_with(self, other: Any) -> bool:
        """Check whether ``other`` is a :class:`Component` instance."""
        return isinstance(other, Component)

    def similarity(self, other: Self | Token) -> float:
        """Cosine similarity to other component or token."""
        return cosine_similarity(self.vector, other.vector)




# pylint: disable=abstract-method

[docs]
class Verb(Component):
    """Abstract base class for verb components.

    It is registered under ``Role.VERB`` role
    and ``POS.VERB`` and ``POS.AUX`` tags.

    Attributes
    ----------
    neg
        Negation token.
    tense
        Tense symbol (``Tense.PRESENT`` by default).
    modal
        Modal symbol (``Modal.NULL`` by default).
    mood
        Mood symbol (``Mood.REAL`` by default).

    Notes
    -----
    ``tense``, ``modal`` and ``mood`` may be passed as strings,
    in which case they are resolved with ``from_name`` method
    of the corresponding symbol class.
    """
    __role__ = Role.VERB
    __tags__ = POS.VERB | POS.AUX
    __attrs__ = ("tense", "modal", "mood")
    __slots__ = tuple(__attrs__)
    alias: ClassVar[str] = "Verb"

    def __init__(
        self,
        *args: Any,
        tense: Tense = Tense.PRESENT,
        modal: Modal = Modal.NULL,
        mood: Mood = Mood.REAL,
        **kwds: Any
    ) -> None:
        super().__init__(*args, **kwds)
        # Names are resolved to symbols one attribute at a time.
        if isinstance(tense, str):
            tense = Tense.from_name(tense)
        self.tense = tense
        if isinstance(modal, str):
            modal = Modal.from_name(modal)
        self.modal = modal
        if isinstance(mood, str):
            mood = Mood.from_name(mood)
        self.mood = mood




[docs]
class Noun(Component):
    """Abstract base class for noun components.

    It uses ``Role.NOUN`` as the default role and is tagged with
    ``POS.NOUN``, ``POS.PROPN`` and ``POS.PRON``.

    Notes
    -----
    It declares no new slots (``__slots__ = ()``),
    so it has no ``mod`` slot of its own.
    """
    __role__ = Role.NOUN
    __tags__ = POS.NOUN | POS.PROPN | POS.PRON
    __slots__ = ()
    alias: ClassVar[str] = "Noun"




[docs]
class Prep(Component):
    """Abstract base class for preposition components.

    It is registered under ``Role.PREP`` role and ``POS.ADP`` tag.

    Attributes
    ----------
    preps
        Chain of subsequent prepositions attached to the head token.
        Stored as a tuple.
    """
    __role__ = Role.PREP
    __tags__ = POS.ADP
    __tokens__ = ("preps",)
    __slots__ = tuple(__tokens__)
    alias: ClassVar[str] = "Prep"

    def __init__(
        self,
        *args: Any,
        preps: Iterable[Token] = (),
        **kwds: Any
    ) -> None:
        super().__init__(*args, **kwds)
        # Materialize the iterable, so the slot always holds a tuple.
        self.preps = (*preps,)




[docs]
class Desc(Component):
    """Abstract base class for description components.

    It is registered under ``Role.DESC`` role
    and ``POS.ADJ`` and ``POS.ADV`` tags.

    Attributes
    ----------
    mod
        Modifier tokens. Stored as a tuple.
    """
    __role__ = Role.DESC
    __tags__ = POS.ADJ | POS.ADV
    __tokens__ = ("mod",)
    __slots__ = tuple(__tokens__)
    alias: ClassVar[str] = "Desc"

    def __init__(
        self,
        *args: Any,
        mod: Iterable[Token] = (),
        **kwds: Any
    ) -> None:
        super().__init__(*args, **kwds)
        # Materialize the iterable, so the slot always holds a tuple.
        self.mod = (*mod,)