Source code for segram.grammar.conjuncts

from typing import Any, Iterable, Self
from ..nlp.tokens import Token
from ..datastruct import DataTuple
from ..utils.misc import stringify



[docs]
class PhraseGroup(DataTuple):
    """Group of phrases stored as a :class:`DataTuple`."""

    @property
    def conjs(self) -> DataTuple["Conjuncts"]:
        """Chain of conjunct groups built from the phrases in this group.

        Delegates to :meth:`Conjuncts.get_chain`, passing the group itself
        as the iterable of phrases.
        """
        chain = Conjuncts.get_chain(self)
        return chain




[docs]
class Conjuncts(DataTuple):
    """Group of conjoined phrases.

    Attributes
    ----------
    members
        Conjoined phrases.
    lead
        Lead component, i.e. the member found at the stored lead index.
    cconj
        Conjunction token.
    preconj
        Preconjunction token.
    """
    # Names of the attributes holding conjunction tokens; see ``cconjs``.
    __cconjs__ = ("cconj", "preconj")

    def __init__(
        self,
        members: Iterable["Phrase"] = (),
        *,
        lead: int = 0,
        cconj: Token | None = None,
        preconj: Token | None = None
    ) -> None:
        """Initialize the group.

        Parameters
        ----------
        members
            Conjoined phrases.
        lead
            Position of the lead phrase within ``members``.
        cconj
            Conjunction token, if any.
        preconj
            Preconjunction token, if any.
        """
        # NOTE(review): ``members`` is not used here; presumably it is
        # consumed by ``DataTuple.__new__`` -- confirm against the base class.
        # pylint: disable=unused-argument
        super().__init__()
        self._lead = lead
        self.cconj = cconj
        self.preconj = preconj

    def __repr__(self) -> str:
        return self.to_str(color=True)

    def __hash__(self) -> int:
        return hash(self.hashdata)

    # Properties --------------------------------------------------------------

    @property
    def members(self) -> tuple["Phrase", ...]:
        """Conjoined phrases as a plain tuple."""
        return tuple(self)

    @property
    def lead(self) -> Any:
        """Lead member, i.e. the member at the stored lead index."""
        return self.members[self._lead]

    @property
    def cconjs(self) -> tuple[Any, ...]:
        """Values of the attributes listed in ``__cconjs__``."""
        return tuple(getattr(self, name) for name in self.__cconjs__)

    @property
    def hashdata(self) -> tuple[Any, ...]:
        """Data used for hashing: members, lead index and conjunctions."""
        return (tuple(self), self._lead, tuple(self.cconjs))

    @property
    def data(self) -> dict[str, Any]:
        """Dictionary with members, lead member and conjunction tokens.

        Note that ``"lead"`` holds the lead member itself,
        not its integer position.
        """
        return {
            "members": self.members,
            "lead": self.lead,
            "cconj": self.cconj,
            "preconj": self.preconj
        }

    # Methods -----------------------------------------------------------------

    @classmethod
    def from_data(
        cls,
        sent: "Sent",
        data: dict[str, int | list[int] | None],
    ) -> Self:
        """Construct from data dictionary.

        Parameters
        ----------
        sent
            Sentence object. Its ``doc`` is used to look up the conjunction
            tokens and its ``grammar.pmap`` to look up the members.
        data
            Data dictionary with ``"lead"`` and ``"members"`` keys and
            optional ``"cconj"`` and ``"preconj"`` keys,
            as produced by :meth:`to_data`.
        """
        doc = sent.doc
        grammar = sent.grammar
        lead = data["lead"]
        cconj = data.get("cconj")
        pconj = data.get("preconj")
        members = [grammar.pmap[m] for m in data["members"]]
        # Conjunctions are stored as indices into ``doc``.
        if cconj is not None:
            cconj = doc[cconj]
        if pconj is not None:
            pconj = doc[pconj]
        return cls(members, lead=lead, cconj=cconj, preconj=pconj)

    def to_data(self) -> dict[str, int | list[int] | None]:
        """Dump to data dictionary.

        Returns
        -------
        data
            Dictionary with the list of ``idx`` values of the members,
            the lead index, and the ``i`` attributes of the conjunction
            and preconjunction tokens (``None`` when a token is missing).
        """
        return {
            "members": [comp.idx for comp in self.members],
            "lead": self._lead,
            "cconj": self.cconj.i if self.cconj is not None else None,
            "preconj": self.preconj.i if self.preconj is not None else None
        }

    def to_str(self, *, color: bool = False, **kwds: Any) -> str:
        """Represent as a string.

        The result has the form ``[cconj|preconj](member, member, ...)``;
        the bracketed prefix is omitted when there are no conjunctions.

        Parameters
        ----------
        color
            Passed to :func:`stringify`.
        **kwds
            Passed to :func:`stringify`.
        """
        coords = "|".join(
            stringify(c, color=color, **kwds)
            for c in self.cconjs if c
        ).strip()
        if coords:
            coords = f"[{coords}]"
        members = ", ".join(
            stringify(m, color=color, **kwds) for m in self.members
        )
        return f"{coords}({members})"

    def is_comparable_with(self, other: Any) -> bool:
        """Check whether ``other`` is an instance of :class:`Conjuncts`."""
        return isinstance(other, Conjuncts)

    @classmethod
    def find_groups(cls, phrases: Iterable["Phrase"]) -> Iterable[Self]:
        """Find conjuncts groups in ``phrases``.

        Phrases are bucketed by ``phrase.group.lead.idx``. A bucket with
        a single phrase yields a new one-member group; a larger bucket
        yields a copy of ``sent.conjs[lead_idx]`` (taken from the first
        phrase of the bucket) with members replaced by the bucket.
        Groups are yielded in the order in which their first phrase
        appears in ``phrases``.
        """
        groups = {}
        for phrase in phrases:
            groups.setdefault(phrase.group.lead.idx, []).append(phrase)
        # Every bucket is created together with its first phrase,
        # so no bucket can be empty here.
        for lead_idx, group in groups.items():
            if len(group) == 1:
                yield cls(group)
            else:
                # NOTE(review): the copy keeps the lead index of the source
                # group; when ``group`` is a proper subset of its members
                # that index may not point at the same phrase -- confirm.
                yield group[0].sent.conjs[lead_idx].copy(members=group)

    @classmethod
    def get_chain(cls, phrases: Iterable["Phrase"]) -> DataTuple["Conjuncts"]:
        """Get chain of conjuncts groups in ``phrases``."""
        return DataTuple(cls.find_groups(phrases))

    def copy(self, **kwds: Any) -> Self:
        """Copy the group, optionally overriding constructor arguments.

        Parameters
        ----------
        **kwds
            Overrides for ``members``, ``lead``, ``cconj`` and ``preconj``.
            ``lead`` is the integer position of the lead member.
        """
        # ``data["lead"]`` is the lead member itself, whereas the constructor
        # expects its integer position, so the stored index is passed along
        # unless the caller overrides it.
        kwds = {**self.data, "lead": self._lead, **kwds}
        members = kwds.pop("members", ())
        return self.__class__(members, **kwds)


# class PhraseGroup(DataChain):
#     """Phrase group class.

#     This is a chain of groups of conjoined phrases
#     enhanced with several methods for matching, grouping,
#     summarizing and aggregating information from phrases.
#     """
#     __slots__ = ()

#     def __init__(self, members: Iterable[DataTuple] = ()) -> None:
#         members = DataTuple(
#             Conjuncts(m) if not isinstance(m, Conjuncts) else m
#             for m in members
#         )
#         super().__init__(members)

#     def match(
#         self,
#         *args: Any,
#         require: Callable[Iterable["Phrase"], bool] = any,
#         **kwds: Any
#     ) -> bool:
#         """Match phrase group against a specification.

#         Parameters
#         ----------
#         *args, **kwds
#             Passed to :meth:`segram.grammar.Phrase`.
#         require
#             Function deciding whether the phrase group
#             after filtering satisfies the requirements.
#         """
#         return require(p.match(*args, **kwds) for p in self)

#     def group_by_doc(self) -> dict[str, PhraseGroup]:
#         """Group by documents."""
#         data = {}
#         for group in self.members:
#             data.setdefault(id(group.lead.doc), []).append(group)
#         final = {}
#         for v in data.values():
#             final[v[0].lead.doc.id] = self.__class__(sorted(v))
#         return final

#     def get_conjuncts(self) -> Self:
#         """Get non-trivial conjunct groups."""
#         return self.__class__([
#             m for m in self.members if len(m) > 1
#         ])

#     def group_by_head(
#         self,
#         *parts,
#         lemmatize: bool = True,
#         coref: bool = True,
#         pos: bool = True,
#         ent: bool = True,
#         lexeme: bool = True
#     ) -> dict[str, PhraseGroup]:
#         """Group phrases by head tokens.

#         Parameters
#         ----------
#         *parts
#             Names of the parts (e.g. ``"subj"`` or ``"xcomp"``)
#             to use. Use all parts if none are given.
#         lemmatize
#             Lemmatize token texts used as keys.
#         coref
#             Resolve coreferences (to the leading ref)
#             for use as keys.
#         pos
#             Add POS tags to keys.
#         ent
#             Add entity types to keys.
#         lexeme
#             Add ``"lexeme"`` field storing
#             lexeme objects corresponding to tokens.
#         """
#         # pylint: disable=too-many-locals
#         data = {}
#         for phrase in self:
#             tok = phrase.head.tok
#             if coref:
#                 tok = tok.coref
#             key = tok.lemma if lemmatize else tok.text
#             if pos or ent:
#                 key = (key,)
#                 if pos:
#                     key = (*key, tok.pos)
#                 if ent:
#                     key = (*key, tok.ent)
#             data \
#                 .setdefault(key, {}) \
#                 .setdefault("phrases", []).append(phrase)
#             rec = data[key]
#             if lexeme and (lkey := "lexeme") not in rec \
#             and (vocab := getattr(tok, "vocab", None)):
#                 rec[lkey] = vocab[key[0] if isinstance(key, tuple) else key]
#             for name in phrase.part_names:
#                 if parts and name not in parts:
#                     continue
#                 for part in getattr(phrase, name, ()):
#                     rec.setdefault(name, []).append(part)
#         for key in data:
#             data[key] = { k: v for k, v in data[key].items() if v }
#         return data