Source code for segram.grammar.conjuncts

from typing import Any, Iterable, Self
from ..nlp.tokens import Token
from ..datastruct import DataTuple
from ..utils.misc import stringify


class PhraseGroup(DataTuple):
    """Group of phrases."""

    @property
    def conjs(self) -> DataTuple["Conjuncts"]:
        """Conjunct groups obtained by chaining the phrases of this group.

        Delegates to :meth:`Conjuncts.get_chain`, passing the group itself
        as the iterable of phrases.
        """
        chain = Conjuncts.get_chain(self)
        return chain
class Conjuncts(DataTuple):
    """Group of conjoined phrases.

    Attributes
    ----------
    members
        Conjoined phrases.
    lead
        Lead component.
    cconj
        Conjunction token.
    preconj
        Preconjunction token.
    """
    __cconjs__ = ("cconj", "preconj")

    def __init__(
        self,
        members: Iterable["Phrase"] = (),
        *,
        lead: int = 0,
        cconj: Token | None = None,
        preconj: Token | None = None
    ) -> None:
        # ``members`` is not used here; presumably it is consumed by the
        # base class when the tuple itself is created -- TODO confirm.
        # pylint: disable=unused-argument
        super().__init__()
        self._lead = lead
        self.cconj = cconj
        self.preconj = preconj

    def __repr__(self) -> str:
        return self.to_str(color=True)

    def __hash__(self) -> int:
        return hash(self.hashdata)

    # Properties --------------------------------------------------------------

    @property
    def members(self) -> tuple["Phrase", ...]:
        """Conjoined phrases as a plain tuple."""
        return tuple(self)

    @property
    def lead(self) -> Any:
        """Lead member, i.e. the member at the stored lead index."""
        return self.members[self._lead]

    @property
    def cconjs(self) -> tuple[Any, ...]:
        """Values of the attributes listed in ``__cconjs__``."""
        return tuple(getattr(self, name) for name in self.__cconjs__)

    @property
    def hashdata(self) -> tuple[Any, ...]:
        """Data used for hashing: members, lead index and conjunction tokens."""
        return (tuple(self), self._lead, tuple(self.cconjs))

    @property
    def data(self) -> dict[str, Any]:
        """Dictionary with members, lead member and conjunction tokens.

        Note that ``"lead"`` holds the lead member itself,
        not its integer position among the members.
        """
        return {
            "members": self.members,
            "lead": self.lead,
            "cconj": self.cconj,
            "preconj": self.preconj
        }

    # Methods -----------------------------------------------------------------

    @classmethod
    def from_data(
        cls,
        sent: "Sent",
        data: dict[str, int | list[int] | None],
    ) -> Self:
        """Construct from data dictionary.

        Parameters
        ----------
        sent
            Sentence object. Its ``doc`` is used to look up
            conjunction tokens and its ``grammar.pmap``
            to look up member phrases.
        data
            Data dictionary with ``"members"`` (keys of ``grammar.pmap``),
            ``"lead"`` (passed on as the lead index) and optional
            ``"cconj"`` and ``"preconj"`` (indices into ``sent.doc``).
        """
        doc = sent.doc
        grammar = sent.grammar
        members = [grammar.pmap[m] for m in data["members"]]
        cconj = data.get("cconj")
        preconj = data.get("preconj")
        if cconj is not None:
            cconj = doc[cconj]
        if preconj is not None:
            preconj = doc[preconj]
        return cls(members, lead=data["lead"], cconj=cconj, preconj=preconj)

    def to_data(self) -> dict[str, int | list[int] | None]:
        """Dump to data dictionary.

        Returns
        -------
        data
            Dictionary with the list of members' ``idx`` values,
            the lead index and the ``i`` attributes of the conjunction
            and preconjunction tokens (``None`` when a token is not set).
        """
        # Explicit ``None`` checks mirror :meth:`from_data` and do not
        # depend on the truthiness of token objects.
        cconj = self.cconj.i if self.cconj is not None else None
        preconj = self.preconj.i if self.preconj is not None else None
        return {
            "members": [comp.idx for comp in self.members],
            "lead": self._lead,
            "cconj": cconj,
            "preconj": preconj
        }

    def to_str(self, *, color: bool = False, **kwds: Any) -> str:
        """Represent as ``[<conjunctions>](<members>)`` string.

        The bracketed prefix joins the truthy conjunction tokens with ``|``
        and is omitted when the joined text is empty.
        ``color`` and ``**kwds`` are passed to ``stringify``.
        """
        coords = "|".join(
            stringify(c, color=color, **kwds)
            for c in self.cconjs if c
        ).strip()
        if coords:
            coords = f"[{coords}]"
        members = ", ".join(
            stringify(m, color=color, **kwds) for m in self.members
        )
        return f"{coords}({members})"

    def is_comparable_with(self, other: Any) -> bool:
        """Check whether ``other`` is a :class:`Conjuncts` instance."""
        return isinstance(other, Conjuncts)

    @classmethod
    def find_groups(cls, phrases: Iterable["Phrase"]) -> Iterable[Self]:
        """Find conjuncts groups in ``phrases``.

        Phrases are grouped by ``phrase.group.lead.idx``. A phrase that is
        alone in its group yields a fresh single-member instance. Otherwise
        the conjuncts stored in ``sent.conjs`` under the lead index are
        copied with the members replaced by the phrases that were found.
        """
        groups = {}
        for phrase in phrases:
            groups.setdefault(phrase.group.lead.idx, []).append(phrase)
        # Every list in ``groups`` has at least one phrase by construction.
        for lead_idx, group in groups.items():
            if len(group) == 1:
                yield cls(group)
            else:
                yield group[0].sent.conjs[lead_idx].copy(members=group)

    @classmethod
    def get_chain(cls, phrases: Iterable["Phrase"]) -> DataTuple["Conjuncts"]:
        """Get chain of conjuncts groups in ``phrases``."""
        return DataTuple(cls.find_groups(phrases))

    def copy(self, **kwds: Any) -> Self:
        """Copy the instance, optionally overriding constructor arguments.

        Parameters
        ----------
        **kwds
            Overrides for ``members``, ``lead``, ``cconj`` and ``preconj``.

        Notes
        -----
        ``data["lead"]`` is the lead member object, whereas the constructor
        expects an integer index, so the stored index is passed instead
        unless ``lead`` is overridden explicitly.
        """
        # NOTE(review): the lead index is carried over unchanged even when
        # ``members`` is replaced; callers that reorder or subset members
        # may need to pass ``lead`` explicitly.
        kwds = {**self.data, "lead": self._lead, **kwds}
        members = kwds.pop("members", ())
        return self.__class__(members, **kwds)
# class PhraseGroup(DataChain): # """Phrase group class. # This is a chain of groups of conjoined phrases # enhanced with several methods for matching, grouping, # summarizing and aggregating information from phrases. # """ # __slots__ = () # def __init__(self, members: Iterable[DataTuple] = ()) -> None: # members = DataTuple( # Conjuncts(m) if not isinstance(m, Conjuncts) else m # for m in members # ) # super().__init__(members) # def match( # self, # *args: Any, # require: Callable[Iterable["Phrase"], bool] = any, # **kwds: Any # ) -> bool: # """Match phrase group against a specification. # Parameters # ---------- # *args, **kwds # Passed to :meth:`segram.grammar.Phrase`. # require # Function deciding whether the phrase group # after filtering satisfies the requirements. # """ # return require(p.match(*args, **kwds) for p in self) # def group_by_doc(self) -> dict[str, PhraseGroup]: # """Group by documents.""" # data = {} # for group in self.members: # data.setdefault(id(group.lead.doc), []).append(group) # final = {} # for v in data.values(): # final[v[0].lead.doc.id] = self.__class__(sorted(v)) # return final # def get_conjuncts(self) -> Self: # """Get non-trivial conjunct groups.""" # return self.__class__([ # m for m in self.members if len(m) > 1 # ]) # def group_by_head( # self, # *parts, # lemmatize: bool = True, # coref: bool = True, # pos: bool = True, # ent: bool = True, # lexeme: bool = True # ) -> dict[str, PhraseGroup]: # """Group by phrases by head tokens. # Parameters # ---------- # *parts # Names of the parts (e.g. ``"subj"`` or ``"xcomp"``) # to use. Use all parts if ``None``. # lemmatize # Lemmatize token texts used as keys. # coref # Resolve coreferences (to the leading ref) # for use as keys. # pos # Add POS tags to keys. # ent # Add entity types to keys. # lexeme # Add ``"lexeme"`` field storing # lexeme objects corresponding to tokens. 
# """ # # pylint: disable=too-many-locals # data = {} # for phrase in self: # tok = phrase.head.tok # if coref: # tok = tok.coref # key = tok.lemma if lemmatize else tok.text # if pos or ent: # key = (key,) # if pos: # key = (*key, tok.pos) # if ent: # key = (*key, tok.ent) # data \ # .setdefault(key, {}) \ # .setdefault("phrases", []).append(phrase) # rec = data[key] # if lexeme and (lkey := "lexeme") not in rec \ # and (vocab := getattr(tok, "vocab", None)): # rec[lkey] = vocab[key[0] if isinstance(key, tuple) else key] # for name in phrase.part_names: # if parts and name not in parts: # continue # for part in getattr(phrase, name, ()): # rec.setdefault(name, []).append(part) # for key in data: # data[key] = { k: v for k, v in data[key].items() if v } # return data