Source code for segram.grammar.sent

from typing import Any, ClassVar, Iterator, Mapping, Self

from .conjuncts import PhraseGroup, Conjuncts
from .abc import SentElement
from .components import Component
from .components import Verb, Noun, Prep, Desc
from .phrases import Phrase
from .graph import PhraseGraph
from ..settings import settings
from ..nlp.tokens import Doc, Span, Token
from ..symbols import Role
from ..abc import labelled
from ..utils.misc import sort_map
from ..datastruct import DataTuple


# Type alias used to annotate the phrase-collection properties of ``Sent``.
# NOTE(review): ``Sent.sources`` and ``Sent.phrases`` are annotated with this
# alias but return ``PhraseGroup`` instances -- confirm which type is intended.
PVType = PhraseGraph[Phrase]
# Decorator stacked under ``@property`` on ``Sent.verbs``, ``nouns``, ``preps``
# and ``descs``. Presumably it tags them with the "component" label so they can
# be collected into ``component_names`` -- TODO confirm against ``..abc.labelled``.
component_ = labelled("component")


class Sent(SentElement):
    """Grammar sentence class.

    Components within a sentence form a directed acyclic graph
    with connections going from controlling to dependent components.

    Attributes
    ----------
    cmap
        Mapping from head token ids to components.
    pmap
        Mapping from head token ids to phrases.
    graph
        Component graph.
    conjs
        Mapping from ids of lead components to conjunct groups.
    """
    # pylint: disable=too-many-public-methods
    __slots__ = ("graph", "conjs", "cmap", "pmap")
    alias = "Sent"
    component_names: ClassVar[tuple[str, ...]] = ()

    def __init__(
        self,
        sent: Span,
        *,
        cmap: Mapping[int, Component] | None = None,
        pmap: Mapping[int, Phrase] | None = None,
        graph: PhraseGraph[Phrase, tuple[Phrase, ...]] | None = None,
        conjs: Mapping[int, Conjuncts] | None = None
    ) -> None:
        """Initialize from a sentence span and optional grammar data.

        Parameters
        ----------
        sent
            Sentence span passed on to the parent initializer.
        cmap, pmap
            Component and phrase mappings; missing or empty values
            become empty mappings. Both are passed through ``sort_map``.
        graph
            Phrase graph; stored as given (may be ``None``).
        conjs
            Conjunct groups; a missing or empty value becomes ``{}``.
        """
        super().__init__(sent)
        self.cmap = sort_map(cmap or {})
        self.pmap = sort_map(pmap or {})
        self.graph = graph
        self.conjs = conjs or {}

    def __new__(cls, *args: Any, **kwds: Any) -> Self:
        """Create a sentence or refresh the one cached on the document.

        A fresh instance is initialized eagerly so that its ``idx`` and
        ``doc`` can be read. If ``doc.smap`` already holds a truthy entry
        under that ``idx``, the cached object is re-initialized from
        ``obj.data`` and returned instead; otherwise the new instance
        is stored in the cache and returned.

        Note that the regular construction protocol calls ``__init__``
        once more with the original arguments on the returned object
        (when it is an instance of ``cls``).
        """
        obj = super().__new__(cls)
        # ``idx`` and ``doc`` are only available after initialization.
        obj.__init__(*args, **kwds)
        idx = obj.idx
        cache = obj.doc.smap
        if (cur := cache.get(idx)):
            cur.__init__(**obj.data)
            return cur
        cache[idx] = obj
        return obj

    def __len__(self) -> int:
        """Number of tokens in the underlying sentence span."""
        return len(self.sent)

    # Properties --------------------------------------------------------------

    @property
    def root(self) -> Component:
        """Root component."""
        return self.cmap[super().root.i]

    @property
    def proots(self) -> Conjuncts[Phrase]:
        """Root phrases."""
        return self.root.phrase.group

    @property
    def sources(self) -> PVType:
        """Phrase group built from the sources of the graph."""
        return PhraseGroup(self.graph.sources)

    @property
    @component_
    def verbs(self) -> DataTuple[Verb]:
        """Verb components."""
        return self.components.filter(lambda c: isinstance(c, Verb)).tuple

    @property
    @component_
    def nouns(self) -> DataTuple[Noun]:
        """Noun components."""
        return self.components.filter(lambda c: isinstance(c, Noun)).tuple

    @property
    @component_
    def preps(self) -> DataTuple[Prep]:
        """Prepositional components."""
        return self.components.filter(lambda c: isinstance(c, Prep)).tuple

    @property
    @component_
    def descs(self) -> DataTuple[Desc]:
        """Descriptive components."""
        return self.components.filter(lambda c: isinstance(c, Desc)).tuple

    @property
    def tokens(self) -> DataTuple[Token]:
        """Tokens of the sentence span."""
        return DataTuple(tuple(self.sent))

    @property
    def components(self) -> DataTuple[Component]:
        """All components, i.e. the values of ``cmap``."""
        return DataTuple(self.cmap.values())

    @property
    def phrases(self) -> PVType:
        """Phrase group built from the values of ``pmap``."""
        return PhraseGroup(self.pmap.values())

    @property
    def coverage(self) -> float:
        """Number of token-role pairs divided by the sentence length."""
        return sum(1 for _ in self.iter_token_roles()) / len(self.sent)

    # Methods -----------------------------------------------------------------

    def similarity(self, *args: Any, **kwds: Any) -> float:
        """Structured similarity with respect to other sentence or phrase.

        All arguments are forwarded to ``self.Similarity``.
        """
        return self.Similarity(self, *args, **kwds).similarity

    @classmethod
    def from_data(cls, doc: Doc, data: dict[str, Any]) -> Self:
        """Construct from a :class:`~segram.nlp.Doc` and a data dictionary.

        Parameters
        ----------
        doc
            Document the sentence belongs to.
        data
            Dictionary with ``"start"``, ``"end"``, ``"cmap"``, ``"pmap"``,
            ``"graph"`` and ``"conjs"`` keys, as produced by
            :meth:`to_data`. The dictionary itself is not modified.
        """
        # pylint: disable=protected-access
        data = data.copy()  # shallow copy, so the pops below do not leak out
        sent = doc[data.pop("start"):data.pop("end")]
        data["cmap"] = {
            idx: cls.types.Component.from_data(doc, dct)
            for idx, dct in data["cmap"].items()
        }
        data["pmap"] = {
            idx: cls.types.Phrase.from_data(doc, dct)
            for idx, dct in data["pmap"].items()
        }
        data["graph"] = PhraseGraph.from_data(sent, data["graph"])
        # Conjunct groups are serialized as a list and keyed back
        # by the id of their lead.
        conjs = {}
        for dct in data["conjs"]:
            conj = Conjuncts.from_data(sent, dct)
            conjs[conj.lead.idx] = conj
        data["conjs"] = conjs
        return cls(sent, **data)

    def to_data(self) -> dict[str, Any]:
        """Dump to data dictionary."""
        return {
            "start": self.start,
            "end": self.end,
            "cmap": {idx: c.to_data() for idx, c in self.cmap.items()},
            "pmap": {idx: p.to_data() for idx, p in self.pmap.items()},
            "graph": self.graph.to_data(),
            "conjs": [c.to_data() for c in self.conjs.values()],
        }

    def iter_token_roles(self) -> Iterator[tuple[Token, Role | None]]:
        """Iterate over token-role pairs.

        Every subtoken of every component is yielded once, paired with
        the role of the first component in which it was encountered.
        Pairs are yielded sorted by token.
        """
        def _iter():
            seen = set()
            for comp in self.components:
                for tok in comp.subtokens:
                    if tok not in seen:
                        seen.add(tok)
                        yield tok, comp.role
        yield from sorted(_iter(), key=lambda x: x[0])

    def to_str(self, *, color: bool = False, **kwds: Any) -> str:
        """Represent as a string.

        Concatenates the string form of every token yielded by
        :meth:`iter_token_roles`, each followed by its whitespace.
        Extra keyword arguments are accepted and ignored.
        """
        # pylint: disable=unused-argument
        return "".join(
            tok.to_str(color=color, role=role) + tok.whitespace
            for tok, role in self.iter_token_roles()
        )

    def is_comparable_with(self, other: Any) -> bool:
        """Check whether ``other`` is also a :class:`Sent`."""
        return isinstance(other, Sent)

    @staticmethod
    def check_sent(span: Span) -> bool:
        """Check if a span is a proper sentence.

        Returns
        -------
        bool
            ``True`` when the check passes.

        Raises
        ------
        ValueError
            If ``span`` differs from the sentence of its first token.
        """
        if span != span[0].sent:
            raise ValueError("'span' must be a proper sentence")
        return True

    def print(self) -> None:
        """Pretty print summary."""
        # pylint: disable=too-many-branches,not-an-iterable
        # Section headings; any field not listed here (e.g. "graph")
        # is printed under "Component structure".
        headings = {
            "nouns": "Noun components",
            "verbs": "Verb components",
            "preps": "Prepositional components",
            "descs": "Descriptive components",
            "conjs": "Conjunct groups",
        }
        msg = settings.printer.get()
        print(msg.color(self.sent.text, bold=True), end="\n")
        for field in (*self.component_names, "graph", "conjs"):
            vals = getattr(self, field)
            if field in ("start", "end"):
                continue
            if not vals:
                # Empty sections are skipped entirely, heading included.
                continue
            print(msg.divider(headings.get(field, "Component structure")))
            if field == "graph":
                vals.print()
            elif field == "conjs":
                for obj in vals.values():
                    print(obj)
            else:
                for comp in vals:
                    print(f"{comp.idx}:", comp)