# Source code for segram.grammar.doc

# pylint: disable=no-name-in-module
from typing import Any, Self, Mapping
import numpy as np
from spacy.tokens import Doc as SpacyDoc, Token
from spacy.vocab import Vocab
from .abc import DocElement
from .sent import Sent
from .phrases import Phrase
from .components import Component
from ..nlp.tokens import Doc as DocNLP
from .. import __title__
from ..utils.misc import sort_map
from ..datastruct import DataIterator, DataTuple


class Doc(DocElement):
    """Grammar document class.

    This is the grammar equivalent of NLP documents.

    Attributes
    ----------
    doc
        Underlying NLP document.
    smap
        Mapping from sentence ids to sentences.
    """
    __slots__ = ("smap",)
    alias = "Doc"

    def __init__(
        self,
        doc: DocNLP | SpacyDoc,
        smap: Mapping[tuple[int, int], Sent] | None = None
    ) -> None:
        """Initialize from an NLP document.

        Parameters
        ----------
        doc
            NLP document or a raw spaCy document. A raw spaCy document
            is replaced by the object stored in its ``<alias>_sns``
            extension attribute, where ``<alias>`` is read from the
            ``<__title__>_alias`` extension attribute.
        smap
            Optional mapping from sentence ids to sentences.
            When ``None`` it is built from ``doc.sents``, keeping only
            sentences with non-blank text that are flagged ``is_correct``.
        """
        alias = getattr(doc._, __title__+"_alias")
        if isinstance(doc, SpacyDoc):
            doc = getattr(doc._, alias+"_sns")
        # Register this grammar document on the NLP document.
        setattr(doc._, alias+"_doc", self)
        super().__init__(doc)
        if smap is None:
            # Little trick to make 's.grammar' work:
            # 'smap' has to exist before sentences are requested below
            # (presumably 's.grammar' reads it from this instance -- confirm).
            self.smap = {}
            smap = {
                sent.idx: sent for s in doc.sents
                if (sent := s.grammar).text.strip() and sent.is_correct
            }
        self.smap = sort_map(smap)

    # Properties --------------------------------------------------------------

    @property
    def sents(self) -> DataTuple[Sent]:
        """Sentences in the document."""
        return DataTuple(self.smap.values())

    @property
    def phrases(self) -> DataIterator[Phrase]:
        """Phrases in the document, collected sentence by sentence."""
        return DataIterator(s.phrases for s in self.sents).flat

    @property
    def components(self) -> DataIterator[Component]:
        """Unique components by sentences."""
        return DataIterator(s.components for s in self.sents).flat

    @property
    def tokens(self) -> DataTuple[Token]:
        """Items obtained by iterating over the underlying NLP document."""
        return DataTuple(self.doc)

    @property
    def has_vectors(self) -> bool:
        """Check if document is equipped with word vectors."""
        return self.doc.has_vectors

    @property
    def vocab(self) -> Vocab:
        """Vocabulary of the underlying NLP document."""
        return self.doc.vocab

    @property
    def vector(self) -> np.ndarray[tuple[int], np.floating]:
        """Vector of the underlying NLP document."""
        return self.doc.vector

    # Methods -----------------------------------------------------------------

    def is_comparable_with(self, other: Any) -> bool:
        """Check whether ``other`` is a grammar document."""
        return isinstance(other, Doc)

    def to_str(self, **kwds: Any) -> str:
        """Represent as string.

        Sentence strings are joined with single spaces.

        Parameters
        ----------
        **kwds
            Passed to ``to_str()`` of every sentence.
        """
        return " ".join(s.to_str(**kwds) for s in self.sents)

    def to_data(self) -> dict[str, Any]:
        """Dump to data dictionary.

        Returns
        -------
        dict
            Mapping from sentence ``idx`` values to the data
            returned by ``to_data()`` of the corresponding sentences.
        """
        return {s.idx: s.to_data() for s in self.sents}

    def copy(self) -> Self:
        """Copy by rebuilding the document with :meth:`from_data`."""
        # pylint: disable=arguments-differ
        return self.from_data(self.doc.doc, self.to_data())

    @classmethod
    def from_data(cls, doc: DocNLP, data: dict[str, Any]) -> Self:
        """Construct from NLP document and data dictionary.

        Sentence data is read from the ``<doc.alias>_data`` extension
        attribute of ``doc`` and every entry is turned into a sentence
        with ``Sent.from_data()`` and stored in ``smap`` under its key.

        Parameters
        ----------
        doc
            NLP document.
        data
            Data dictionary.
            NOTE(review): this argument is currently not read; the data
            stored on ``doc`` is used instead -- confirm this is intended.
        """
        smap = getattr(doc._, f"{doc.alias}_data")
        obj = cls(doc)
        for idx, dct in smap.items():
            obj.smap[idx] = obj.types.Sent.from_data(obj, dct)
        return obj

    @classmethod
    def from_doc(cls, doc: DocNLP, *args: Any, **kwds: Any) -> Self:
        """Construct from NLP document object.

        Parameters
        ----------
        doc
            NLP document or a raw spaCy document. A raw spaCy document
            is replaced by the object stored in its ``<alias>_sns``
            extension attribute.
        *args, **kwds
            Passed to the constructor of the grammar document class
            selected by ``doc.get_grammar_type()``.
        """
        if isinstance(doc, SpacyDoc):
            # Raw spaCy documents carry custom data only under 'doc._',
            # so the alias is resolved the same way as in '__init__'.
            alias = getattr(doc._, __title__+"_alias")
            doc = getattr(doc._, alias+"_sns")
        typ = doc.get_grammar_type()
        return typ.types.Doc(doc, *args, **kwds)