Source code for segram.nlp.tokens.doc

from typing import Any, Iterable, Self
import json
from murmurhash import hash_unicode
from spacy.tokens import Doc as SpacyDoc
from spacy.tokens import Span as SpacySpan
from spacy.tokens import Token as SpacyToken
from .abc import NLP
from .token import Token
from .span import Span
from ... import __title__
from ...utils.registries import grammars
from ...utils.diff import iter_diffs, equal, IDiffType


class Doc(NLP):
    """Enhanced document class.

    Wraps the object stored in ``self.tok`` and exposes tokens, spans and
    sentences as enhanced objects obtained through ``self.sns``.

    NOTE(review): ``tok``, ``sns``, ``alias``, ``vocab`` and ``_`` come from
    the ``NLP`` base class, which is not visible here; ``tok`` is presumably
    a spaCy ``Doc`` -- confirm against ``.abc``.
    """
    __slots__ = ("_id",)

    def __init__(self, *args: Any, **kwds: Any) -> None:
        """Forward all arguments to the base class and reset the id cache."""
        super().__init__(*args, **kwds)
        # Lazily computed by the ``id`` property.
        self._id = None

    def __hash__(self) -> int:
        # Defining ``__eq__`` would otherwise make the class unhashable,
        # so the base class hash is restored explicitly.
        return super().__hash__()

    def __eq__(self, other: Self) -> bool:
        """Compare the underlying ``tok`` objects of two documents.

        Returns ``NotImplemented`` when ``other`` is not a :class:`Doc`.
        """
        if isinstance(other, Doc):
            return self.tok == other.tok
        return NotImplemented

    def __iter__(self) -> Iterable[Token]:
        """Yield every element of ``self.tok`` passed through ``self.sns``."""
        for tok in self.tok:
            yield self.sns(tok)

    def __len__(self) -> int:
        """Return the length of ``self.tok``."""
        return len(self.tok)

    def __getitem__(self, idx: int | slice) -> Token | Span:
        """Index or slice ``self.tok`` and pass the result through ``self.sns``."""
        return self.sns(self.tok[idx])

    def __contains__(self, other: Token | SpacyToken | Span | SpacySpan) -> bool:
        """Check whether a token or span belongs to ``self.tok``.

        Raises
        ------
        TypeError
            If ``other`` is neither a spaCy token/span
            nor an enhanced token/span.
        """
        if isinstance(other, SpacyToken | SpacySpan):
            return other in self.tok
        if isinstance(other, Token | Span):
            # Enhanced objects are compared through their wrapped ``tok``.
            return other.tok in self.tok
        ocn = other.__class__.__name__
        scn = self.__class__.__name__
        raise TypeError(f"'{scn}' cannot contain '{ocn}' objects")

    # Properties --------------------------------------------------------------

    @property
    def doc(self) -> Self:
        """The document itself."""
        return self

    @property
    def lang(self) -> str:
        """Value of ``lang_`` of the underlying ``tok``."""
        return self.tok.lang_

    @property
    def id(self) -> int:
        """Hash id of the document tokenization.

        Computed on first access by hashing a compact JSON dump
        (sorted keys) of :attr:`coredata` and cached in ``_id``.
        """
        if self._id is None:
            string = json.dumps(
                self.coredata,
                check_circular=False,
                indent=None,
                separators=(",", ":"),
                sort_keys=True
            )
            self._id = hash_unicode(string)
        return self._id

    @property
    def coredata(self) -> dict[str, Any]:
        """Dictionary with ``"meta"`` and ``"data"`` entries.

        ``"meta"`` is a copy of the ``<alias>_meta`` extension attribute
        and ``"data"`` is :attr:`data` restricted to the fields listed below.
        """
        fields = (
            "words", "spaces", "tags", "pos", "morphs",
            "lemmas", "heads", "deps", "ents"
        )
        meta = getattr(self._, f"{self.alias}_meta").copy()
        data = {
            k: v for k, v in self.data.items()
            if k in fields
        }
        return {
            "meta": meta,
            "data": data
        }

    @property
    def noun_chunks(self) -> Iterable[Span]:
        """Yield noun chunks of ``self.tok`` passed through ``self.sns``."""
        for chunk in self.tok.noun_chunks:
            yield self.sns(chunk)

    @property
    def sents(self) -> Iterable[Span]:
        """Yield sentences of ``self.tok``, skipping whitespace-only ones."""
        for sent in self.tok.sents:
            if sent.text.strip():
                yield self.sns(sent)

    @property
    def data(self) -> dict[str, Any]:
        """Shortcut for :meth:`to_data`."""
        return self.to_data()

    @property
    def grammar(self) -> "Doc":
        """Grammar document associated with this document.

        Resolution order: the ``<alias>_doc`` extension attribute if truthy,
        otherwise a grammar document rebuilt from ``<alias>_data`` if truthy,
        otherwise a new grammar document constructed from ``self``.
        """
        alias = self.alias
        if (doc := getattr(self._, f"{alias}_doc")):
            return doc
        typ = self.get_grammar_type()
        if (data := getattr(self._, f"{alias}_data")):
            return typ.types.Doc.from_data(self, data)
        return typ.types.Doc(self)

    # Methods -----------------------------------------------------------------

    @staticmethod
    def clear_user_data(user_data: dict):
        """Clear user data from cached :mod:`segram` objects.

        The dictionary is modified in place and also returned:
        every entry whose key contains ``"_<alias>_sns"`` is set to ``None``,
        and so is the ``<alias>_doc`` extension entry.
        """
        alias = user_data[("._.", __title__+"_alias", None, None)]
        _alias = "_"+alias+"_sns"
        # Only values of existing keys are reassigned, so the dictionary
        # size does not change while it is being iterated.
        for k, v in user_data.items():
            user_data[k] = v if _alias not in k else None
        user_data[("._.", f"{alias}_doc", None, None)] = None
        return user_data

    def to_data(self) -> dict[str, Any]:
        """Dump to data dictionary sufficient to recreate
        simple document without any language model data.
        """
        data = {
            "vocab": self.vocab,
            "words": [ t.text for t in self ],
            "spaces": [ t.whitespace for t in self ],
            "tags": [ t.tag_ for t in self.tok ],
            "pos": [ t.pos_ for t in self.tok ],
            "morphs": [ str(t.morph) for t in self.tok ],
            "lemmas": [ t.lemma_ for t in self.tok ],
            "heads": [ t.head.i for t in self.tok ],
            "deps": [ t.dep_ for t in self.tok ],
            "ents": [ f"{t.ent_tag}" for t in self ]
        }
        # A copy is cleared so that ``self.tok.user_data`` itself keeps
        # its cached objects.
        data["user_data"] = self.clear_user_data(self.tok.user_data.copy())
        return data

    @classmethod
    def from_data(cls, data: dict[str, Any]) -> Self:
        """Construct from data dictionary produced by :meth:`to_data`.

        ``data`` is passed as keyword arguments to the spaCy ``Doc``
        constructor and the ``<alias>_sns`` extension attribute
        of the resulting document is returned.
        """
        alias = data["user_data"][("._.", __title__+"_alias", None, None)]
        return getattr(SpacyDoc(**data)._, alias+"_sns")

    def char_span(self, *args: Any, **kwds: Any) -> Span | None:
        """Call ``char_span`` of ``self.tok`` with the given arguments.

        Returns ``None`` when the underlying call returns ``None``,
        otherwise the result passed through ``self.sns``.
        """
        res = self.tok.char_span(*args, **kwds)
        return res if res is None else self.sns(res)

    @classmethod
    def from_docs(cls, *args: Any, **kwds: Any) -> Self | None:
        """Concatenate documents using spaCy's ``Doc.from_docs``.

        All arguments are forwarded unchanged. Returns ``None`` when
        the spaCy call returns ``None``.
        """
        # The spaCy implementation has to be called here. Referring to
        # ``Doc`` would resolve to this very class and recurse forever.
        res = SpacyDoc.from_docs(*args, **kwds)
        # NOTE(review): assumes ``sns`` can be called on the class with
        # a single argument -- it is defined on ``NLP``; TODO confirm.
        return res if res is None else cls.sns(res)

    def get_grammar_type(self):
        """Look up the grammar registered under the key stored
        in the ``<alias>_grammar`` entry of the ``<alias>_meta`` attribute.
        """
        alias = self.alias
        key = getattr(self._, f"{alias}_meta")[f"{alias}_grammar"]
        return grammars.get(key)

    def copy(self) -> Self:
        """Copy the document with a :meth:`to_data`/:meth:`from_data` round trip."""
        return self.from_data(self.to_data())
# Register comparison functions for testing -----------------------------------

@equal.register
def _(obj: Doc, other: Doc, *, strict: bool = True) -> bool:
    """Compare two documents.

    With ``strict`` the documents are compared with ``==``,
    otherwise only their ``id`` hashes are compared.
    """
    if strict:
        return obj == other
    return obj.id == other.id


@iter_diffs.register
def _(obj: Doc, other: Doc, *, strict: bool = True) -> IDiffType:
    """Yield a single ``(message, obj, other)`` triple when documents differ.

    The message is ``"DOCUMENT TYPE"`` when both ids match
    and ``"DOCUMENT CONTENT"`` otherwise. Nothing is yielded
    for documents considered equal.
    """
    if equal(obj, other, strict=strict):
        return
    label = "DOCUMENT TYPE" if obj.id == other.id else "DOCUMENT CONTENT"
    yield label, obj, other