# Source code for segram.nlp.corpus

# pylint: disable=no-name-in-module
from typing import Any, Iterable, Self, Literal, Mapping
import os
import pickle
from collections import Counter
from importlib import import_module
from spacy.tokens import Doc as SpacyDoc, DocBin
from spacy.language import Language
from spacy.vocab import Vocab
from tqdm.auto import tqdm
from .tokens import Doc, Token
from ..datastruct import DataIterator, DataTuple
from ..nlp.pipeline.base import Segram
from ..utils.misc import prefer_gpu_vectors, ensure_cpu_vectors
from .. import __title__



[docs]
class Corpus(Mapping):
    """Corpus class.

    A mapping whose values are the documents added through
    :meth:`add_doc`, stored under their document ids.

    Attributes
    ----------
    vocab
        Vocabulary object passed at initialization.
    nlp
        Language model used for parsing documents passed as strings.
        May be ``None``.
    token_dist
        Token distribution.
    count_method
        What is counted: raw words, lowercased words or lemmas
        (one of ``"words"``, ``"lower"``, ``"lemmas"``).
    resolve_coref
        If ``True`` then token coreferences are resolved when
        calculating token text and lemma frequency distributions.
    meta
        Metadata taken from the first added document
        (``None`` as long as no document has been added).
    """
    # Allowed values of ``count_method``.
    _count_vals = ("words", "lower", "lemmas")
    # Default token attributes serialized by ``get_docbin``.
    _attrs = (
        "HEAD", "TAG", "POS", "DEP", "LEMMA",
        "MORPH", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID"
    )

    def __init__(
        self,
        vocab: Vocab,
        nlp: Language | None = None,
        *,
        count_method: Literal[*_count_vals] = "lemmas",
        resolve_coref: bool = True
    ) -> None:
        self._check_count_method(count_method)
        self._dmap = {}
        self.vocab = vocab
        self.nlp = nlp
        self.token_dist = Counter()
        self.count_method = count_method
        self.resolve_coref = resolve_coref
        self.meta = None

    def __getitem__(self, key: int) -> Doc:
        return self._dmap[key]

    def __len__(self) -> int:
        return len(self._dmap)

    def __iter__(self) -> int:
        yield from self._dmap

    def __contains__(self, doc: Doc) -> bool:
        """Check whether a document is already stored in the corpus.

        Membership is tested by looking up ``hash(doc)`` among
        the storage keys.

        Raises
        ------
        NotImplementedError
            If ``doc`` is not a :class:`Doc` instance.
        """
        if not isinstance(doc, Doc):
            cn = self.__class__.__name__
            dn = doc.__class__.__name__
            raise NotImplementedError(f"'{cn}' cannot contain '{dn}' objects")
        key = hash(doc)
        return key in self._dmap

    def __repr__(self) -> str:
        cn = self.__class__.__name__
        ndoc = len(self)
        count = self.count_method
        at = hex(id(self))
        dword = "doc" if ndoc == 1 else "docs"
        return f"<{cn} with {ndoc} {dword} and count_method=\"{count}\" at {at}>"

    # Properties --------------------------------------------------------------

    @property
    def docs(self) -> DataIterator[Doc]:
        """Stored documents wrapped in a fresh :class:`DataIterator`."""
        stored = self._dmap.values()
        return DataIterator(stored)

    # Methods -----------------------------------------------------------------


[docs]
    def add_doc(self, doc: Doc | str) -> None:
        """Add document to the corpus.

        The method recognizes identical documents
        and do not add the same ones more than once.
        The identity check is based on :meth:`segram.nlp.Doc.id`.

        See also
        --------
        segram.nlp.Doc.id : persistent document identifier.
        segram.nlp.Doc.coredata : data used to generate the identifier.

        Raises
        ------
        AttributeError
            If a language model is not defined under the attribute
            ``self.nlp``.
        """
        if isinstance(doc, str):
            if not self.nlp:
                raise AttributeError(
                    "corpus has been initialized without language model, ",
                    "so documents passed as strings cannot be parsed."
                )
            doc = self.nlp(doc)
        alias = getattr(doc._, __title__+"_alias")
        if not self.meta:
            self.meta = getattr(doc._, alias+"_meta")
        if isinstance(doc, SpacyDoc):
            doc = getattr(doc._, alias+"_sns")
        if doc not in self:
            self._dmap[doc.id] = doc.grammar
            self.token_dist += self._count_toks(doc)



[docs]
    def add_docs(
        self,
        docs: Iterable[Doc | str],
        *,
        progress: bool = False,
        **kwds: Any
    ) -> None:
        """Add many documents to the corpus, one by one.

        Each item is passed to :meth:`add_doc`.

        Parameters
        ----------
        docs
            Documents (or raw strings) to add.
        progress
            Should a progress bar be displayed
            (it is used as ``disable=not progress``).
        **kwds
            Passed to :func:`tqdm.tqdm`.
        """
        iterator = tqdm(docs, disable=not progress, **kwds)
        for item in iterator:
            self.add_doc(item)



[docs]
    def count_tokens(self, what: Literal[_count_vals]) -> None:
        """(Re)count tokens.

        ``what`` specifies what kind of tokens should be counted.
        Recount is done only when necessary, i.e. when the call
        changes the previous count_method method.
        """
        self._check_count_method(what)
        if what != self.count_method:
            self.count_method = what
            self.token_dist = Counter()
            for doc in self:
                self.token_dist += self._count_toks(doc)



[docs]
    def copy(self) -> Self:
        """Make a copy.

        Language model object is passed but not copied.
        Document objects are copied.
        """
        # pylint: disable=protected-access
        kwds = {
            "count_method": self.count_method,
            "resolve_coref": self.resolve_coref
        }
        obj = self.__class__(self.nlp, **kwds)
        obj._dmap = { idx: doc.copy() for idx, doc in self._dmap.items() }
        obj.token_dist = self.token_dist.copy()
        return obj



[docs]
    def get_docbin(
        self,
        attrs: Iterable[str] = _attrs,
        user_data: bool = True
    ) -> DocBin:
        """Pack documents into a :class:`spacy.tokens.DocBin`.

        Parameters
        ----------
        attrs
            Token attributes to serialize.
        user_data
            Should user data be stored.
            Setting to ``True`` requires clearing the cached grammar
            objects linked to all tokens, spans and docs to allow for
            serialization. This does not affect any functionalities
            of existing documents, but temporarily affects performance
            as the cache must be first reconstructed during further use.
        """
        if user_data:
            # Clear the user data of every underlying document
            # before it gets stored in the bin.
            for stored in self.docs:
                target = stored.doc.tok
                Doc.clear_user_data(target.user_data)
        packed = self.docs.get("doc").get("tok")
        return DocBin(attrs, store_user_data=user_data, docs=packed)



[docs]
    def ensure_cpu_vectors(self) -> None:
        """Make sure word vectors of ``self.vocab`` are stored on CPU.

        Delegates to the module-level helper of the same name.
        """
        vocab = self.vocab
        ensure_cpu_vectors(vocab)



[docs]
    def prefer_gpu_vectors(self, *args: Any, **kwds: Any) -> bool:
        """Put word vectors on GPU if possible.

        Arguments are passed to :func:`segram.utils.misc.prefer_gpu_vectors`.

        Returns
        -------
        bool
            The value returned by the helper function
            (presumably whether the vectors ended up on GPU --
            confirm against the helper).
        """
        # Propagate the helper's result; without ``return`` the method
        # always yields ``None`` despite its ``bool`` annotation.
        return prefer_gpu_vectors(self.vocab, *args, **kwds)



[docs]
    @classmethod
    def from_texts(
        cls,
        nlp: Language,
        *texts: str,
        pipe_kws: dict[str, Any] | None = None,
        progress: bool = False,
        tqdm_kws: dict[str, Any] | None = None,
        **kwds: Any
    ) -> Self:
        """Build a corpus by parsing raw texts.

        Parameters
        ----------
        nlp
            Language model to use to parse texts.
        *texts
            Texts to parse.
        pipe_kws
            Keyword arguments passed to :meth:`spacy.language.Language.pipe`.
        progress
            Should a progress bar be displayed
            (passed to :meth:`add_docs`).
        tqdm_kws
            Additional keyword arguments passed to :meth:`add_docs`,
            which forwards them to :func:`tqdm.tqdm`.
        **kwds
            Passed :meth:`__init__`.
            Vocabulary is taken from the language model.
        """
        corpus = cls(nlp.vocab, nlp, **kwds)
        # ``None`` stands for "no extra options" in both cases.
        parsed = nlp.pipe(texts, **(pipe_kws or {}))
        corpus.add_docs(parsed, progress=progress, **(tqdm_kws or {}))
        return corpus



[docs]
    def to_data(
        self,
        *,
        vocab: bool = True,
        nlp: bool = False
    ) -> dict[str, Any]:
        """Dump to data dictionary.

        Parameters
        ----------
        vocab
            Should ``self.vocab`` be used.
        nlp
            Should ``self.nlp`` be used.
        """
        data = {
            "token_dist": dict(self.token_dist),
            "count_method": self.count_method,
            "resolve_coref": self.resolve_coref,
            "meta": self.meta
        }
        if vocab:
            data["vocab"] = self.vocab.to_bytes()
        if nlp and self.nlp:
            data["nlp"] = {
                "module": self.nlp.__class__.__module__,
                "name": self.nlp.__class__.__name__,
                "data": self.nlp.to_bytes()
            }
        if self._dmap:
            data["docs"] = self.get_docbin().to_bytes()
        return data



[docs]
    @classmethod
    def from_data(cls, data: dict[str, Any], **kwds: Any) -> Self:
        """Construct from data dictionary.

        The input dictionary is not modified.

        Parameters
        ----------
        data
            Dictionary with the layout produced by :meth:`to_data`.
            ``"vocab"`` and ``"nlp"`` may be given either in serialized
            form or as ready-to-use objects.
        **kwds
            Passed to :meth:`add_docs`.

        Raises
        ------
        KeyError
            If a required entry (e.g. ``"meta"`` or ``"vocab"``)
            is missing from ``data``.
        """
        # pylint: disable=no-value-for-parameter
        # Work on a shallow copy so the caller's dictionary is left intact.
        data = dict(data)
        meta = data.pop("meta")
        grammar, lang = meta["segram_grammar"].split(".")
        alias = meta["segram_alias"]
        Segram.import_extensions(grammar, lang, alias).register()
        vocab = data["vocab"]
        if not isinstance(vocab, Vocab):
            vocab = Vocab().from_bytes(vocab)
        data["vocab"] = vocab
        if (nlp := data.get("nlp")):
            if not isinstance(nlp, Language):
                # NOTE: the class to instantiate is named by the data
                # itself, so only data from trusted sources should be used.
                spec = nlp
                nlp = getattr(import_module(spec["module"]), spec["name"])()
                nlp = nlp.from_bytes(spec["data"])
            data["nlp"] = nlp
        total = None
        if (docs := data.pop("docs", ())):
            docbin = DocBin().from_bytes(docs)
            total = len(docbin)
            docs = docbin.get_docs(vocab)
        token_dist = Counter(data.pop("token_dist"))
        corpus = cls(**data)
        corpus.meta = meta
        corpus.add_docs(docs, **{ "total": total, **kwds })
        # Restore the saved distribution only AFTER adding documents:
        # ``add_doc`` counts tokens of every added document, so setting
        # it earlier would count everything twice.
        corpus.token_dist = token_dist
        return corpus



[docs]
    def to_disk(
        self,
        path: str | bytes | os.PathLike,
        **kwds: Any
    ) -> None:
        """Save to disk.

        ``**kwds`` are passed to :meth:`to_data`.
        """
        with open(path, "wb") as fh:
            pickle.dump(self.to_data(**kwds), fh)



[docs]
    @classmethod
    def from_disk(
        cls,
        path: str | bytes | os.PathLike,
        *,
        vocab: Vocab | bytes | None = None,
        nlp: Language | bytes | None = None,
        **kwds: Any
    ) -> Self:
        """Construct from disk.

        .. warning::
            The file is read with :mod:`pickle`, which can execute
            arbitrary code. Load only files from trusted sources.

        Parameters
        ----------
        path
            Location of a file written with :meth:`to_disk`.
        nlp, vocab
            Use ``vocab`` and ``nlp`` to pass an arbitrary vocabulary
            and/or language model for initializing corpus. Useful when a corpus
            has been saved to disk with ``vocab=False`` and/or ``nlp=False``.
        **kwds
            Passed to :meth:`from_data`.
        """
        with open(path, "rb") as fh:
            # SECURITY: unpickling untrusted data is unsafe.
            data = pickle.load(fh)
        # Compare against ``None`` instead of relying on truthiness,
        # so that a passed object which happens to be falsy
        # (e.g. an empty vocabulary) is not silently ignored.
        if vocab is not None:
            data["vocab"] = vocab
        if nlp is not None:
            data["nlp"] = nlp
        # The file is already closed here; building the corpus
        # does not need the handle.
        return cls.from_data(data, **kwds)


    # Internals ---------------------------------------------------------------

    def _count_toks(self, toks: Iterable[Token]) -> Counter:
        """Count tokens according to the current settings.

        With ``self.resolve_coref`` set, tokens are first replaced by
        their ``coref`` attribute. Then lemmas are counted when
        ``self.count_method`` is ``"lemmas"``; otherwise token texts are
        counted, lowercased when the method is ``"lower"``.
        """
        toks = DataTuple(toks)
        if self.resolve_coref:
            toks = toks.get("coref")
        # Valid method names are listed in ``_count_vals``
        # (note the plural ``"lemmas"``).
        if self.count_method == "lemmas":
            toks = toks.get("lemma")
        else:
            toks = toks.get("text")
            if self.count_method == "lower":
                # Keep the mapped result; discarding it would leave
                # the original (non-lowercased) texts to be counted.
                toks = toks.map(str.lower)
        return Counter(toks)

    def _check_count_method(self, what: str) -> None:
        if what not in self._count_vals:
            raise ValueError(f"'count' has to be one of {self._count_vals}")