Source code for segram.nlp.pipeline.base

"""Base Segram module.

It implements the _Segram_ pipe component providing
all main semantic grammar transformations and related auxiliary methods.
"""
from typing import Any, Sequence
from importlib import import_module
import spacy
from spacy.tokens import Doc
from spacy.language import Language
from spacy.pipeline.pipe import Pipe
from ..extensions import SpacyExtensions
from ... import __title__, __version__
from ...utils.meta import get_cname
from ...utils.registries import models as models_registry


class Segram(Pipe):
    """Semantic grammar pipeline component.

    It extends :mod:`spacy` token classes with semantic grammar methods
    and related functionalities such as custom preprocessing
    (e.g. merging and corrected lemmatization).

    Attributes
    ----------
    nlp
        Language model object.
    name
        Name of the component.
    alias
        Prefix used for naming the :mod:`spacy` extension attributes.
    extensions
        Module defining custom :mod:`spacy` extensions.
    grammar
        Label of grammar implementation
        (``"<grammar>.<language code>"``).
    store_data
        Whether document data is stored on the document in ``__call__``.
    meta
        Metadata dictionary with details on :mod:`spacy`
        and `segram` models being used.
    """

    def __init__(
        self,
        nlp: Language,
        name: str,
        *,
        grammar: str,
        preprocess: Sequence[str],
        alias: str = __title__,
        vectors: str | Language | None = None,
        store_data: bool = True
    ) -> None:
        """Initialization method.

        Parameters
        ----------
        nlp
            Language model object the component is attached to.
        name
            Name of the component.
        grammar
            Label of the grammar implementation. Together with
            ``nlp.lang`` it selects the backend module to import.
        preprocess
            Names of :mod:`segram` pipeline components to add to ``nlp``
            before the main :mod:`segram` pipe. Must be a sequence
            (possibly empty) as it is unpacked into
            :meth:`configure_pipeline`.
        alias
            Non-empty string used for naming and prefixing extension
            attributes added by :mod:`segram` in order to avoid
            collision with other packages.
        vectors
            Model providing vectors, given either as a model name
            or as a model object, so that it is possible to keep track
            of the model name. A name is loaded with :func:`spacy.load`
            with only ``"tok2vec"`` enabled and sharing ``nlp.vocab``.
            Within this initializer the model is only used
            to fill the ``"vectors"`` entry of ``meta``.
        store_data
            Should document data be stored automatically
            at the time of parsing.

        Raises
        ------
        ValueError
            If ``alias`` is empty or ``vectors`` is neither
            a model name nor a language model object.
        """
        if not alias:
            raise ValueError(
                f"'{get_cname(self)}' must define non-empty 'alias' "
                "for naming and prefixing Spacy extension attributes"
            )
        self.alias = alias
        self.nlp = nlp
        self.name = name
        self.extensions = self.import_extensions(grammar, nlp.lang, alias)
        self.grammar = f"{grammar}.{nlp.lang}"
        self.store_data = store_data

        if isinstance(vectors, str):
            vectors = spacy.load(vectors, enable="tok2vec", vocab=nlp.vocab)
        elif vectors and not isinstance(vectors, Language):
            # Language objects are a documented, valid input and must not
            # be rejected; only other truthy values are errors.
            vcn = vectors.__class__.__name__
            raise ValueError(f"'vectors' must be provided as a language model or a name, not '{vcn}'")

        models_registry.register(self.get_model_name(nlp), func=nlp)
        self.meta = {
            "name": self.name,
            __title__+"_alias": alias,
            __title__+"_version": __version__,
            __title__+"_grammar": self.grammar,
            "spacy_version": spacy.__version__,
            "model": self.get_model_info(nlp),
            "vectors": self.get_model_info(vectors) if vectors else None
        }
        self.configure_pipeline(*preprocess)
        self.init_extensions()

    def __call__(self, doc: Doc) -> Doc:
        """Set :mod:`segram` attributes on a document and return it.

        When ``store_data`` is true the result of calling ``to_data()``
        on the ``doc._.<alias>`` extension attribute is additionally
        stored under ``doc._.<alias>_data``.
        """
        self.set_docattrs(doc, self.alias, self.meta)
        if self.store_data:
            data = getattr(doc._, self.alias).to_data()
            setattr(doc._, f"{self.alias}_data", data)
        return doc

    # Properties --------------------------------------------------------------

    @property
    def id(self) -> int:
        """Hash id of the component.

        It is computed from the items of ``meta``; dictionary values
        are converted to tuples of items so they can be hashed.
        """
        hashdata = []
        for k, v in self.meta.items():
            if isinstance(v, dict):
                v = tuple(v.items())
            hashdata.append((k, v))
        return hash(tuple(hashdata))

    # Methods -----------------------------------------------------------------

    @staticmethod
    def set_docattrs(doc: Doc, alias: str, meta: dict[str, Any]) -> None:
        """Set document attributes.

        Stores ``alias`` and a shallow copy of ``meta`` in the
        extension attributes of ``doc``.
        """
        meta = meta.copy()
        setattr(doc._, __title__+"_alias", alias)
        setattr(doc._, f"{alias}_meta", meta)

    @staticmethod
    def import_extensions(
        grammar: str,
        lang: str,
        alias: str
    ) -> SpacyExtensions:
        """Import NLP module from grammar label and language code.

        Parameters
        ----------
        grammar
            Label of the grammar implementation.
        lang
            Language code.
        alias
            Alias passed to the extensions object.

        Returns
        -------
        extensions
            :class:`~segram.nlp.extensions.SpacyExtensions` instance.

        Raises
        ------
        AttributeError
            If the imported module does not provide
            ``Doc``, ``Span`` and ``Token`` classes.
        """
        path = f"{__title__}.nlp.backend.{grammar}.lang.{lang}"
        module = import_module(path)
        kwds = { "alias": alias }
        for tok_type in ("Doc", "Span", "Token"):
            try:
                kwds[tok_type.lower()] = getattr(module, tok_type)
            except AttributeError as exc:
                raise AttributeError(
                    f"module does not define nor import '{tok_type}' class; "
                    "'spacy' backends must provide enhanced "
                    "'Doc', 'Span' and 'Token' classes"
                ) from exc
        return SpacyExtensions(**kwds)

    def init_extensions(self) -> None:
        """Initialize custom :mod:`spacy` attributes."""
        self.extensions.register()

    def configure_pipeline(self, *components: str, **kwds: Any) -> None:
        """Configure secondary :mod:`segram` pipeline components.

        Every name is normalized with :meth:`normalize_pipe_name`
        and added to ``nlp`` unless a pipe with that name is already
        present.

        Parameters
        ----------
        *components
            Pipeline component names, with or without the alias prefix.
        **kwds
            Passed to :meth:`~spacy.language.Language.add_pipe`.
        """
        # Names that already carry the alias prefix must be kept as they
        # are (not dropped); 'normalize_pipe_name' handles both cases.
        for component in components:
            pipe = self.normalize_pipe_name(component)
            if pipe not in self.nlp.pipe_names:
                self.nlp.add_pipe(pipe, **kwds)

    def normalize_pipe_name(self, pipe: str) -> str:
        """Normalize pipeline component name.

        Anything up to and including the last dot is dropped and
        the alias prefix (``"<alias>_"``) is prepended if missing.
        """
        # 'rpartition' also copes with names containing several dots,
        # for which unpacking the result of 'split' would raise.
        pipe = pipe.rpartition(".")[2]
        prefix = self.alias+"_"
        if not pipe.startswith(prefix):
            pipe = prefix + pipe
        return pipe

    def get_config(self) -> dict:
        """Get current config dictionary.

        Returns a shallow copy of the config of this component
        as stored in ``nlp.config``.
        """
        return self.nlp.config["components"][self.name].copy()

    @staticmethod
    def get_model_name(nlp: Language) -> str:
        """Get language model name in the ``"<lang>_<name>"`` form."""
        return f"{nlp.meta['lang']}_{nlp.meta['name']}"

    @staticmethod
    def get_model_info(nlp: Language) -> dict[str, Any]:
        """Get language model information.

        Returns
        -------
        info
            Dictionary with ``lang``, ``name``, ``version``
            and ``description`` taken from ``nlp.meta``.
        """
        return {
            "lang": nlp.meta["lang"],
            "name": nlp.meta["name"],
            "version": nlp.meta["version"],
            "description": nlp.meta["description"]
        }