"""Base Segram module.
It implements the _Segram_ pipe component providing
all main semantic grammar transformations and related auxiliary methods.
"""
from typing import Any, Sequence
from importlib import import_module
import spacy
from spacy.tokens import Doc
from spacy.language import Language
from spacy.pipeline.pipe import Pipe
from ..extensions import SpacyExtensions
from ... import __title__, __version__
from ...utils.meta import get_cname
from ...utils.registries import models as models_registry
[docs]
class Segram(Pipe):
    """Semantic grammar pipeline component.

    It extends :mod:`spacy` token classes with semantic
    grammar methods and related functionalities such as
    custom preprocessing (e.g. merging and corrected lemmatization).

    Attributes
    ----------
    nlp
        Language model object.
    name
        Name of the component.
    alias
        Non-empty prefix used for naming :mod:`spacy`
        extension attributes.
    extensions
        Object returned by :meth:`import_extensions`, which holds
        the custom :mod:`spacy` extensions and registers them.
    grammar
        Label of grammar implementation
        (grammar name and language code joined with a dot).
    store_data
        Whether document data is stored on each processed document.
    meta
        Metadata dictionary with details on :mod:`spacy`
        and `segram` models being used.
    """
[docs]
def __init__(
    self,
    nlp: Language,
    name: str,
    *,
    grammar: str,
    preprocess: Sequence[str],
    alias: str = __title__,
    vectors: str | Language | None = None,
    store_data: bool = True
) -> None:
    """Initialization method.

    Parameters
    ----------
    nlp
        Language model object the component is attached to.
    name
        Name of the component.
    grammar
        Label of the grammar implementation. Together with ``nlp.lang``
        it selects the backend module to import and forms
        the ``grammar`` attribute.
    preprocess
        Names of :mod:`segram` pipeline components to use for preprocessing
        documents before applying the main :mod:`segram` pipe.
        The sequence is unpacked and passed to ``configure_pipeline``.
        NOTE(review): earlier documentation stated that ``None`` selects
        all available preprocessing components, but unpacking ``None``
        fails here -- confirm where that default is resolved.
    alias
        Non-empty prefix used for namespacing extension attributes added
        by :mod:`segram` in order to avoid collision with other
        packages.
    vectors
        Vector table to use instead of the vectors provided by the main
        model. Must be provided by the name of a model or the model
        object itself, so that it is possible to keep track of the model
        name.
    store_data
        Should document data be stored automatically at the time of parsing.

    Raises
    ------
    ValueError
        If ``alias`` is empty or if ``vectors`` is a truthy value
        that is neither a model name nor a language model object.
    """
    if not alias:
        raise ValueError(
            f"'{get_cname(self)}' must define non-empty 'alias' "
            "for naming and prefixing Spacy extension attributes"
        )
    self.alias = alias
    self.nlp = nlp
    self.name = name
    self.extensions = self.import_extensions(grammar, nlp.lang, alias)
    self.grammar = f"{grammar}.{nlp.lang}"
    self.store_data = store_data
    if isinstance(vectors, str):
        # Load the vectors model by name, sharing the vocabulary
        # with the main model.
        vectors = spacy.load(vectors, enable="tok2vec", vocab=nlp.vocab)
    elif vectors and not isinstance(vectors, Language):
        # Language model objects are valid and must pass through;
        # only other truthy values are rejected.
        vcn = vectors.__class__.__name__
        raise ValueError(f"'vectors' must be provided as a language model or a name, not '{vcn}'")
    models_registry.register(self.get_model_name(nlp), func=nlp)
    self.meta = {
        "name": self.name,
        __title__+"_alias": alias,
        __title__+"_version": __version__,
        __title__+"_grammar": self.grammar,
        "spacy_version": spacy.__version__,
        "model": self.get_model_info(nlp),
        "vectors": self.get_model_info(vectors) if vectors else None
    }
    self.configure_pipeline(*preprocess)
    self.init_extensions()
def __call__(self, doc: Doc) -> Doc:
    """Attach alias and metadata to ``doc`` and optionally store its data.

    When ``store_data`` is set, the result of calling ``to_data()`` on the
    extension attribute named after the alias is stored under
    ``<alias>_data``. The same document object is returned.
    """
    self.set_docattrs(doc, self.alias, self.meta)
    if not self.store_data:
        return doc
    payload = getattr(doc._, self.alias).to_data()
    setattr(doc._, f"{self.alias}_data", payload)
    return doc
# Properties --------------------------------------------------------------
@property
def id(self) -> int:
    """Hash id of the component.

    It is the built-in hash of the ``meta`` items, with dictionary
    values converted to tuples of their items so they are hashable.
    """
    frozen = tuple(
        (key, tuple(value.items()) if isinstance(value, dict) else value)
        for key, value in self.meta.items()
    )
    return hash(frozen)
# Methods -----------------------------------------------------------------
[docs]
@staticmethod
def set_docattrs(doc: Doc, alias: str, meta: dict[str, Any]) -> None:
    """Set document attributes.

    Stores the alias and a shallow copy of the metadata dictionary
    as extension attributes on ``doc._``.
    """
    snapshot = meta.copy()
    assignments = (
        (__title__+"_alias", alias),
        (f"{alias}_meta", snapshot),
    )
    for attr, value in assignments:
        setattr(doc._, attr, value)
[docs]
@staticmethod
def import_extensions(
    grammar: str,
    lang: str,
    alias: str
) -> SpacyExtensions:
    """Import NLP module from grammar label and language code.

    The backend module is looked up from the grammar label and language
    code and must expose ``Doc``, ``Span`` and ``Token`` classes.

    Returns
    -------
    extensions
        :class:`~segram.nlp.extensions.SpacyExtensions` instance.

    Raises
    ------
    AttributeError
        If the backend module lacks any of the required classes.
    """
    backend = import_module(f"{__title__}.nlp.backend.{grammar}.lang.{lang}")
    classes = {"alias": alias}
    for cls_name in ("Doc", "Span", "Token"):
        try:
            cls = getattr(backend, cls_name)
        except AttributeError as exc:
            raise AttributeError(
                f"module does not define nor import '{cls_name}' class; "
                "'spacy' backends must provide enhanced "
                "'Doc', 'Span' and 'Token' classes"
            ) from exc
        classes[cls_name.lower()] = cls
    return SpacyExtensions(**classes)
[docs]
def init_extensions(self) -> None:
    """Register the custom :mod:`spacy` attributes held in ``extensions``."""
    extensions = self.extensions
    extensions.register()
[docs]
def normalize_pipe_name(self, pipe: str) -> str:
    """Normalize pipeline component name.

    Any dotted namespace is dropped, keeping only the part after the
    last dot, and the result is prefixed with ``"<alias>_"`` unless it
    already starts with that prefix.
    """
    # 'rpartition' returns the whole name when there is no dot and also
    # copes with several dots, where unpacking 'split' results would fail.
    pipe = pipe.rpartition(".")[2]
    prefix = self.alias + "_"
    if not pipe.startswith(prefix):
        pipe = prefix + pipe
    return pipe
[docs]
def get_config(self) -> dict:
    """Return a shallow copy of this component's config section.

    The section is looked up by component name under ``"components"``
    in the config of the language model.
    """
    components = self.nlp.config["components"]
    section = components[self.name]
    return section.copy()
[docs]
@staticmethod
def get_model_name(nlp: Language) -> str:
"""Get language model name."""
return f"{nlp.meta['lang']}_{nlp.meta['name']}"
[docs]
@staticmethod
def get_model_info(nlp: Language) -> str:
"""Get language model information."""
return {
"lang": nlp.meta["lang"],
"name": nlp.meta["name"],
"version": nlp.meta["version"],
"description": nlp.meta["description"]
}