Source code for segram.nlp.pipeline.merger

from spacy.tokens import Doc
from spacy.util import filter_spans
from .annotator import Annotator


class Merger(Annotator):
    """Pipeline component that collapses multitoken expressions.

    Intended for standard multitokens of a given language (such as
    phrasal verbs in English) as well as multitoken entities. Which
    spans actually get merged is decided by ``self.matcher``, which is
    not defined in this class (presumably supplied by ``Annotator``
    or its configuration -- confirm there).
    """

    def __call__(self, doc: Doc) -> Doc:
        """Merge every matched multitoken span of ``doc`` into one token.

        Spans reported by ``self.matcher`` are passed through
        :func:`spacy.util.filter_spans` before merging, so the
        retokenizer is never handed overlapping spans. Each merged
        token takes its part of speech, dependency label, morphology
        and entity annotations from the root token of the span.

        The document is modified in place and the same object is
        returned.
        """
        candidates = [doc[start:end] for _, start, end in self.matcher(doc)]
        with doc.retokenize() as retokenizer:
            for multitoken in filter_spans(candidates):
                # The syntactic root of the span donates its annotations
                # to the merged token.
                head = multitoken.root
                retokenizer.merge(
                    multitoken,
                    attrs={
                        "POS": head.pos_,
                        "DEP": head.dep_,
                        "MORPH": str(head.morph),
                        "ENT_ID": head.ent_id_,
                        "ENT_IOB": head.ent_iob_,
                        "ENT_TYPE": head.ent_type_,
                    },
                )
        return doc