Source code for segram.utils.misc

# pylint: disable=no-name-in-module
from typing import Any, Callable, Iterable, Mapping
from itertools import product
from more_itertools import unique_everseen
import numpy as np
from numpy.linalg import norm
from spacy.vocab import Vocab


[docs] def cosine_similarity( X: np.ndarray[tuple[int] | tuple[int, int], np.floating], Y: np.ndarray[tuple[int] | tuple[int, int], np.floating], *, aligned: bool = False, nans_as_zeros: bool = True ) -> float | np.ndarray[tuple[int, ...], np.floating]: """Cosine similarity between two vectors. When 2D arrays are passed it is assumed that vectors for calculating similarities are arranged in rows. Parameters ---------- X, Y Vectors or arrays of vectors. aligned If ``True`` then ``X`` and ``Y`` have to be 2D and of the same shape and row-by-row similarities are calculated. nans_as_zeros Should NaN values arising from zero vector norm be interpreted as zero similarities. """ if aligned: if X.ndim != 2: raise ValueError("'X' and 'Y' must be 2D when 'aligned=True'") if X.shape != Y.shape: raise ValueError("'X' and 'Y' have to be of the same shape when 'aligned=True'") Xnorm = np.linalg.norm(X, axis=1) Ynorm = np.linalg.norm(Y, axis=1) sim = (X*Y).sum(axis=1) if nans_as_zeros: mask = (Xnorm != 0) & (Ynorm != 0) sim = sim[mask] / (Xnorm*Ynorm)[mask] else: sim /= Xnorm*Ynorm return sim Xnorm = norm(X.T, axis=0) Ynorm = norm(Y.T, axis=0) if nans_as_zeros: Xnz = Xnorm != 0 Ynz = Ynorm != 0 cos = (X@Y.T)[Xnz][:, Ynz] cos = np.clip(cos / np.outer(Xnorm[Xnz], Ynorm[Ynz]), -1, 1) else: cos = X@Y.T cos = np.clip(cos / np.outer(Xnorm, Ynorm), -1, 1) if cos.size == 1: return float(cos[0][0]) return cos.squeeze()
def best_matches( objs: Iterable, others: Iterable, func: Callable[[Any, Any], int | float], *args: Any, **kwds: Any ) -> Iterable[tuple[int | float, Any, Any]]: objs = tuple(objs) others = tuple(others) idx = 1 if len(objs) <= len(others) else 2 pairs = sorted(( (func(obj, other, *args, **kwds), obj, other) for obj, other in product(objs, others) ), key=lambda x: -x[0]) yield from unique_everseen(pairs, key=lambda x: x[idx]) def sort_map(mapping: Mapping) -> Mapping: return mapping.__class__(sorted(mapping.items(), key=lambda x: x[0]))
def stringify(obj: Any, **kwds: Any) -> str:
    """Convert ``obj`` to string.

    When ``obj`` has a truthy ``to_str`` attribute it is called with
    ``**kwds`` and its result is returned. In any other case the keyword
    arguments are ignored and ``repr(obj)`` is returned.
    """
    to_str = getattr(obj, "to_str", None)
    if not to_str:
        return repr(obj)
    return to_str(**kwds)
def ensure_cpu_vectors(vocab: Vocab | Any) -> None:
    """Ensure that word vectors are stored on CPU.

    Nothing is changed when the vectors data is already
    a :class:`numpy.ndarray`. Otherwise the data is replaced
    with the result of calling its ``.get()`` method.

    Parameters
    ----------
    vocab
        Vocabulary object. If an arbitrary object is passed
        then an attempt at retrieving ``.vocab`` attribute is made.
    """
    target = vocab if isinstance(vocab, Vocab) else vocab.vocab
    if isinstance(target.vectors.data, np.ndarray):
        return
    # NOTE(review): presumably a GPU (e.g. CuPy) array whose ``.get()``
    # returns a host copy -- confirm against the array backend in use.
    target.vectors.data = target.vectors.data.get()
def prefer_gpu_vectors(
    vocab: Vocab | Any,
    device_id: int | None = None
) -> bool:
    """Store word vectors on GPU if possible.

    Parameters
    ----------
    vocab
        Vocabulary object. If an arbitrary object is passed
        then an attempt at retrieving ``.vocab`` attribute is made.
    device_id
        GPU device id. If ``None`` then the default device is used
        (typically it is with id ``0``).

    Returns
    -------
    bool
        Specifies whether the vectors were successfully moved to GPU.
        ``True`` is also returned when the vectors data is not
        a :class:`numpy.ndarray` to begin with, in which case nothing
        is changed. ``False`` means that ``cupy`` could not be imported.
    """
    target = vocab if isinstance(vocab, Vocab) else vocab.vocab
    host_data = target.vectors.data
    if not isinstance(host_data, np.ndarray):
        return True
    try:
        import cupy as cp  # pylint: disable=import-outside-toplevel
    except ImportError:
        return False
    if device_id is None:
        gpu_data = cp.asarray(host_data)
    else:
        # Allocate the array on the explicitly requested device.
        with cp.cuda.Device(device_id):
            gpu_data = cp.asarray(host_data)
    target.vectors.data = gpu_data
    return True