Source code for segram.utils.misc

# pylint: disable=no-name-in-module
from typing import Any, Callable, Iterable, Mapping
from itertools import product
from more_itertools import unique_everseen
import numpy as np
from numpy.linalg import norm
from spacy.vocab import Vocab



[docs]
def cosine_similarity(
    X: np.ndarray[tuple[int] | tuple[int, int], np.floating],
    Y: np.ndarray[tuple[int] | tuple[int, int], np.floating],
    *,
    aligned: bool = False,
    nans_as_zeros: bool = True
) -> float | np.ndarray[tuple[int, ...], np.floating]:
    """Cosine similarity between two vectors.

    When 2D arrays are passed it is assumed that vectors
    for calculating similarities are arranged in rows.

    Parameters
    ----------
    X, Y
        Vectors or arrays of vectors.
    aligned
        If ``True`` then ``X`` and ``Y`` have to be 2D and of the
        same shape and row-by-row similarities are calculated.
    nans_as_zeros
        Should NaN values arising from zero vector norm
        be interpreted as zero similarities.
    """
    if aligned:
        if X.ndim != 2:
            raise ValueError("'X' and 'Y' must be 2D when 'aligned=True'")
        if X.shape != Y.shape:
            raise ValueError("'X' and 'Y' have to be of the same shape when 'aligned=True'")
        Xnorm = np.linalg.norm(X, axis=1)
        Ynorm = np.linalg.norm(Y, axis=1)
        sim = (X*Y).sum(axis=1)
        if nans_as_zeros:
            mask = (Xnorm != 0) & (Ynorm != 0)
            sim = sim[mask] / (Xnorm*Ynorm)[mask]
        else:
            sim /= Xnorm*Ynorm
        return sim
    Xnorm = norm(X.T, axis=0)
    Ynorm = norm(Y.T, axis=0)
    if nans_as_zeros:
        Xnz = Xnorm != 0
        Ynz = Ynorm != 0
        cos = (X@Y.T)[Xnz][:, Ynz]
        cos = np.clip(cos / np.outer(Xnorm[Xnz], Ynorm[Ynz]), -1, 1)
    else:
        cos = X@Y.T
        cos = np.clip(cos / np.outer(Xnorm, Ynorm), -1, 1)
    if cos.size == 1:
        return float(cos[0][0])
    return cos.squeeze()



def best_matches(
    objs: Iterable,
    others: Iterable,
    func: Callable[[Any, Any], int | float],
    *args: Any,
    **kwds: Any
) -> Iterable[tuple[int | float, Any, Any]]:
    """Yield best-scoring pairings between ``objs`` and ``others``.

    Every pair from the Cartesian product of the two collections
    is scored and the scored pairs are ordered from the highest to
    the lowest score. Only the first (i.e. best) triple is kept for
    each distinct element of the shorter collection
    (``objs`` when both have the same length).

    Parameters
    ----------
    objs, others
        Collections of objects to match. Both are consumed in full.
    func
        Scoring function called as ``func(obj, other, *args, **kwds)``.
    *args, **kwds
        Extra arguments passed to ``func``.

    Yields
    ------
    tuple
        ``(score, obj, other)`` triples in descending score order.
    """
    left = tuple(objs)
    right = tuple(others)
    scored = [
        (func(obj, other, *args, **kwds), obj, other)
        for obj, other in product(left, right)
    ]
    # Stable sort, so equally scored pairs keep the product order.
    scored.sort(key=lambda triple: -triple[0])
    # Position within a triple of the element to deduplicate on.
    dedup_pos = 2 if len(left) > len(right) else 1
    yield from unique_everseen(scored, key=lambda triple: triple[dedup_pos])


def sort_map(mapping: Mapping) -> Mapping:
    """Return a copy of ``mapping`` with items ordered by key.

    The result is built by calling the class of ``mapping``
    with the sorted list of ``(key, value)`` pairs.
    """
    def by_key(item):
        return item[0]

    ordered_items = list(mapping.items())
    ordered_items.sort(key=by_key)
    return mapping.__class__(ordered_items)



[docs]
def stringify(obj: Any, **kwds: Any) -> str:
    """Convert ``obj`` to string.

    Objects with a truthy ``to_str`` attribute are converted by
    calling it with ``**kwds``. In all other cases ``**kwds``
    are ignored and ``repr(obj)`` is returned.
    """
    converter = getattr(obj, "to_str", None)
    if not converter:
        return repr(obj)
    return converter(**kwds)




[docs]
def ensure_cpu_vectors(vocab: Vocab | Any) -> None:
    """Make sure the word vector table is a NumPy array.

    Parameters
    ----------
    vocab
        Vocabulary object.
        Anything that is not a ``Vocab`` instance is expected
        to expose the vocabulary as its ``.vocab`` attribute.
    """
    target = vocab if isinstance(vocab, Vocab) else vocab.vocab
    vectors = target.vectors
    if isinstance(vectors.data, np.ndarray):
        # Already a NumPy array, nothing to do.
        return
    # Non-NumPy data is converted with its own '.get()' method
    # (presumably a GPU array such as CuPy's -- TODO confirm).
    vectors.data = vectors.data.get()



[docs]
def prefer_gpu_vectors(
    vocab: Vocab | Any,
    device_id: int | None = None
) -> bool:
    """Store word vectors on GPU if possible.

    Parameters
    ----------
    vocab
        Vocabulary object.
        Anything that is not a ``Vocab`` instance is expected
        to expose the vocabulary as its ``.vocab`` attribute.
    device_id
        GPU device id. If ``None`` then the default device
        is used (typically it is with id ``0``).

    Returns
    -------
    bool
        Specifies whether the vectors were successfully moved to GPU.
        ``False`` is returned only when ``cupy`` cannot be imported.
        Vector data that is not a NumPy array is left untouched
        and reported as ``True``.
    """
    target = vocab if isinstance(vocab, Vocab) else vocab.vocab
    vectors = target.vectors
    host_data = vectors.data
    if not isinstance(host_data, np.ndarray):
        # Not a NumPy array, so no transfer is attempted.
        return True
    try:
        import cupy as cp # pylint: disable=import-outside-toplevel
    except ImportError:
        return False
    if device_id is None:
        device_data = cp.asarray(host_data)
    else:
        # Allocate the copy on the explicitly requested device.
        with cp.cuda.Device(device_id):
            device_data = cp.asarray(host_data)
    vectors.data = device_data
    return True