# Source code for embedded_voting.embeddings_from_ratings.embeddings_from_ratings_correlation

import numpy as np
from embedded_voting.ratings.ratings import Ratings
from embedded_voting.embeddings.embeddings_correlation import EmbeddingsCorrelation
from embedded_voting.embeddings_from_ratings.embeddings_from_ratings import EmbeddingsFromRatings
from embedded_voting.utils.miscellaneous import normalize, center_and_normalize
from sklearn.decomposition import PCA


class EmbeddingsFromRatingsCorrelation(EmbeddingsFromRatings):
    """
    Use the correlation with each voter as the embeddings.

    Morally, we have two levels of embedding.

    * First, `v_i = preprocess_ratings(ratings_voter_i)` for each voter `i`, which is used as a
      computation step but not recorded.
    * Second, `M = v @ v.T`, which is recorded as the final embeddings.

    Other attributes are computed and recorded:

    * `n_sing_val`: the number of relevant singular values of the preprocessed ratings
      (see `svd_factor` below for how "relevant" is decided).
    * `ratings_means`: the mean rating for each voter (without preprocessing).
    * `ratings_stds`: the standard deviation of the ratings for each voter (without preprocessing).

    Parameters
    ----------
    preprocess_ratings : callable, optional
        Function applied to the ratings of each voter (i.e. each row of the ratings) before the
        correlations are computed. If None, the ratings are used as they are.
    svd_factor : float or str
        Rule used to compute `n_sing_val`.

        * A number: take the square roots of the singular values of the preprocessed ratings,
          normalize them so that they sum to 1, and count those that are greater than or equal to
          `svd_factor * max(1 / n_voters, 1 / n_candidates)`.
        * `"div"`: same count, with the threshold `max(1 / (n_voters + 1), 1 / (n_candidates + 1))`.
        * `"pca"`: use the number of components selected by scikit-learn's
          `PCA(n_components='mle')` fitted on the preprocessed ratings.

    Examples
    --------
    >>> np.random.seed(42)
    >>> ratings = np.ones((5, 3))
    >>> generator = EmbeddingsFromRatingsCorrelation(preprocess_ratings=normalize)
    >>> embeddings = generator(ratings)
    >>> embeddings
    EmbeddingsCorrelation([[1., 1., 1., 1., 1.],
                           [1., 1., 1., 1., 1.],
                           [1., 1., 1., 1., 1.],
                           [1., 1., 1., 1., 1.],
                           [1., 1., 1., 1., 1.]])
    >>> embeddings.n_sing_val
    1

    In fact, the typical usage is with `center_and_normalize`:

    >>> generator = EmbeddingsFromRatingsCorrelation(preprocess_ratings=center_and_normalize)
    >>> embeddings = generator(ratings)
    >>> embeddings
    EmbeddingsCorrelation([[0., 0., 0., 0., 0.],
                           [0., 0., 0., 0., 0.],
                           [0., 0., 0., 0., 0.],
                           [0., 0., 0., 0., 0.],
                           [0., 0., 0., 0., 0.]])
    >>> embeddings.n_sing_val
    0
    """
    def __init__(self, preprocess_ratings=None, svd_factor=0.95):
        super().__init__()
        self.svd_factor = svd_factor
        self.preprocess_ratings = preprocess_ratings

    def __call__(self, ratings):
        """
        Compute the correlation embeddings associated to some ratings.

        Parameters
        ----------
        ratings
            The ratings given by the voters to the candidates; converted with `Ratings(ratings)`.
            Each row holds the ratings of one voter.

        Returns
        -------
        EmbeddingsCorrelation
            Embeddings whose positions are the matrix of dot products between the preprocessed
            ratings of the voters, together with `n_sing_val`, `ratings_means` and `ratings_stds`.
        """
        ratings = Ratings(ratings)
        if self.preprocess_ratings is None:
            ratings_preprocessed = ratings
        else:
            ratings_preprocessed = Ratings([self.preprocess_ratings(ratings_voter) for ratings_voter in ratings])

        return EmbeddingsCorrelation(
            positions=np.dot(ratings_preprocessed, ratings_preprocessed.T),
            n_sing_val=self._count_relevant_singular_values(ratings, ratings_preprocessed),
            # Means and standard deviations are taken on the raw ratings, not the preprocessed ones.
            ratings_means=ratings.mean(axis=1),
            ratings_stds=ratings.std(axis=1),
            norm=False
        )

    def _count_relevant_singular_values(self, ratings, ratings_preprocessed):
        """Return `n_sing_val` for the preprocessed ratings, according to `self.svd_factor`."""
        if self.svd_factor == "pca":
            # Fit on the orientation that has strictly more rows than columns when there is one,
            # otherwise on the transpose (presumably because the 'mle' heuristic needs at least
            # as many samples as features -- see the scikit-learn PCA documentation).
            if ratings_preprocessed.shape[0] > ratings_preprocessed.shape[1]:
                data = ratings_preprocessed
            else:
                data = ratings_preprocessed.T
            pca_sk = PCA(n_components='mle')
            pca_sk.fit(data)
            return pca_sk.n_components_

        if self.svd_factor == "div":
            add_div, svd_factor = 1, 1
        else:
            add_div, svd_factor = 0, self.svd_factor

        # Only the singular values are used, so the singular vectors are not computed.
        s = np.sqrt(np.linalg.svd(ratings_preprocessed, compute_uv=False))
        s_sum = s.sum()
        if s_sum == 0:
            # All singular values are zero (e.g. constant ratings after centering): nothing relevant.
            return 0

        threshold = svd_factor * max(1 / (ratings.n_voters + add_div), 1 / (ratings.n_candidates + add_div))
        return int(np.count_nonzero(s / s_sum >= threshold))