Source code for embedded_voting.embeddings_from_ratings.embeddings_from_ratings_correlation

import numpy as np
from embedded_voting.ratings.ratings import Ratings
from embedded_voting.embeddings.embeddings_correlation import EmbeddingsCorrelation
from embedded_voting.embeddings_from_ratings.embeddings_from_ratings import EmbeddingsFromRatings
from embedded_voting.utils.miscellaneous import normalize, center_and_normalize
from sklearn.decomposition import PCA


[docs]class EmbeddingsFromRatingsCorrelation(EmbeddingsFromRatings): """ Use the correlation with each voter as the embeddings. Morally, we have two levels of embedding. * First, `v_i = preprocess_ratings(ratings_voter_i)` for each voter `i`, which is used as a computation step but not recorded. * Second, `M = v @ v.T`, which is recorded as the final embeddings. Other attributes are computed and recorded: * `n_sing_val`: the number of relevant singular values when we compute the SVD. This is based on the Principal Component Analysis (PCA). * `ratings_means`: the mean rating for each voter (without preprocessing). * `ratings_stds`: the standard deviation of the ratings for each voter (without preprocessing). Examples -------- >>> np.random.seed(42) >>> ratings = np.ones((5, 3)) >>> generator = EmbeddingsFromRatingsCorrelation(preprocess_ratings=normalize) >>> embeddings = generator(ratings) >>> embeddings EmbeddingsCorrelation([[1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.]]) >>> embeddings.n_sing_val 1 In fact, the typical usage is with `center_and_normalize`: >>> generator = EmbeddingsFromRatingsCorrelation(preprocess_ratings=center_and_normalize) >>> embeddings = generator(ratings) >>> embeddings EmbeddingsCorrelation([[0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.]]) >>> embeddings.n_sing_val 0 """ def __init__(self, preprocess_ratings=None, svd_factor=0.95): super().__init__() self.svd_factor = svd_factor self.preprocess_ratings = preprocess_ratings def __call__(self, ratings): ratings = Ratings(ratings) if self.preprocess_ratings is None: ratings_preprocessed = ratings else: ratings_preprocessed = Ratings([self.preprocess_ratings(ratings_voter) for ratings_voter in ratings]) u, s, v = np.linalg.svd(ratings_preprocessed) s = np.sqrt(s) s_sum = s.sum() if self.svd_factor == "pca": pca_sk = PCA(n_components='mle') if ratings_preprocessed.shape[0] > ratings_preprocessed.shape[1]: pca_sk.fit(ratings_preprocessed) else: pca_sk.fit(ratings_preprocessed.T) n_v = pca_sk.n_components_ else: if self.svd_factor == "div": add_div = 1 svd_factor = 1 else: add_div = 0 svd_factor = self.svd_factor if s_sum == 0: n_v = 0 else: s /= s_sum n_v = 0 for s_e in s: if s_e >= svd_factor*max(1 / (ratings.n_voters+add_div), 1 / (ratings.n_candidates+add_div)): n_v += 1 embeddings = EmbeddingsCorrelation( positions=np.dot(ratings_preprocessed, ratings_preprocessed.T), n_sing_val=n_v, ratings_means=ratings.mean(axis=1), ratings_stds=ratings.std(axis=1), norm=False ) return embeddings