Recsys matrix decomposition

In [68]:

Copied!

# !wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip -o ml-latest-small.zip
# !wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip -o ml-latest-small.zip

In [26]:

Copied!

# pip install replay-rec --quiet
# pip install implicit -qqq
# pip install replay-rec --quiet
# pip install implicit -qqq

Note: you may need to restart the kernel to use updated packages.

Методы матричной факторизации¶

Импортируем библиотеки ¶

Будем использовать implicit для ALS
Нам так же нужна бибиотека replay-rec, из нее будем использовать replay.metrics

In [28]:

Copied!





import numpy as np
import pandas as pd
import os
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import scipy.sparse as sparse
import scipy
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings; warnings.filterwarnings('ignore')

from replay.metrics import HitRate, NDCG, MAP, Experiment # подсчет метрик 
import implicit # для ALS
import numpy as np
import pandas as pd
import os
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import scipy.sparse as sparse
import scipy
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings; warnings.filterwarnings('ignore')

from replay.metrics import HitRate, NDCG, MAP, Experiment # подсчет метрик 
import implicit # для ALS

Датасет реитингов пользователей ¶

Рассмотрим датасет от GroupLens $-$ MovieLens: Это набор данных из $27 000$ фильмов и $138 000$ пользователей, с общим количеством оценок в $20$ миллионов.

Но мы воспользуемся уменьшенной версией для быстроты вычислений: $9 000$ фильмов, $700$ пользователей, $100 000$ оценок. Скачать напрямую датасет можно по этой ссылке

In [29]:

Copied!

# data source 1
ratings = pd.read_csv('./ml-latest-small/ratings.csv', parse_dates=['timestamp'])
ratings.head()
# data source 1
ratings = pd.read_csv('./ml-latest-small/ratings.csv', parse_dates=['timestamp'])
ratings.head()

Out[29]:

	userId	movieId	rating	timestamp
0	1	1	4.0	964982703
1	1	3	4.0	964981247
2	1	6	4.0	964982224
3	1	47	5.0	964983815
4	1	50	5.0	964982931

In [30]:

Copied!

print(f'Уникальных юзеров: {ratings["userId"].nunique()}')
print(f'Уникальных фильмов: {ratings["movieId"].nunique()}')
print(f'Уникальных юзеров: {ratings["userId"].nunique()}')
print(f'Уникальных фильмов: {ratings["movieId"].nunique()}')

Уникальных юзеров: 610
Уникальных фильмов: 9724

In [31]:

Copied!

ratings['rating'].value_counts()
ratings['rating'].value_counts()

Out[31]:

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64

Посчитаем сколько раз каждый фильм был оценен
Выбираем только те фильмы которые имеют больше 20 оценок
Из ratings фильтруем те фильмы которые имеют больше 20 оценок

In [34]:

Copied!





"""

Select subset of ratings data

"""

# count how many times each movie was rated
item_counts = ratings.groupby('movieId')['userId'].count() # number of ratings for each movie
pop_items = item_counts[item_counts > 20] # select only movies with counts > 20
ratings = ratings[ratings['movieId'].isin(pop_items.index)] # select subset of user ratings for movies that have counts > 20
ratings
"""

Select subset of ratings data

"""

# count how many times each movie was rated
item_counts = ratings.groupby('movieId')['userId'].count() # number of ratings for each movie
pop_items = item_counts[item_counts > 20] # select only movies with counts > 20
ratings = ratings[ratings['movieId'].isin(pop_items.index)] # select subset of user ratings for movies that have counts > 20
ratings

Out[34]:

	userId	movieId	rating	timestamp
0	1	1	4.0	964982703
1	1	3	4.0	964981247
2	1	6	4.0	964982224
3	1	47	5.0	964983815
4	1	50	5.0	964982931
...	...	...	...	...
100803	610	148626	4.0	1493847175
100808	610	152081	4.0	1493846503
100829	610	164179	5.0	1493845631
100830	610	166528	4.0	1493879365
100834	610	168252	5.0	1493846352

66658 rows × 4 columns

создалим свою еннумерацию для всех пользователей и фильмов

In [35]:

Copied!





all_users = ratings['userId'].unique().tolist()  # all unique users in ratings data
all_items = ratings['movieId'].unique().tolist() # all unique movies in ratings data

n_users = ratings['userId'].nunique() # number of unique users
n_items = ratings['movieId'].nunique() # number of unique movies

user_id2idx = dict(zip(all_users, range(n_users)))
item_id2idx = dict(zip(all_items, range(n_items)))

ratings['userId'] = ratings['userId'].map(user_id2idx) # redefine user id  (for better interpretation)
ratings['movieId'] = ratings['movieId'].map(item_id2idx) # redefine movie id (for better interpretation)
all_users = ratings['userId'].unique().tolist()  # all unique users in ratings data
all_items = ratings['movieId'].unique().tolist() # all unique movies in ratings data

n_users = ratings['userId'].nunique() # number of unique users
n_items = ratings['movieId'].nunique() # number of unique movies

user_id2idx = dict(zip(all_users, range(n_users)))
item_id2idx = dict(zip(all_items, range(n_items)))

ratings['userId'] = ratings['userId'].map(user_id2idx) # redefine user id  (for better interpretation)
ratings['movieId'] = ratings['movieId'].map(item_id2idx) # redefine movie id (for better interpretation)

Разбиваем Выборку¶

Сгрупируем все рейтинги для каждого userId
Выбираем последнюю по дате рейтинг у каждого пользователя и кладем ее в test
все оствльное у нас идет в train

In [36]:

Copied!

# group
full_history = ratings.sort_values(['userId', 'timestamp']).groupby('userId') # group by userid w/ sorted data

test = full_history.tail(1) # get last movie rating for each user
train = full_history.head(-1) # get all other movie ratings for each user except for first

train.shape, test.shape
# group
full_history = ratings.sort_values(['userId', 'timestamp']).groupby('userId') # group by userid w/ sorted data

test = full_history.tail(1) # get last movie rating for each user
train = full_history.head(-1) # get all other movie ratings for each user except for first

train.shape, test.shape

Out[36]:

((66048, 4), (610, 4))

In [37]:

Copied!

train = train.reset_index()
test = test.reset_index()
train = train.reset_index()
test = test.reset_index()

Оставим толко те рейтинги фильмов для которых они постввили 5

In [38]:

Copied!

test = test[test['rating'] == 5] # lets select only users who give 5.0 ratings to movies
test.shape
test = test[test['rating'] == 5] # lets select only users who give 5.0 ratings to movies
test.shape

Out[38]:

(117, 5)

Вспомогательная Функция¶

Создалим базовый родительский класс от которого наши дальнейшие классы SVD, FunkSVD будут наследовать. ALS тоже будет использовать шаги предобработки в этом классе.

BaseFactorizationModel

get_rating_matrix принимает на вход обучающую выборку, как и раньше мы создаем сводную таблицу с пользователями и фильмами, где числа это соответсвующий рейтинг пользователь поставил
predict делает что и раньше, сортирует и выбирает для каждого пользователя топ к фильмов и возвращает movieId и его score из SVD

In [1]:

Copied!





class BaseFactorizationModel:
    def __init__(self, random_state=0, user_col='userId', item_col='movieId', rating_col='rating'):
        self.random_state = np.random.RandomState(random_state)
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        self.user_matrix = None
        self.item_matrix = None

    # matrix which we will decompose
    def get_rating_matrix(self, data):
        return pd.pivot_table(
            data,
            values=self.rating_col,
            index=self.user_col,
            columns=self.item_col,
            fill_value=0
            ).values

    
    """
    
    When we receive the matrix with scores, for each user
    sort and get the top k 
    
    """
    
    def predict(self, 
                scores,  # (user,film) score matrix
                rating_matrix=None, # (user,film) rating matrix
                filter_seen=True, 
                k=10):
        
        # filter out films that already have been seen 
        if filter_seen:
            scores = np.multiply(scores,
            np.invert(rating_matrix.astype(bool))
            )

        # scores index : userId column : filmId 

        # get indicies of top k scores (indicies : movieId) in user array
        ind_part = np.argpartition(scores, -k + 1)[:, -k:].copy()  

        # get the values of the top k scores 
        scores_not_sorted = np.take_along_axis(scores, ind_part, axis=1)

        # indicies of scores from lowest to highest 
        ind_sorted = np.argsort(scores_not_sorted, axis=1) # 

        # scores from lowest to highest 
        scores_sorted = np.sort(scores_not_sorted, axis=1) 

        # get the indicies of the movieId with the highest scores
        indices = np.take_along_axis(ind_part, ind_sorted, axis=1)

        # for each user return the movies with the highest scores
        preds = pd.DataFrame({
            self.user_col: range(scores.shape[0]), # each user 
            self.item_col: np.flip(indices, axis=1).tolist(), # movieId index 
            self.rating_col: np.flip(scores_sorted, axis=1).tolist() # movieId score
            })
        
        # convert arrays (user_col,item_col) into rows for each user 
        preds = preds.explode([self.item_col, self.rating_col])

        return preds
class BaseFactorizationModel:
    def __init__(self, random_state=0, user_col='userId', item_col='movieId', rating_col='rating'):
        self.random_state = np.random.RandomState(random_state)
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        self.user_matrix = None
        self.item_matrix = None

    # matrix which we will decompose
    def get_rating_matrix(self, data):
        return pd.pivot_table(
            data,
            values=self.rating_col,
            index=self.user_col,
            columns=self.item_col,
            fill_value=0
            ).values

    
    """
    
    When we receive the matrix with scores, for each user
    sort and get the top k 
    
    """
    
    def predict(self, 
                scores,  # (user,film) score matrix
                rating_matrix=None, # (user,film) rating matrix
                filter_seen=True, 
                k=10):
        
        # filter out films that already have been seen 
        if filter_seen:
            scores = np.multiply(scores,
            np.invert(rating_matrix.astype(bool))
            )

        # scores index : userId column : filmId 

        # get indicies of top k scores (indicies : movieId) in user array
        ind_part = np.argpartition(scores, -k + 1)[:, -k:].copy()  

        # get the values of the top k scores 
        scores_not_sorted = np.take_along_axis(scores, ind_part, axis=1)

        # indicies of scores from lowest to highest 
        ind_sorted = np.argsort(scores_not_sorted, axis=1) # 

        # scores from lowest to highest 
        scores_sorted = np.sort(scores_not_sorted, axis=1) 

        # get the indicies of the movieId with the highest scores
        indices = np.take_along_axis(ind_part, ind_sorted, axis=1)

        # for each user return the movies with the highest scores
        preds = pd.DataFrame({
            self.user_col: range(scores.shape[0]), # each user 
            self.item_col: np.flip(indices, axis=1).tolist(), # movieId index 
            self.rating_col: np.flip(scores_sorted, axis=1).tolist() # movieId score
            })
        
        # convert arrays (user_col,item_col) into rows for each user 
        preds = preds.explode([self.item_col, self.rating_col])

        return preds

SVD¶

SVD из scipy

В базовом методе 1 гиперпараметр n_factors
включает один метод fit; генерирует матрицу рейтингов для каждого уникального пользователя и фильма

Смысл SVD разложения:

Ииспользует методы линейной алгебры для нахождения этих матриц и может быть вычислен с помощью прямых методов, таких как QR-разложение
Мы формируем матрицу (get_rating_matrix) рейтингов фильмов movieId для каждого пользователя userId в выборке
SVD разлагает эту матрицу взаимодейстий пользователей на три матрицы (U,
Оставляя только размерность k (latern features) для матрих U (userId,k) и V (movieID,k) и ΣI (k,k)
Эта апроксимации матрицы U и V захватывают скрытые признаки, представляющие основные закономерности в данных
Если мы не указываем n_factors то мы получаем наиболее точную разложение мартицы рейтинга (get_rating_matrix)

In [41]:

Copied!





class SVD(BaseFactorizationModel):
    
    def __init__(self, 
                 random_state=0, 
                 user_col='userId', 
                 item_col='movieId',
                 n_factors=20): # hyperparameter
        super().__init__(random_state, user_col, item_col)
        self.n_factors = n_factors

    """
    
    Calculate the scores for each user
    
    """
        
    def fit(self, data):
        
        # user,movie rating matrix
        self.rating_matrix = self.get_rating_matrix(data) # (unique users,unique films) ratings
        csr_rating_matrix = csr_matrix(self.rating_matrix.astype(float))
        
        # svd decomposition 
        user_matrix, singular_values, item_matrix = svds(A=csr_rating_matrix, 
                                                         k=self.n_factors)
    
        user_matrix = user_matrix * np.sqrt(singular_values)  # (unique users,k latent features)
        item_matrix = item_matrix.T * np.sqrt(singular_values) # (unique films, k latent features)
        self.scores = user_matrix @ item_matrix.T  # score matrix for each user & film (unique users, unique films)
        
        self.user_matrix = user_matrix
        self.item_matrix = item_matrix
class SVD(BaseFactorizationModel):
    
    def __init__(self, 
                 random_state=0, 
                 user_col='userId', 
                 item_col='movieId',
                 n_factors=20): # hyperparameter
        super().__init__(random_state, user_col, item_col)
        self.n_factors = n_factors

    """
    
    Calculate the scores for each user
    
    """
        
    def fit(self, data):
        
        # user,movie rating matrix
        self.rating_matrix = self.get_rating_matrix(data) # (unique users,unique films) ratings
        csr_rating_matrix = csr_matrix(self.rating_matrix.astype(float))
        
        # svd decomposition 
        user_matrix, singular_values, item_matrix = svds(A=csr_rating_matrix, 
                                                         k=self.n_factors)
    
        user_matrix = user_matrix * np.sqrt(singular_values)  # (unique users,k latent features)
        item_matrix = item_matrix.T * np.sqrt(singular_values) # (unique films, k latent features)
        self.scores = user_matrix @ item_matrix.T  # score matrix for each user & film (unique users, unique films)
        
        self.user_matrix = user_matrix
        self.item_matrix = item_matrix

In [42]:

Copied!

svd_model = SVD()
svd_model.fit(train)
svd_model = SVD()
svd_model.fit(train)

In [43]:

Copied!

preds_svd = svd_model.predict(svd_model.scores,        # user movie scores from svd decomposition
                              svd_model.rating_matrix) # user movie ratings 
preds_svd
preds_svd = svd_model.predict(svd_model.scores,        # user movie scores from svd decomposition
                              svd_model.rating_matrix) # user movie ratings 
preds_svd

Out[43]:

	userId	movieId	rating
0	0	676	3.941808
0	0	677	3.500389
0	0	588	3.180376
0	0	593	3.135802
0	0	1062	2.902454
...	...	...	...
609	609	1158	3.092268
609	609	61	3.092064
609	609	1201	3.08826
609	609	717	3.053321
609	609	1062	3.044532

6100 rows × 3 columns

FunkSVD¶

Теперь попробуем с нуля подход FunkSVD

В отличии от SVD модель обучается при помощи градиентного спуска

Мы обучаем матрицы P, Q (U и Q из SVD)

In [44]:

Copied!





class FunkSVD(BaseFactorizationModel):
    def __init__(self, random_state = 0,
                 user_col='userId', 
                 item_col='movieId',
                 rating_col='rating', 
                 lr = 0.01,  # learning rate
                 reg = 0.05, # coefficient of regularisation
                 n_factors=20, # size of vectors
                 n_epochs=5): # number of iterations
        super().__init__(random_state, user_col, item_col, rating_col)
        self.lr = lr
        self.reg = reg
        self.n_factors = n_factors
        self.n_epochs = n_epochs


    def fit(self, data):
        
        self.rating_matrix = self.get_rating_matrix(data)
        n_users, n_items = self.rating_matrix.shape

        """
        
        Parameter Initialisation
        
        mu : глобальная средняя оценка
        bu : смещение оценок пользователя (на сколько отличается средняя оценка от глобального)
        bi : смещение оценок объектов (на сколько отличается средняя оценка от глобального)
        
        P, Q : матрицы которые мы будем обучать 
        
        """
        mu = data[self.rating_col].mean() # global average rating
        
        # initialise shifts 
        # how much avg rating of user differs from global avg
        bu = np.zeros(n_users)  
        bi = np.zeros(n_items)
        
        # we need to learn the matricies for users and items
        P = self.random_state.normal(size=(n_users, self.n_factors))
        Q = self.random_state.normal(size=(n_items, self.n_factors))

        # number of iterations we will be updating our weights (matrices)
        for _ in tqdm(range(self.n_epochs)):
            
            errs = []
            for _, row in data.iterrows():
                
                # Get ids and ratings
                user_id = row[self.user_col]
                item_id = row[self.item_col]
                rating = row[self.rating_col]

                # Predict current rating
                # global avg + shift of user/item + scalar vector multiplication
                
                pred = mu + bu[user_id] + bi[item_id] + Q[item_id] @ P[user_id]

                # calculate the difference b/w prediction and actual user ratings
                err = rating - pred
                errs.append(abs(err)) 

                """
                
                Update Biases bu/bi
                
                """

                bu[user_id] += self.lr * (err - self.reg * bu[user_id])
                bi[item_id] += self.lr * (err - self.reg * bi[item_id])
                
                """
                
                Update latent factors
                
                """

                P[user_id] += self.lr * (err * Q[item_id] - self.reg * P[user_id])
                Q[item_id] += self.lr * (err * P[user_id] - self.reg * Q[item_id])

            print(round(np.mean(errs), 4))
            
        # restore all data
        self.user_matrix = P # (userId,k)
        self.item_matrix = Q # (itemId,k)
        self.user_bias = bu  # (userId,)
        self.item_bias = bi  # (itemId,)
        self.mu = mu  # global doesn't change
        
        #             [ dont really neeed ]  
        self.scores = mu + bu.reshape(-1, 1) + bi + P @ Q.T
class FunkSVD(BaseFactorizationModel):
    def __init__(self, random_state = 0,
                 user_col='userId', 
                 item_col='movieId',
                 rating_col='rating', 
                 lr = 0.01,  # learning rate
                 reg = 0.05, # coefficient of regularisation
                 n_factors=20, # size of vectors
                 n_epochs=5): # number of iterations
        super().__init__(random_state, user_col, item_col, rating_col)
        self.lr = lr
        self.reg = reg
        self.n_factors = n_factors
        self.n_epochs = n_epochs


    def fit(self, data):
        
        self.rating_matrix = self.get_rating_matrix(data)
        n_users, n_items = self.rating_matrix.shape

        """
        
        Parameter Initialisation
        
        mu : глобальная средняя оценка
        bu : смещение оценок пользователя (на сколько отличается средняя оценка от глобального)
        bi : смещение оценок объектов (на сколько отличается средняя оценка от глобального)
        
        P, Q : матрицы которые мы будем обучать 
        
        """
        mu = data[self.rating_col].mean() # global average rating
        
        # initialise shifts 
        # how much avg rating of user differs from global avg
        bu = np.zeros(n_users)  
        bi = np.zeros(n_items)
        
        # we need to learn the matricies for users and items
        P = self.random_state.normal(size=(n_users, self.n_factors))
        Q = self.random_state.normal(size=(n_items, self.n_factors))

        # number of iterations we will be updating our weights (matrices)
        for _ in tqdm(range(self.n_epochs)):
            
            errs = []
            for _, row in data.iterrows():
                
                # Get ids and ratings
                user_id = row[self.user_col]
                item_id = row[self.item_col]
                rating = row[self.rating_col]

                # Predict current rating
                # global avg + shift of user/item + scalar vector multiplication
                
                pred = mu + bu[user_id] + bi[item_id] + Q[item_id] @ P[user_id]

                # calculate the difference b/w prediction and actual user ratings
                err = rating - pred
                errs.append(abs(err)) 

                """
                
                Update Biases bu/bi
                
                """

                bu[user_id] += self.lr * (err - self.reg * bu[user_id])
                bi[item_id] += self.lr * (err - self.reg * bi[item_id])
                
                """
                
                Update latent factors
                
                """

                P[user_id] += self.lr * (err * Q[item_id] - self.reg * P[user_id])
                Q[item_id] += self.lr * (err * P[user_id] - self.reg * Q[item_id])

            print(round(np.mean(errs), 4))
            
        # restore all data
        self.user_matrix = P # (userId,k)
        self.item_matrix = Q # (itemId,k)
        self.user_bias = bu  # (userId,)
        self.item_bias = bi  # (itemId,)
        self.mu = mu  # global doesn't change
        
        #             [ dont really neeed ]  
        self.scores = mu + bu.reshape(-1, 1) + bi + P @ Q.T

In [45]:

Copied!

funk_model = FunkSVD()
funk_model.fit(train)
funk_model = FunkSVD()
funk_model.fit(train)

 20%|██        | 1/5 [00:06<00:24,  6.10s/it]

1.7948

 40%|████      | 2/5 [00:12<00:18,  6.13s/it]

0.8553

 60%|██████    | 3/5 [00:18<00:12,  6.11s/it]

0.728

 80%|████████  | 4/5 [00:24<00:06,  6.11s/it]

0.6719

100%|██████████| 5/5 [00:30<00:00,  6.13s/it]

0.6401

In [46]:

Copied!

preds_funk = funk_model.predict(funk_model.scores, 
                                funk_model.rating_matrix)
preds_funk = funk_model.predict(funk_model.scores, 
                                funk_model.rating_matrix)

In [47]:

Copied!

preds_funk
preds_funk

Out[47]:

	userId	movieId	rating
0	0	544	5.201155
0	0	677	5.169179
0	0	1042	5.163057
0	0	1016	5.158277
0	0	215	5.151932
...	...	...	...
609	609	6	4.403011
609	609	208	4.38257
609	609	254	4.38018
609	609	270	4.372523
609	609	1107	4.366516

6100 rows × 3 columns

In [48]:

Copied!

k = funk_model.scores.shape[1] # number of movieId
print(k)
k = funk_model.scores.shape[1] # number of movieId
print(k)

In [49]:

Copied!





preds_funk = funk_model.predict(funk_model.scores, 
                                funk_model.rating_matrix,
                                k=k)
test_pred = test.merge(preds_funk, on=['userId', 'movieId'])
preds_funk = funk_model.predict(funk_model.scores, 
                                funk_model.rating_matrix,
                                k=k)
test_pred = test.merge(preds_funk, on=['userId', 'movieId'])

In [50]:

Copied!

test_pred
test_pred

Out[50]:

	index	userId	movieId	rating_x	timestamp	rating_y
0	839	5	40	5.0	845556915	3.611139
1	4160	26	134	5.0	965151428	2.638579
2	4876	29	1014	5.0	1500370457	2.947382
3	5375	36	418	5.0	845927014	3.46253
4	5756	40	730	5.0	1459369130	1.824829
...	...	...	...	...	...	...
112	89049	575	1178	5.0	1358151542	-0.153748
113	89226	577	522	5.0	1300996817	6.002605
114	89245	578	331	5.0	977364909	2.795876
115	89912	583	336	5.0	834988340	9.314807
116	90392	586	913	5.0	953142269	2.811585

117 rows × 6 columns

In [51]:

Copied!

mean_absolute_error(test_pred['rating_x'], test_pred['rating_y'])
mean_absolute_error(test_pred['rating_x'], test_pred['rating_y'])

Out[51]:

1.67207526461794

iALS¶

Метод ALS похож на funcSVD, они оба являются итеративные медоты обучения

Но в ALS мы фиксируем матрицу пользователей userId и обновляем веса itemId
Потом мы фиксируем матрицу itemId и обновляем веса пользователей
Метод хорошо параллелится и можем найти решение явным виде (не методом градиентном спуском)
Повторяем это до схлдимости используя наименших квадратов в обоих подзадач

Отличие iALS от ALS

iALS позволяет выполнять обучение инкрементально; модель может обновляться по мере поступления новых данных без необходимости переобучать всю модель с нуля.
Это особенно полезно в сценариях, где данные поступают непрерывно (например, в реальном времени)
ALS менее эффективный для больших и динамически изменяющихся наборов данных, так как требует полного пересчета матриц пользователей и товаров при каждом обновлении.
iALS позволяет более эффективно использовать вычислительные ресурсы, так как изменения в данных могут быть учтены без полного пересчета модели. Это делает его более подходящим для систем рекомендаций с постоянным потоком новых данных.

In [52]:

Copied!





# create the user movie rating matrix 
base_model = BaseFactorizationModel()
rating_matrix = base_model.get_rating_matrix(train)
train_sparse = sparse.csr_matrix(rating_matrix)
# create the user movie rating matrix 
base_model = BaseFactorizationModel()
rating_matrix = base_model.get_rating_matrix(train)
train_sparse = sparse.csr_matrix(rating_matrix)

In [ ]:

Copied!

Как и `funcALS`, метод имеет способ регуляризации, количетсво итерации и размерность матриц пользователей и 
Как и `funcALS`, метод имеет способ регуляризации, количетсво итерации и размерность матриц пользователей и

In [1]:

Copied!





ials_model = implicit.als.AlternatingLeastSquares(factors=20, 
                                                  regularization=0.1, 
                                                  iterations=50, 
                                                  use_gpu=False)
ials_model.fit((train_sparse).astype('double'))
ials_model = implicit.als.AlternatingLeastSquares(factors=20, 
                                                  regularization=0.1, 
                                                  iterations=50, 
                                                  use_gpu=False)
ials_model.fit((train_sparse).astype('double'))

In [54]:

Copied!





user_vecs = ials_model.user_factors
item_vecs = ials_model.item_factors

# проверим по размерам, что мы действиельно не перепутали users и items
print(train_sparse.shape)
print(user_vecs.shape, item_vecs.shape)
user_vecs = ials_model.user_factors
item_vecs = ials_model.item_factors

# проверим по размерам, что мы действиельно не перепутали users и items
print(train_sparse.shape)
print(user_vecs.shape, item_vecs.shape)

(610, 1235)
(610, 20) (1235, 20)

Предсказание как и раньше через скаларное произвидение матриц user_factors и item_factors

In [55]:

Copied!

scores = user_vecs.dot(item_vecs.T)
scores.shape
scores = user_vecs.dot(item_vecs.T)
scores.shape

Out[55]:

(610, 1235)

In [56]:

Copied!

user_vecs[0]
user_vecs[0]

Out[56]:

array([ 1.7469772 ,  1.7536646 , -0.15816154,  0.3728279 ,  1.2622052 ,
        0.77496713,  1.3928413 ,  1.2806208 , -0.45395368,  0.81654304,
        0.812823  ,  1.0866945 , -0.90293694,  2.4070432 ,  1.5287678 ,
        1.9461825 ,  0.14609063,  0.13025095,  1.2569972 , -1.7232617 ],
      dtype=float32)

In [57]:

Copied!

item_vecs[0]
item_vecs[0]

Out[57]:

array([-0.00182583,  0.13658507, -0.14382875,  0.05055814,  0.09778877,
        0.10126482,  0.03720245,  0.09039041,  0.1098939 , -0.29790142,
       -0.04846857, -0.01657198,  0.03136025, -0.08922204,  0.31706864,
       -0.06648538, -0.03854659,  0.06695224,  0.0011029 , -0.06568323],
      dtype=float32)

In [66]:

Copied!

preds_ials = base_model.predict(scores, rating_matrix)
preds_ials = base_model.predict(scores, rating_matrix)

Оценки Моделей¶

В конце оценим метрики качества всех трех подходов

In [67]:

Copied!





K = [10]
metrics = Experiment(
    [
        NDCG(K),
        MAP(K),
        HitRate(K),
    ],
    test,
    query_column='userId', item_column= 'movieId'
)

metrics.add_result("SVD", preds_svd)
metrics.results

metrics.add_result("FunkSVD", preds_funk)
metrics.results

metrics.add_result("iALS", preds_ials)
metrics.results
K = [10]
metrics = Experiment(
    [
        NDCG(K),
        MAP(K),
        HitRate(K),
    ],
    test,
    query_column='userId', item_column= 'movieId'
)

metrics.add_result("SVD", preds_svd)
metrics.results

metrics.add_result("FunkSVD", preds_funk)
metrics.results

metrics.add_result("iALS", preds_ials)
metrics.results

Out[67]:

	NDCG@10	MAP@10	HitRate@10
SVD	0.057592	0.036226	0.128205
FunkSVD	0.002573	0.000950	0.008547
iALS	0.059676	0.042165	0.119658