In [1]:
Copied!
!pip install lightfm --quiet
!pip install lightfm --quiet
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.4/316.4 kB 5.8 MB/s eta 0:00:00 Preparing metadata (setup.py) ... done Building wheel for lightfm (setup.py) ... done
In [2]:
Copied!
from lightfm import LightFM
from lightfm.data import Dataset as LFMDataset
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from scipy.sparse import csr_matrix, diags
from scipy.sparse.linalg import svds
from tqdm import tqdm
from lightfm import LightFM
from lightfm.data import Dataset as LFMDataset
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from scipy.sparse import csr_matrix, diags
from scipy.sparse.linalg import svds
from tqdm import tqdm
In [3]:
Copied!
path_items = '/kaggle/input/mts-library/items.csv'
path_mts_lib = '/kaggle/input/mts-library/mts_lib.csv'
path_users = '/kaggle/input/mts-library/users.csv'
path_items = '/kaggle/input/mts-library/items.csv'
path_mts_lib = '/kaggle/input/mts-library/mts_lib.csv'
path_users = '/kaggle/input/mts-library/users.csv'
In [4]:
Copied!
# Функция для подсчета precision
# y_pred item_id
# idx [a,b,c] [a,f,d]
def precision(df: pd.DataFrame,
pred_col='y_pred',
true_col='item_id',
k=20) -> float:
precision_values = []
for _, row in df.iterrows():
num_relevant = len(set(row[true_col]) & set(row[pred_col][:k]))
num_true = min(k, len(row[true_col]))
precision_values.append(num_relevant / num_true)
return np.mean(precision_values)
# Функция для подсчета precision
# y_pred item_id
# idx [a,b,c] [a,f,d]
def precision(df: pd.DataFrame,
pred_col='y_pred',
true_col='item_id',
k=20) -> float:
precision_values = []
for _, row in df.iterrows():
num_relevant = len(set(row[true_col]) & set(row[pred_col][:k]))
num_true = min(k, len(row[true_col]))
precision_values.append(num_relevant / num_true)
return np.mean(precision_values)
Контентные методы рекомендаций¶
1 | Задание¶
- Продолжим работу с данными mts-library: https://www.kaggle.com/datasets/sharthz23/mts-library
- Обучите LightFM только на взаимодействиях и только на признаках пользователей и айтемов.
- Сравните качество полученных моделей, а также модели, рассмотренной на занятии, по метрике precision@20
- Напишите выводы о проделанной работе: в каком случае качество модели оказалось лучше, почему.
2 | Чтение данных¶
Взаимодействии пользоватетей и предметов
In [5]:
Copied!
df = pd.read_csv(path_mts_lib)
df.head()
df = pd.read_csv(path_mts_lib)
df.head()
Out[5]:
user_id | item_id | progress | rating | start_date | |
---|---|---|---|---|---|
0 | 126706 | 14433 | 80 | NaN | 2018-01-01 |
1 | 127290 | 140952 | 58 | NaN | 2018-01-01 |
2 | 66991 | 198453 | 89 | NaN | 2018-01-01 |
3 | 46791 | 83486 | 23 | 5.0 | 2018-01-01 |
4 | 79313 | 188770 | 88 | 5.0 | 2018-01-01 |
Возмем данные только для 2018 год
In [6]:
Copied!
df['start_date'] = df['start_date'].astype('datetime64[ns]')
df = df[(df['start_date'] > '2018-01-01') & (df['start_date'] < '2019-01-01')]
print(df['start_date'].min())
print(df['start_date'].max())
df['start_date'] = df['start_date'].astype('datetime64[ns]')
df = df[(df['start_date'] > '2018-01-01') & (df['start_date'] < '2019-01-01')]
print(df['start_date'].min())
print(df['start_date'].max())
2018-01-02 00:00:00 2018-12-31 00:00:00
In [7]:
Copied!
# Загружаем признаки для пользователей и предметов
u_features = pd.read_csv(path_users)
i_features = pd.read_csv(path_items)
i_features.rename(columns={'id': 'item_id'}, inplace=True)
# Загружаем признаки для пользователей и предметов
u_features = pd.read_csv(path_users)
i_features = pd.read_csv(path_items)
i_features.rename(columns={'id': 'item_id'}, inplace=True)
In [8]:
Copied!
display(u_features.head())
display(u_features.head())
user_id | age | sex | |
---|---|---|---|
0 | 1 | 45_54 | NaN |
1 | 2 | 18_24 | 0.0 |
2 | 3 | 65_inf | 0.0 |
3 | 4 | 18_24 | 0.0 |
4 | 5 | 35_44 | 0.0 |
In [9]:
Copied!
# Признаци фильмов
display(i_features.head())
# Признаци фильмов
display(i_features.head())
item_id | title | genres | authors | year | |
---|---|---|---|---|---|
0 | 128115 | Ворон-челобитчик | Зарубежные детские книги,Сказки,Зарубежная кла... | Михаил Салтыков-Щедрин | 1886 |
1 | 210979 | Скрипка Ротшильда | Классическая проза,Литература 19 века,Русская ... | Антон Чехов | 1894 |
2 | 95632 | Испорченные дети | Зарубежная классика,Классическая проза,Литерат... | Михаил Салтыков-Щедрин | 1869 |
3 | 247906 | Странный человек | Пьесы и драматургия,Литература 19 века | Михаил Лермонтов | 1831 |
4 | 294280 | Господа ташкентцы | Зарубежная классика,Классическая проза,Литерат... | Михаил Салтыков-Щедрин | 1873 |
In [10]:
Copied!
# удаляем дубликаты, оставляя последний по времени
df = df.sort_values('start_date').drop_duplicates(subset=['user_id', 'item_id'],
keep='last')
# удаляем дубликаты, оставляя последний по времени
df = df.sort_values('start_date').drop_duplicates(subset=['user_id', 'item_id'],
keep='last')
In [11]:
Copied!
# Используем только взаимодействия у которых 30% просмотра
df = df[df['progress'] > 30]
# Отфильтруем данные
def filter_data(df, user_count=20, item_count=30):
item_counts = df.groupby('item_id')['user_id'].count()
pop_items = item_counts[item_counts >= user_count]
df_implicit = df[df['item_id'].isin(pop_items.index)]
user_counts = df.groupby('user_id')['item_id'].count()
pop_users = user_counts[user_counts >= item_count]
df = df[df['user_id'].isin(pop_users.index)].copy()
return df
df = filter_data(df,
user_count=20,
item_count=20)
df.iloc[1,:]
# Используем только взаимодействия у которых 30% просмотра
df = df[df['progress'] > 30]
# Отфильтруем данные
def filter_data(df, user_count=20, item_count=30):
item_counts = df.groupby('item_id')['user_id'].count()
pop_items = item_counts[item_counts >= user_count]
df_implicit = df[df['item_id'].isin(pop_items.index)]
user_counts = df.groupby('user_id')['item_id'].count()
pop_users = user_counts[user_counts >= item_count]
df = df[df['user_id'].isin(pop_users.index)].copy()
return df
df = filter_data(df,
user_count=20,
item_count=20)
df.iloc[1,:]
Out[11]:
user_id 85673 item_id 210979 progress 100 rating 5.0 start_date 2018-01-02 00:00:00 Name: 3536, dtype: object
In [12]:
Copied!
# Логическая проверка; в взаимодействии должны быть фильмы только из данных фичей
i_features = i_features[i_features['item_id'].isin(df['item_id'])].copy()
u_features = u_features[u_features['user_id'].isin(df['user_id'])].copy()
print(i_features.shape,u_features.shape)
# Логическая проверка; в взаимодействии должны быть фильмы только из данных фичей
i_features = i_features[i_features['item_id'].isin(df['item_id'])].copy()
u_features = u_features[u_features['user_id'].isin(df['user_id'])].copy()
print(i_features.shape,u_features.shape)
(17512, 5) (1482, 3)
In [13]:
Copied!
# Создаем новый порядок для пользователей и предметов
user_idx = df.user_id.astype('category').cat.codes
item_idx = df.item_id.astype('category').cat.codes
user2id = dict(zip(df.user_id, user_idx))
item2id = dict(zip(df.item_id, item_idx))
# Преобразуем ids
df.user_id = df.user_id.map(user2id)
df.item_id = df.item_id.map(item2id)
i_features.item_id = i_features.item_id.map(item2id)
u_features.user_id = u_features.user_id.map(user2id)
# Создаем новый порядок для пользователей и предметов
user_idx = df.user_id.astype('category').cat.codes
item_idx = df.item_id.astype('category').cat.codes
user2id = dict(zip(df.user_id, user_idx))
item2id = dict(zip(df.item_id, item_idx))
# Преобразуем ids
df.user_id = df.user_id.map(user2id)
df.item_id = df.item_id.map(item2id)
i_features.item_id = i_features.item_id.map(item2id)
u_features.user_id = u_features.user_id.map(user2id)
In [14]:
Copied!
df['user_id'] = df['user_id'].apply(lambda x: 'user ' + str(x))
df['item_id'] = df['item_id'].apply(lambda x: 'item ' + str(x))
i_features['item_id'] = i_features['item_id'].apply(lambda x: 'item ' + str(x))
u_features['user_id'] = u_features['user_id'].apply(lambda x: 'user ' + str(x))
df.head()
df['user_id'] = df['user_id'].apply(lambda x: 'user ' + str(x))
df['item_id'] = df['item_id'].apply(lambda x: 'item ' + str(x))
i_features['item_id'] = i_features['item_id'].apply(lambda x: 'item ' + str(x))
u_features['user_id'] = u_features['user_id'].apply(lambda x: 'user ' + str(x))
df.head()
Out[14]:
user_id | item_id | progress | rating | start_date | |
---|---|---|---|---|---|
3555 | user 921 | item 2116 | 100 | 5.0 | 2018-01-02 |
3536 | user 848 | item 11499 | 100 | 5.0 | 2018-01-02 |
3577 | user 848 | item 6289 | 100 | 5.0 | 2018-01-02 |
3570 | user 606 | item 6907 | 92 | NaN | 2018-01-02 |
3567 | user 921 | item 7631 | 100 | 5.0 | 2018-01-02 |
In [15]:
Copied!
# Разбиваем данные на две выборки
def train_test_split(X, user_col, time_col):
full_history = X.sort_values([user_col, time_col]).groupby(user_col)
test = full_history.tail(1)
train = full_history.head(-1)
return train, test
train, test = train_test_split(df, 'user_id', 'start_date')
# Разбиваем данные на две выборки
def train_test_split(X, user_col, time_col):
full_history = X.sort_values([user_col, time_col]).groupby(user_col)
test = full_history.tail(1)
train = full_history.head(-1)
return train, test
train, test = train_test_split(df, 'user_id', 'start_date')
In [16]:
Copied!
print('train',train.shape)
print('test',test.shape)
print('train tmin ',train['start_date'].min(),' tmax ',train['start_date'].max())
print('test tmin ',test['start_date'].min(),' tmax ',test['start_date'].max())
print('train',train.shape)
print('test',test.shape)
print('train tmin ',train['start_date'].min(),' tmax ',train['start_date'].max())
print('test tmin ',test['start_date'].min(),' tmax ',test['start_date'].max())
train (77164, 5) test (1565, 5) train tmin 2018-01-02 00:00:00 tmax 2018-12-31 00:00:00 test tmin 2018-01-30 00:00:00 tmax 2018-12-31 00:00:00
In [17]:
Copied!
# Количество уникальных фильмов которые мы можем порекомендовать после фильтрации
len(train['item_id'].unique())
# Количество уникальных фильмов которые мы можем порекомендовать после фильтрации
len(train['item_id'].unique())
Out[17]:
17148
In [18]:
Copied!
# user features
u_features.set_index('user_id', inplace=True)
# merge features into one column; merging column name with column value
u_features_list = u_features.apply(
lambda feature_values: [f'{feature}_{feature_values[feature]}' for feature in feature_values.index if not pd.isna(feature_values[feature])],
axis=1)
u_features_list = u_features_list.rename('features')
u_features_list
# user features
u_features.set_index('user_id', inplace=True)
# merge features into one column; merging column name with column value
u_features_list = u_features.apply(
lambda feature_values: [f'{feature}_{feature_values[feature]}' for feature in feature_values.index if not pd.isna(feature_values[feature])],
axis=1)
u_features_list = u_features_list.rename('features')
u_features_list
Out[18]:
user_id user 0 [age_35_44, sex_0.0] user 1 [age_18_24, sex_0.0] user 2 [age_18_24, sex_0.0] user 3 [age_65_inf, sex_1.0] user 4 [age_55_64, sex_0.0] ... user 1560 [age_18_24, sex_0.0] user 1561 [age_18_24, sex_1.0] user 1562 [age_45_54, sex_1.0] user 1563 [age_55_64, sex_1.0] user 1564 [age_35_44, sex_0.0] Name: features, Length: 1482, dtype: object
In [19]:
Copied!
# all unique combinations for user features
user_tags = set(u_features_list.explode().dropna().values)
user_tags
# all unique combinations for user features
user_tags = set(u_features_list.explode().dropna().values)
user_tags
Out[19]:
{'age_18_24', 'age_25_34', 'age_35_44', 'age_45_54', 'age_55_64', 'age_65_inf', 'sex_0.0', 'sex_1.0'}
(b) признаки предметов¶
Имея genres
, посчитаем статистику по жанрам и оставим только топ 100 жанров
In [20]:
Copied!
# i_features['item_id'].value_counts()
i_features_lfm = i_features.copy()
i_features_lfm.set_index('item_id', inplace=True) # each row represents a unique item feature
# from interactions add the number of reads of the particular item
i_features_lfm['reads'] = df.groupby('item_id')['user_id'].count()
# column genres contains genres separated by ,
i_features_lfm['genres'] = i_features_lfm['genres'].str.lower().str.split(',')
i_features_lfm['genres'] = i_features_lfm['genres'].apply(lambda x: x if isinstance(x, list) else [])
i_features_lfm.head()
# i_features['item_id'].value_counts()
i_features_lfm = i_features.copy()
i_features_lfm.set_index('item_id', inplace=True) # each row represents a unique item feature
# from interactions add the number of reads of the particular item
i_features_lfm['reads'] = df.groupby('item_id')['user_id'].count()
# column genres contains genres separated by ,
i_features_lfm['genres'] = i_features_lfm['genres'].str.lower().str.split(',')
i_features_lfm['genres'] = i_features_lfm['genres'].apply(lambda x: x if isinstance(x, list) else [])
i_features_lfm.head()
Out[20]:
title | genres | authors | year | reads | |
---|---|---|---|---|---|
item_id | |||||
item 7037 | Ворон-челобитчик | [зарубежные детские книги, сказки, зарубежная ... | Михаил Салтыков-Щедрин | 1886 | 1 |
item 11499 | Скрипка Ротшильда | [классическая проза, литература 19 века, русск... | Антон Чехов | 1894 | 1 |
item 7296 | Соседи | [зарубежные детские книги, сказки, русская кла... | Михаил Салтыков-Щедрин | NaN | 1 |
item 4125 | Ярмарка тщеславия | [зарубежная классика, зарубежная старинная лит... | Уильям Теккерей | 1848 | 3 |
item 15550 | Хитрость | [зарубежная классика, классическая проза] | Ги де Мопассан | NaN | 2 |
In [21]:
Copied!
# count the number of genre references in column genres
genres_count = i_features_lfm[['genres', 'reads']].explode('genres').groupby('genres')['reads'].sum()
# count the number of genre references in column genres
genres_count = i_features_lfm[['genres', 'reads']].explode('genres').groupby('genres')['reads'].sum()
In [22]:
Copied!
# the top genres in the user/item interactions
genres_count.sort_values(ascending=False)
# the top genres in the user/item interactions
genres_count.sort_values(ascending=False)
Out[22]:
genres любовное фэнтези 18595 попаданцы 12132 героическое фэнтези 9038 современные детективы 8018 магические академии 6674 ... растениеводство 1 коммерческое право 1 молитвы в исламе 1 международное право 1 астрономия 1 Name: reads, Length: 455, dtype: int64
In [23]:
Copied!
# item_tags : top n genres by read count
n = 100
item_tags = genres_count.sort_values(ascending=False)[:n].index
item_tags[:30]
# item_tags : top n genres by read count
n = 100
item_tags = genres_count.sort_values(ascending=False)[:n].index
item_tags[:30]
Out[23]:
Index(['любовное фэнтези', 'попаданцы', 'героическое фэнтези', 'современные детективы', 'магические академии', 'боевая фантастика', 'книги про волшебников', 'боевое фэнтези', 'зарубежные любовные романы', 'современные любовные романы', 'иронические детективы', 'юмористическое фэнтези', 'остросюжетные любовные романы', 'космическая фантастика', 'современная русская литература', 'короткие любовные романы', 'зарубежные детективы', 'детективное фэнтези', 'триллеры', 'городское фэнтези', 'любовно-фантастические романы', 'историческая фантастика', 'научная фантастика', 'эротические романы', 'полицейские детективы', 'героическая фантастика', 'мистика', 'фэнтези про драконов', 'социальная фантастика', 'исторические детективы'], dtype='object', name='genres')
In [24]:
Copied!
# filter the column genres list to include only the top 50 genres
def filter_genres(genres_list, valid_genres=None):
if not genres_list:
return []
return [genre for genre in genres_list if genre in valid_genres]
i_features_lfm['features'] = i_features_lfm['genres'].apply(filter_genres, valid_genres=set(item_tags))
i_features_lfm.head()
# filter the column genres list to include only the top 50 genres
def filter_genres(genres_list, valid_genres=None):
if not genres_list:
return []
return [genre for genre in genres_list if genre in valid_genres]
i_features_lfm['features'] = i_features_lfm['genres'].apply(filter_genres, valid_genres=set(item_tags))
i_features_lfm.head()
Out[24]:
title | genres | authors | year | reads | features | |
---|---|---|---|---|---|---|
item_id | ||||||
item 7037 | Ворон-челобитчик | [зарубежные детские книги, сказки, зарубежная ... | Михаил Салтыков-Щедрин | 1886 | 1 | [зарубежные детские книги, сказки, зарубежная ... |
item 11499 | Скрипка Ротшильда | [классическая проза, литература 19 века, русск... | Антон Чехов | 1894 | 1 | [классическая проза, литература 19 века, русск... |
item 7296 | Соседи | [зарубежные детские книги, сказки, русская кла... | Михаил Салтыков-Щедрин | NaN | 1 | [зарубежные детские книги, сказки, русская кла... |
item 4125 | Ярмарка тщеславия | [зарубежная классика, зарубежная старинная лит... | Уильям Теккерей | 1848 | 3 | [зарубежная классика, классическая проза, лите... |
item 15550 | Хитрость | [зарубежная классика, классическая проза] | Ги де Мопассан | NaN | 2 | [зарубежная классика, классическая проза] |
In [25]:
Copied!
i_features_list = i_features_lfm['features']
i_features_list
i_features_list = i_features_lfm['features']
i_features_list
Out[25]:
item_id item 7037 [зарубежные детские книги, сказки, зарубежная ... item 11499 [классическая проза, литература 19 века, русск... item 7296 [зарубежные детские книги, сказки, русская кла... item 4125 [зарубежная классика, классическая проза, лите... item 15550 [зарубежная классика, классическая проза] ... item 14903 [эротические романы, короткие любовные романы,... item 12570 [политология, газеты] item 3146 [журнальные издания] item 16184 [политология, газеты] item 3477 [газеты] Name: features, Length: 17512, dtype: object
In [26]:
Copied!
print(user_tags)
print(item_tags[:10])
print(user_tags)
print(item_tags[:10])
{'sex_0.0', 'age_35_44', 'age_25_34', 'age_55_64', 'age_45_54', 'age_65_inf', 'age_18_24', 'sex_1.0'} Index(['любовное фэнтези', 'попаданцы', 'героическое фэнтези', 'современные детективы', 'магические академии', 'боевая фантастика', 'книги про волшебников', 'боевое фэнтези', 'зарубежные любовные романы', 'современные любовные романы'], dtype='object', name='genres')
In [27]:
Copied!
'''
Create LightFM dataset
'''
lfm_dataset = LFMDataset()
# unique value in interactions (user,items)
lfm_dataset.fit_partial(users=df['user_id'].unique(),
items=df['item_id'].unique())
lfm_dataset.fit_partial(user_features=user_tags,
item_features=item_tags)
# мапперы для id LightFM и id данных
user_mapping, item_mapping = lfm_dataset.mapping()[0], lfm_dataset.mapping()[2]
inv_user_mapping = {value: key for key, value in user_mapping.items()}
inv_item_mapping = {value: key for key, value in item_mapping.items()}
'''
Build features
'''
sparse_i_features = lfm_dataset.build_item_features([[row.item_id, row.features] for row in i_features_list.reset_index().itertuples()])
sparse_u_features = lfm_dataset.build_user_features([[row.user_id, row.features] for row in u_features_list.reset_index().itertuples()])
(interactions, weights) = lfm_dataset.build_interactions([(row.user_id, row.item_id, row.progress) for row in train.itertuples()])
print('interactions',interactions.shape)
print('interactions data',interactions.data[:10])
print('interactions weight', weights.data[:10])
'''
Train Model
'''
lightfm = LightFM(no_components=20,
loss='warp')
lightfm.fit(interactions,
user_features=sparse_u_features,
item_features=sparse_i_features,
epochs=40, num_threads=8)
'''
Create LightFM dataset
'''
lfm_dataset = LFMDataset()
# unique value in interactions (user,items)
lfm_dataset.fit_partial(users=df['user_id'].unique(),
items=df['item_id'].unique())
lfm_dataset.fit_partial(user_features=user_tags,
item_features=item_tags)
# мапперы для id LightFM и id данных
user_mapping, item_mapping = lfm_dataset.mapping()[0], lfm_dataset.mapping()[2]
inv_user_mapping = {value: key for key, value in user_mapping.items()}
inv_item_mapping = {value: key for key, value in item_mapping.items()}
'''
Build features
'''
sparse_i_features = lfm_dataset.build_item_features([[row.item_id, row.features] for row in i_features_list.reset_index().itertuples()])
sparse_u_features = lfm_dataset.build_user_features([[row.user_id, row.features] for row in u_features_list.reset_index().itertuples()])
(interactions, weights) = lfm_dataset.build_interactions([(row.user_id, row.item_id, row.progress) for row in train.itertuples()])
print('interactions',interactions.shape)
print('interactions data',interactions.data[:10])
print('interactions weight', weights.data[:10])
'''
Train Model
'''
lightfm = LightFM(no_components=20,
loss='warp')
lightfm.fit(interactions,
user_features=sparse_u_features,
item_features=sparse_i_features,
epochs=40, num_threads=8)
interactions (1565, 17512) interactions data [1 1 1 1 1 1 1 1 1 1] interactions weight [ 64. 100. 35. 42. 100. 100. 100. 100. 100. 100.]
Out[27]:
<lightfm.lightfm.LightFM at 0x7c479fc3b790>
In [28]:
Copied!
def inference(model,
user_id,
top_k=10,
user_features=None,
item_features=None):
# user_ids -> LightFM id
pred = model.predict(user_ids=user_id, # user_id LightFM
item_ids=sorted(item_mapping.values()), # Все item_id LightFM
user_features=sparse_u_features,
item_features=sparse_i_features)
k = top_k
ids = np.argpartition(pred, - k)[- k:]
rel = pred[ids]
res = pd.DataFrame(zip(ids, rel),
columns=['y_pred', 'relevance'])
# res['item_id'] = res['item_id'].map(inv_item_mapping)
res['user_id'] = [user_id for i in range(0,res.shape[0])]
return res
def inference(model,
user_id,
top_k=10,
user_features=None,
item_features=None):
# user_ids -> LightFM id
pred = model.predict(user_ids=user_id, # user_id LightFM
item_ids=sorted(item_mapping.values()), # Все item_id LightFM
user_features=sparse_u_features,
item_features=sparse_i_features)
k = top_k
ids = np.argpartition(pred, - k)[- k:]
rel = pred[ids]
res = pd.DataFrame(zip(ids, rel),
columns=['y_pred', 'relevance'])
# res['item_id'] = res['item_id'].map(inv_item_mapping)
res['user_id'] = [user_id for i in range(0,res.shape[0])]
return res
In [29]:
Copied!
lst_inference = []
# Для всех user_id из train (user X)
for user in tqdm(sorted(user_mapping.values())):
lst_inference.append(inference(model=lightfm,
user_id=user,
top_k=20))
df_inference = pd.concat(lst_inference)
lst_inference = []
# Для всех user_id из train (user X)
for user in tqdm(sorted(user_mapping.values())):
lst_inference.append(inference(model=lightfm,
user_id=user,
top_k=20))
df_inference = pd.concat(lst_inference)
100%|██████████| 1565/1565 [00:07<00:00, 196.68it/s]
In [30]:
Copied!
df_inference.head()
df_inference.head()
Out[30]:
y_pred | relevance | user_id | |
---|---|---|---|
0 | 10 | -47.367077 | 0 |
1 | 1542 | -47.349598 | 0 |
2 | 6356 | -47.328232 | 0 |
3 | 930 | -47.339104 | 0 |
4 | 2938 | -47.288143 | 0 |
In [31]:
Copied!
# Преобразует LightFM item_id -> df item_id
def map_predictions(x):
return inv_item_mapping[x['y_pred']]
# Преобразует LightFM user_id -> df user_id
def map_users(x):
return inv_user_mapping[x['user_id']]
df_inference['y_pred'] = df_inference.apply(map_predictions,axis=1)
df_inference['user_id'] = df_inference.apply(map_users,axis=1)
# aggregations for [user_id]
agg_inf = df_inference.groupby('user_id')['y_pred'].apply(lambda x: list(x)).to_frame()
agg_train = train.groupby('user_id')['item_id'].apply(lambda x: list(x)).to_frame()
merged_inf = agg_inf.merge(agg_train,left_index=True,right_index=True)
merged_inf.head()
# Преобразует LightFM item_id -> df item_id
def map_predictions(x):
return inv_item_mapping[x['y_pred']]
# Преобразует LightFM user_id -> df user_id
def map_users(x):
return inv_user_mapping[x['user_id']]
df_inference['y_pred'] = df_inference.apply(map_predictions,axis=1)
df_inference['user_id'] = df_inference.apply(map_users,axis=1)
# aggregations for [user_id]
agg_inf = df_inference.groupby('user_id')['y_pred'].apply(lambda x: list(x)).to_frame()
agg_train = train.groupby('user_id')['item_id'].apply(lambda x: list(x)).to_frame()
merged_inf = agg_inf.merge(agg_train,left_index=True,right_index=True)
merged_inf.head()
Out[31]:
y_pred | item_id | |
---|---|---|
user_id | ||
user 0 | [item 1729, item 10171, item 3252, item 9847, ... | [item 8391, item 11677, item 9991, item 13581,... |
user 1 | [item 260, item 10469, item 2469, item 3398, i... | [item 15894, item 10185, item 7475, item 13534... |
user 10 | [item 15456, item 9481, item 4596, item 8656, ... | [item 13448, item 2236, item 5657, item 14069,... |
user 100 | [item 14037, item 13055, item 8543, item 14616... | [item 11373, item 7555, item 8133, item 2662, ... |
user 1000 | [item 17039, item 13021, item 8513, item 669, ... | [item 9829, item 5744, item 10201, item 7491, ... |
In [32]:
Copied!
# Посчитаем precision
print('precision k@20:',round(precision(merged_inf,
pred_col='y_pred',
true_col='item_id',
k=20),4))
# Посчитаем precision
print('precision k@20:',round(precision(merged_inf,
pred_col='y_pred',
true_col='item_id',
k=20),4))
precision k@20: 0.1853
In [33]:
Copied!
'''
Create LightFM dataset
'''
lfm_dataset = LFMDataset()
lfm_dataset.fit_partial(users=df['user_id'].unique(),
items=df['item_id'].unique())
lfm_dataset.fit_partial(user_features=user_tags,
item_features=item_tags)
user_mapping, item_mapping = lfm_dataset.mapping()[0], lfm_dataset.mapping()[2]
inv_user_mapping = {value: key for key, value in user_mapping.items()}
inv_item_mapping = {value: key for key, value in item_mapping.items()}
print('interactions shape')
print(lfm_dataset.interactions_shape())
'''
Build features (Interactions Only)
'''
# train - interactions dataset
(interactions, weights) = lfm_dataset.build_interactions([(row.user_id, row.item_id, row.progress) for row in train.itertuples()])
print('interactions',interactions.shape)
print('interactions data',interactions.data[:10])
print('interactions weight', weights.data[:10])
'''
Train Model
'''
lightfm_inter = LightFM(no_components=20,
loss='warp')
lightfm_inter.fit(interactions,
epochs=40,
num_threads=8)
'''
Create LightFM dataset
'''
lfm_dataset = LFMDataset()
lfm_dataset.fit_partial(users=df['user_id'].unique(),
items=df['item_id'].unique())
lfm_dataset.fit_partial(user_features=user_tags,
item_features=item_tags)
user_mapping, item_mapping = lfm_dataset.mapping()[0], lfm_dataset.mapping()[2]
inv_user_mapping = {value: key for key, value in user_mapping.items()}
inv_item_mapping = {value: key for key, value in item_mapping.items()}
print('interactions shape')
print(lfm_dataset.interactions_shape())
'''
Build features (Interactions Only)
'''
# train - interactions dataset
(interactions, weights) = lfm_dataset.build_interactions([(row.user_id, row.item_id, row.progress) for row in train.itertuples()])
print('interactions',interactions.shape)
print('interactions data',interactions.data[:10])
print('interactions weight', weights.data[:10])
'''
Train Model
'''
lightfm_inter = LightFM(no_components=20,
loss='warp')
lightfm_inter.fit(interactions,
epochs=40,
num_threads=8)
interactions shape (1565, 17512) interactions (1565, 17512) interactions data [1 1 1 1 1 1 1 1 1 1] interactions weight [ 64. 100. 35. 42. 100. 100. 100. 100. 100. 100.]
Out[33]:
<lightfm.lightfm.LightFM at 0x7c479ae5fa00>
In [34]:
Copied!
def inference(model,
user_id,
top_k=10,
user_features=None,
item_features=None):
# user_ids -> LightFM id
pred = model.predict(user_ids=user_id, # user_id LightFM
item_ids=sorted(item_mapping.values())) # Все item_id LightFM
k = top_k
ids = np.argpartition(pred, - k)[- k:]
rel = pred[ids]
res = pd.DataFrame(zip(ids, rel),
columns=['y_pred', 'relevance'])
# res['item_id'] = res['item_id'].map(inv_item_mapping)
res['user_id'] = [user_id for i in range(0,res.shape[0])]
return res
def inference(model,
user_id,
top_k=10,
user_features=None,
item_features=None):
# user_ids -> LightFM id
pred = model.predict(user_ids=user_id, # user_id LightFM
item_ids=sorted(item_mapping.values())) # Все item_id LightFM
k = top_k
ids = np.argpartition(pred, - k)[- k:]
rel = pred[ids]
res = pd.DataFrame(zip(ids, rel),
columns=['y_pred', 'relevance'])
# res['item_id'] = res['item_id'].map(inv_item_mapping)
res['user_id'] = [user_id for i in range(0,res.shape[0])]
return res
In [35]:
Copied!
lst_inference = []
for user in tqdm(sorted(user_mapping.values())):
lst_inference.append(inference(model=lightfm_inter,
user_id=user,
top_k=20))
df_inference_inter = pd.concat(lst_inference)
lst_inference = []
for user in tqdm(sorted(user_mapping.values())):
lst_inference.append(inference(model=lightfm_inter,
user_id=user,
top_k=20))
df_inference_inter = pd.concat(lst_inference)
100%|██████████| 1565/1565 [00:06<00:00, 248.41it/s]
In [36]:
Copied!
df_inference_inter.head()
df_inference_inter.head()
Out[36]:
y_pred | relevance | user_id | |
---|---|---|---|
0 | 2956 | 1.709743 | 0 |
1 | 111 | 1.717322 | 0 |
2 | 61 | 1.762205 | 0 |
3 | 272 | 1.861350 | 0 |
4 | 3001 | 1.842448 | 0 |
In [37]:
Copied!
def map_predictions(x):
return inv_item_mapping[x['y_pred']]
def map_users(x):
return inv_user_mapping[x['user_id']]
df_inference_inter['y_pred'] = df_inference_inter.apply(map_predictions,axis=1)
df_inference_inter['user_id'] = df_inference_inter.apply(map_users,axis=1)
df_inference_inter.head()
def map_predictions(x):
return inv_item_mapping[x['y_pred']]
def map_users(x):
return inv_user_mapping[x['user_id']]
df_inference_inter['y_pred'] = df_inference_inter.apply(map_predictions,axis=1)
df_inference_inter['user_id'] = df_inference_inter.apply(map_users,axis=1)
df_inference_inter.head()
Out[37]:
y_pred | relevance | user_id | |
---|---|---|---|
0 | item 16997 | 1.709743 | user 921 |
1 | item 16554 | 1.717322 | user 921 |
2 | item 1573 | 1.762205 | user 921 |
3 | item 2298 | 1.861350 | user 921 |
4 | item 12542 | 1.842448 | user 921 |
In [38]:
Copied!
agg_inf_inter = df_inference_inter.groupby('user_id')['y_pred'].apply(lambda x: list(x)).to_frame()
agg_train = train.groupby('user_id')['item_id'].apply(lambda x: list(x)).to_frame()
merged_inf_inter = agg_inf_inter.merge(agg_train,left_index=True,right_index=True)
agg_inf_inter = df_inference_inter.groupby('user_id')['y_pred'].apply(lambda x: list(x)).to_frame()
agg_train = train.groupby('user_id')['item_id'].apply(lambda x: list(x)).to_frame()
merged_inf_inter = agg_inf_inter.merge(agg_train,left_index=True,right_index=True)
In [39]:
Copied!
print('precision k@20:',round(precision(merged_inf_inter,
pred_col='y_pred',
true_col='item_id',
k=20),4))
print('precision k@20:',round(precision(merged_inf_inter,
pred_col='y_pred',
true_col='item_id',
k=20),4))
precision k@20: 0.4533
In [40]:
Copied!
'''
Create LightFM dataset
'''
lfm_dataset = LFMDataset()
lfm_dataset.fit_partial(users=df['user_id'].unique(),
items=df['item_id'].unique())
lfm_dataset.fit_partial(user_features=user_tags,
item_features=item_tags)
user_mapping, item_mapping = lfm_dataset.mapping()[0], lfm_dataset.mapping()[2]
inv_user_mapping = {value: key for key, value in user_mapping.items()}
inv_item_mapping = {value: key for key, value in item_mapping.items()}
num_users, num_items = lfm_dataset.interactions_shape()
'''
Build features
'''
sparse_i_features = lfm_dataset.build_item_features([[row.item_id, row.features] for row in i_features_list.reset_index().itertuples()])
sparse_u_features = lfm_dataset.build_user_features([[row.user_id, row.features] for row in u_features_list.reset_index().itertuples()])
'''
Train Model
'''
dummy = csr_matrix(np.zeros((num_users, num_items)))
lightfm_feat = LightFM(no_components=20,
loss='warp')
lightfm_feat.fit(interactions=dummy,
user_features=sparse_u_features,
item_features=sparse_i_features,
epochs=40)
'''
Create LightFM dataset
'''
lfm_dataset = LFMDataset()
lfm_dataset.fit_partial(users=df['user_id'].unique(),
items=df['item_id'].unique())
lfm_dataset.fit_partial(user_features=user_tags,
item_features=item_tags)
user_mapping, item_mapping = lfm_dataset.mapping()[0], lfm_dataset.mapping()[2]
inv_user_mapping = {value: key for key, value in user_mapping.items()}
inv_item_mapping = {value: key for key, value in item_mapping.items()}
num_users, num_items = lfm_dataset.interactions_shape()
'''
Build features
'''
sparse_i_features = lfm_dataset.build_item_features([[row.item_id, row.features] for row in i_features_list.reset_index().itertuples()])
sparse_u_features = lfm_dataset.build_user_features([[row.user_id, row.features] for row in u_features_list.reset_index().itertuples()])
'''
Train Model
'''
dummy = csr_matrix(np.zeros((num_users, num_items)))
lightfm_feat = LightFM(no_components=20,
loss='warp')
lightfm_feat.fit(interactions=dummy,
user_features=sparse_u_features,
item_features=sparse_i_features,
epochs=40)
Out[40]:
<lightfm.lightfm.LightFM at 0x7c479ae5fd30>
In [41]:
Copied!
def inference(model,
user_id,
top_k=10,
user_features=None,
item_features=None):
# user_ids -> LightFM id
pred = model.predict(user_ids=user_id, # user_id LightFM
item_ids=sorted(item_mapping.values()), # Все item_id LightFM
user_features=sparse_u_features,
item_features=sparse_i_features,num_threads=16)
k = top_k
ids = np.argpartition(pred, - k)[- k:]
rel = pred[ids]
res = pd.DataFrame(zip(ids, rel),
columns=['y_pred', 'relevance'])
# res['item_id'] = res['item_id'].map(inv_item_mapping)
res['user_id'] = [user_id for i in range(0,res.shape[0])]
return res
def inference(model,
user_id,
top_k=10,
user_features=None,
item_features=None):
# user_ids -> LightFM id
pred = model.predict(user_ids=user_id, # user_id LightFM
item_ids=sorted(item_mapping.values()), # Все item_id LightFM
user_features=sparse_u_features,
item_features=sparse_i_features,num_threads=16)
k = top_k
ids = np.argpartition(pred, - k)[- k:]
rel = pred[ids]
res = pd.DataFrame(zip(ids, rel),
columns=['y_pred', 'relevance'])
# res['item_id'] = res['item_id'].map(inv_item_mapping)
res['user_id'] = [user_id for i in range(0,res.shape[0])]
return res
In [42]:
Copied!
lst_inference = []
for user in tqdm(sorted(user_mapping.values())):
lst_inference.append(inference(model=lightfm_feat,
user_id=user,
top_k=20))
df_inference_feat = pd.concat(lst_inference)
lst_inference = []
for user in tqdm(sorted(user_mapping.values())):
lst_inference.append(inference(model=lightfm_feat,
user_id=user,
top_k=20))
df_inference_feat = pd.concat(lst_inference)
100%|██████████| 1565/1565 [00:06<00:00, 253.49it/s]
In [43]:
Copied!
def map_predictions(x):
return inv_item_mapping[x['y_pred']]
def map_users(x):
return inv_user_mapping[x['user_id']]
df_inference_feat['y_pred'] = df_inference_feat.apply(map_predictions,axis=1)
df_inference_feat['user_id'] = df_inference_feat.apply(map_users,axis=1)
# aggregations for [user_id]
agg_inf_feat = df_inference_feat.groupby('user_id')['y_pred'].apply(lambda x: list(x)).to_frame()
agg_train = train.groupby('user_id')['item_id'].apply(lambda x: list(x)).to_frame()
merged_inf_feat = agg_inf_feat.merge(agg_train,left_index=True,right_index=True)
merged_inf_feat.head()
def map_predictions(x):
return inv_item_mapping[x['y_pred']]
def map_users(x):
return inv_user_mapping[x['user_id']]
df_inference_feat['y_pred'] = df_inference_feat.apply(map_predictions,axis=1)
df_inference_feat['user_id'] = df_inference_feat.apply(map_users,axis=1)
# aggregations for [user_id]
agg_inf_feat = df_inference_feat.groupby('user_id')['y_pred'].apply(lambda x: list(x)).to_frame()
agg_train = train.groupby('user_id')['item_id'].apply(lambda x: list(x)).to_frame()
merged_inf_feat = agg_inf_feat.merge(agg_train,left_index=True,right_index=True)
merged_inf_feat.head()
Out[43]:
y_pred | item_id | |
---|---|---|
user_id | ||
user 0 | [item 12733, item 17326, item 16302, item 1238... | [item 8391, item 11677, item 9991, item 13581,... |
user 1 | [item 13967, item 13915, item 8750, item 9356,... | [item 15894, item 10185, item 7475, item 13534... |
user 10 | [item 16041, item 3718, item 4034, item 4323, ... | [item 13448, item 2236, item 5657, item 14069,... |
user 100 | [item 11072, item 13483, item 5651, item 15967... | [item 11373, item 7555, item 8133, item 2662, ... |
user 1000 | [item 12384, item 13038, item 17507, item 3081... | [item 9829, item 5744, item 10201, item 7491, ... |
In [44]:
Copied!
print('precision k@20:',round(precision(merged_inf_feat,
pred_col='y_pred',
true_col='item_id',
k=20),4))
print('precision k@20:',round(precision(merged_inf_feat,
pred_col='y_pred',
true_col='item_id',
k=20),4))
precision k@20: 0.0008