In [1]:

Copied!

!pip install implicit -qqq
!pip install catboost -qqq
!pip install implicit -qqq
!pip install catboost -qqq

In [2]:

Copied!

!ls /kaggle/input/kion-data/
!ls /kaggle/input/kion-data/

interactions_df.csv  items.csv	users.csv

Двухэтапная модель¶

1 | Введение
¶

In [3]:

Copied!





import datetime
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import scipy.sparse as sparse

from catboost import CatBoostClassifier
import implicit

import warnings
warnings.simplefilter('ignore')
import datetime
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import scipy.sparse as sparse

from catboost import CatBoostClassifier
import implicit

import warnings
warnings.simplefilter('ignore')

2 | Датасет КИОН
¶

просмотры фильмов и сериалов в приложении KION
данные о пользователях
данные об айтемах (фильмы, сериалы и т.д.)

In [4]:

Copied!

interactions = pd.read_csv("/kaggle/input/kion-data/interactions_df.csv")
items = pd.read_csv("/kaggle/input/kion-data/items.csv")
users = pd.read_csv("/kaggle/input/kion-data/users.csv")
interactions = pd.read_csv("/kaggle/input/kion-data/interactions_df.csv")
items = pd.read_csv("/kaggle/input/kion-data/items.csv")
users = pd.read_csv("/kaggle/input/kion-data/users.csv")

(A) Данные о взаимодействий пользователях с фильмами¶

Как и ранше у нас стандартые фичи

user_id : пользователь
item_id : фильм

У нас есть фича времени последнего запуска фильма

last_watch_dt дата просмотра

И фичи взаимодействий:

total_dur длительность просмотра
watched_pct доля просмотра в %

In [5]:

Copied!

# деятельность пользователей 
interactions.head()
# деятельность пользователей 
interactions.head()

Out[5]:

	user_id	item_id	last_watch_dt	total_dur	watched_pct
0	176549	9506	2021-05-11	4250	72.0
1	699317	1659	2021-05-29	8317	100.0
2	656683	7107	2021-05-09	10	0.0
3	864613	7638	2021-07-05	14483	100.0
4	964868	9506	2021-04-30	6725	100.0

In [6]:

Copied!

# change the string to date
interactions['last_watch_dt'] = interactions['last_watch_dt'].astype('datetime64[ns]').dt.date
# change the string to date
interactions['last_watch_dt'] = interactions['last_watch_dt'].astype('datetime64[ns]').dt.date

In [7]:

Copied!

# check the unique number of films and users
print(f"Уникальных юзеров в interactions: {interactions['user_id'].nunique()}")
print(f"Уникальных айтемов в interactions: {interactions['item_id'].nunique()}")
# check the unique number of films and users
print(f"Уникальных юзеров в interactions: {interactions['user_id'].nunique()}")
print(f"Уникальных айтемов в interactions: {interactions['item_id'].nunique()}")

Уникальных юзеров в interactions: 962179
Уникальных айтемов в interactions: 15706

In [8]:

Copied!





# check the all user activity limits
max_date = interactions['last_watch_dt'].max()
min_date = interactions['last_watch_dt'].min()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")
# check the all user activity limits
max_date = interactions['last_watch_dt'].max()
min_date = interactions['last_watch_dt'].min()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")

min дата в interactions: 2021-03-13
max дата в interactions: 2021-08-22

(B) Данные о пользователях¶

Все признаки - результат предсказания соцдем моделей

age : Возростная группа
income : Группа дохода пользователя
sex : Пол пользователя
kids_flg : флаг наличия детей

In [9]:

Copied!

users.head()
users.head()

Out[9]:

	user_id	age	income	sex	kids_flg
0	973171	age_25_34	income_60_90	М	1
1	962099	age_18_24	income_20_40	М	0
2	1047345	age_45_54	income_40_60	Ж	0
3	721985	age_45_54	income_20_40	Ж	0
4	704055	age_35_44	income_60_90	Ж	0

In [10]:

Copied!

# number of unique users in database
f"Уникальных юзеров в users: {users.shape[0]}"
# number of unique users in database
f"Уникальных юзеров в users: {users.shape[0]}"

Out[10]:

'Уникальных юзеров в users: 840197'

(C) Данные о фильмах¶

content_type - тип контента
title - название на русском
title_orig - название оригинальное
release_year - год выпуска
countries - страны
for_kids - флаг контент для детей
age_rating- Возрастной рейтинг
studios - студии
directors - режиссеры
actors- актеры
keywords - ключевые слова
description - описание

In [11]:

Copied!

items.head(3)
items.head(3)

Out[11]:

	item_id	content_type	title	title_orig	release_year	genres	countries	for_kids	age_rating	studios	directors	actors	description	keywords
0	10711	film	Поговори с ней	Hable con ella	2002.0	драмы, зарубежные, детективы, мелодрамы	Испания	NaN	16.0	NaN	Педро Альмодовар	Адольфо Фернандес, Ана Фернандес, Дарио Гранди...	Мелодрама легендарного Педро Альмодовара «Пого...	Поговори, ней, 2002, Испания, друзья, любовь, ...
1	2508	film	Голые перцы	Search Party	2014.0	зарубежные, приключения, комедии	США	NaN	16.0	NaN	Скот Армстронг	Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...	Уморительная современная комедия на популярную...	Голые, перцы, 2014, США, друзья, свадьбы, прео...
2	10716	film	Тактическая сила	Tactical Force	2011.0	криминал, зарубежные, триллеры, боевики, комедии	Канада	NaN	16.0	NaN	Адам П. Калтраро	Адриан Холмс, Даррен Шалави, Джерри Вассерман,...	Профессиональный рестлер Стив Остин («Все или ...	Тактическая, сила, 2011, Канада, бандиты, ганг...

In [12]:

Copied!

f"Уникальных айтемов в items {items.shape[0]}"
f"Уникальных айтемов в items {items.shape[0]}"

Out[12]:

'Уникальных айтемов в items 15963'

3 | Разбиение на Подвыборки
¶

Для двухуровних моделях есть нюанс при разбиение на подвыборки

Модель первого уровня нужна для предварительного отбора кандидатов

На test будем проверять результат обоих моделей:

отдельно модели 1го уровня
отдельно двухуровненой модели

На test оставим 7 дней

In [13]:

Copied!

# фильтрация для исключения случайных просмотров
interactions = interactions[interactions['total_dur'] >= 300]
# фильтрация для исключения случайных просмотров
interactions = interactions[interactions['total_dur'] >= 300]

In [14]:

Copied!

user_interactions_count = interactions.groupby('user_id')[['item_id']].count().reset_index()
user_interactions_count = interactions.groupby('user_id')[['item_id']].count().reset_index()

In [15]:

Copied!

user_interactions_count[user_interactions_count['item_id'] < 50]['item_id'].hist(bins=30)
plt.show()
user_interactions_count[user_interactions_count['item_id'] < 50]['item_id'].hist(bins=30)
plt.show()

No description has been provided for this image

In [16]:

Copied!

sum(user_interactions_count['item_id'] < 10) / user_interactions_count.shape[0]
sum(user_interactions_count['item_id'] < 10) / user_interactions_count.shape[0]

Out[16]:

0.8652543241717588

In [17]:

Copied!

filtered_users = user_interactions_count[user_interactions_count['item_id'] >= 10][['user_id']]
interactions = filtered_users.merge(interactions, how='left')
filtered_users = user_interactions_count[user_interactions_count['item_id'] >= 10][['user_id']]
interactions = filtered_users.merge(interactions, how='left')

In [18]:

Copied!

item_interactions_count = interactions.groupby('item_id')[['user_id']].count().reset_index()
item_interactions_count = interactions.groupby('item_id')[['user_id']].count().reset_index()

In [19]:

Copied!

item_interactions_count[item_interactions_count['user_id'] < 100]['user_id'].hist(bins=30)
plt.show()
item_interactions_count[item_interactions_count['user_id'] < 100]['user_id'].hist(bins=30)
plt.show()

In [20]:

Copied!

sum(item_interactions_count['user_id'] < 10) / item_interactions_count.shape[0]
sum(item_interactions_count['user_id'] < 10) / item_interactions_count.shape[0]

Out[20]:

0.5165371809100999

In [21]:

Copied!

filtered_items = item_interactions_count[item_interactions_count['user_id'] >= 10][['item_id']]
interactions = filtered_items.merge(interactions, how='left')
filtered_items = item_interactions_count[item_interactions_count['user_id'] >= 10][['item_id']]
interactions = filtered_items.merge(interactions, how='left')

In [22]:

Copied!

interactions.shape
interactions.shape

Out[22]:

(2300516, 5)

In [23]:

Copied!





test_threshold = max_date - pd.Timedelta(days=7)
val_threshold = test_threshold - pd.Timedelta(days=60) # два месяца для обучения модели второго уровня

test = interactions[(interactions['last_watch_dt'] >= test_threshold)]
train_val = interactions[(interactions['last_watch_dt'] < test_threshold)]

val = train_val[(train_val['last_watch_dt'] >= val_threshold)]
train = train_val[(train_val['last_watch_dt'] < val_threshold)]

print(f"train: {train.shape}")
print(f"val: {val.shape}")
print(f"test: {test.shape}")
test_threshold = max_date - pd.Timedelta(days=7)
val_threshold = test_threshold - pd.Timedelta(days=60) # два месяца для обучения модели второго уровня

test = interactions[(interactions['last_watch_dt'] >= test_threshold)]
train_val = interactions[(interactions['last_watch_dt'] < test_threshold)]

val = train_val[(train_val['last_watch_dt'] >= val_threshold)]
train = train_val[(train_val['last_watch_dt'] < val_threshold)]

print(f"train: {train.shape}")
print(f"val: {val.shape}")
print(f"test: {test.shape}")

train: (881660, 5)
val: (1246263, 5)
test: (172593, 5)

5 | 1 этап: Модель Первого Уровня
¶

обучаем bpr модель из библиотеки implicit на train
делаем предикт c кандидатами на val

Будем предсказывать кандидатов только на теплых пользователях - у которых есть просмотры в обучающей выборке

In [24]:

Copied!

val = val[val['user_id'].isin(train['user_id'].unique())]
val = val[val['user_id'].isin(train['user_id'].unique())]

In [25]:

Copied!

val.head()
val.head()

Out[25]:

	user_id	last_watch_dt	total_dur	watched_pct
1	184560	2021-07-18	13278	50.0
3	379871	2021-07-04	4279	3.0
5	427911	2021-08-11	5363	38.0
7	573405	2021-08-01	10164	9.0
15	968581	2021-08-04	7310	6.0

In [26]:

Copied!





users_id = list(np.sort(train.user_id.unique()))
items_train = list(train.item_id.unique())
ratings_train = list(train.watched_pct)

rows_train = train.user_id.astype('category').cat.codes
cols_train = train.item_id.astype('category').cat.codes

train_sparse = sparse.csr_matrix((ratings_train, (rows_train, cols_train)), shape=(len(users_id), len(items_train)))
users_id = list(np.sort(train.user_id.unique()))
items_train = list(train.item_id.unique())
ratings_train = list(train.watched_pct)

rows_train = train.user_id.astype('category').cat.codes
cols_train = train.item_id.astype('category').cat.codes

train_sparse = sparse.csr_matrix((ratings_train, (rows_train, cols_train)), shape=(len(users_id), len(items_train)))

In [27]:

Copied!





matrix_size = train_sparse.shape[0] * train_sparse.shape[1] # Number of possible interactions in the matrix
num_interactions = len(train_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100 * (1 - (num_interactions / matrix_size))
sparsity
matrix_size = train_sparse.shape[0] * train_sparse.shape[1] # Number of possible interactions in the matrix
num_interactions = len(train_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100 * (1 - (num_interactions / matrix_size))
sparsity

Out[27]:

99.79592704093007

In [28]:

Copied!

train_sparse
train_sparse

Out[28]:

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 881660 stored elements and shape (72354, 5860)>

In [29]:

Copied!

algo = implicit.bpr.BayesianPersonalizedRanking(factors=50, regularization=0.01, iterations=50, use_gpu=False)
algo.fit((train_sparse).astype('double'))
algo = implicit.bpr.BayesianPersonalizedRanking(factors=50, regularization=0.01, iterations=50, use_gpu=False)
algo.fit((train_sparse).astype('double'))

In [30]:

Copied!

user_vecs = algo.user_factors
item_vecs = algo.item_factors
user_vecs = algo.user_factors
item_vecs = algo.item_factors

In [31]:

Copied!

k = 30
k = 30

In [32]:

Copied!





def predict(user_vecs, item_vecs, k=10):
    id2user = dict(zip(rows_train, train.user_id))
    id2item = dict(zip(cols_train, train.item_id))

    scores = user_vecs.dot(item_vecs.T)

    ind_part = np.argpartition(scores, -k + 1)[:, -k:].copy()
    scores_not_sorted = np.take_along_axis(scores, ind_part, axis=1)
    ind_sorted = np.argsort(scores_not_sorted, axis=1)
    indices = np.take_along_axis(ind_part, ind_sorted, axis=1)
    indices = np.flip(indices, 1)
    preds = pd.DataFrame({
        'user_id': range(user_vecs.shape[0]),
        'preds': indices.tolist(),
        })
    preds['user_id'] = preds['user_id'].map(id2user)
    preds['preds'] = preds['preds'].map(lambda inds: [id2item[i] for i in inds])
    return preds
def predict(user_vecs, item_vecs, k=10):
    id2user = dict(zip(rows_train, train.user_id))
    id2item = dict(zip(cols_train, train.item_id))

    scores = user_vecs.dot(item_vecs.T)

    ind_part = np.argpartition(scores, -k + 1)[:, -k:].copy()
    scores_not_sorted = np.take_along_axis(scores, ind_part, axis=1)
    ind_sorted = np.argsort(scores_not_sorted, axis=1)
    indices = np.take_along_axis(ind_part, ind_sorted, axis=1)
    indices = np.flip(indices, 1)
    preds = pd.DataFrame({
        'user_id': range(user_vecs.shape[0]),
        'preds': indices.tolist(),
        })
    preds['user_id'] = preds['user_id'].map(id2user)
    preds['preds'] = preds['preds'].map(lambda inds: [id2item[i] for i in inds])
    return preds

In [33]:

Copied!

val_user_history = val.groupby('user_id')[['item_id']].agg(lambda x: list(x))
pred_bpr = predict(user_vecs, item_vecs, k)
pred_bpr = val_user_history.merge(pred_bpr, how='left', on='user_id')
val_user_history = val.groupby('user_id')[['item_id']].agg(lambda x: list(x))
pred_bpr = predict(user_vecs, item_vecs, k)
pred_bpr = val_user_history.merge(pred_bpr, how='left', on='user_id')

In [34]:

Copied!

pred_bpr.head()
pred_bpr.head()

Out[34]:

	user_id	item_id	preds
0	2	[242, 3628, 5819, 7106, 7921, 8482, 9164, 1077...	[3166, 8482, 12965, 4072, 11749, 1267, 16382, ...
1	21	[308, 3784, 4495, 5077, 6384, 7102, 7571, 8251...	[849, 11237, 24, 1053, 7713, 13936, 7417, 8524...
2	30	[1107, 2346, 2743, 3031, 7250, 9728, 9842, 112...	[13865, 4740, 10464, 142, 16201, 3017, 12396, ...
3	46	[10440]	[4880, 142, 4151, 9996, 8636, 13865, 1465, 474...
4	60	[1179, 1343, 1590, 3550, 6044, 6606, 8612, 972...	[4880, 13865, 4151, 2657, 1083, 1449, 7107, 29...

In [35]:

Copied!





def recall(df: pd.DataFrame, pred_col='preds', true_col='item_id', k=30) -> float:
    recall_values = []
    for _, row in df.iterrows():
      num_relevant = len(set(row[true_col]) & set(row[pred_col][:k]))
      num_true = len(row[true_col])
      recall_values.append(num_relevant / num_true)
    return np.mean(recall_values)
def recall(df: pd.DataFrame, pred_col='preds', true_col='item_id', k=30) -> float:
    recall_values = []
    for _, row in df.iterrows():
      num_relevant = len(set(row[true_col]) & set(row[pred_col][:k]))
      num_true = len(row[true_col])
      recall_values.append(num_relevant / num_true)
    return np.mean(recall_values)

In [36]:

Copied!





def precision(df: pd.DataFrame, pred_col='preds', true_col='item_id', k=30) -> float:
    precision_values = []
    for _, row in df.iterrows():
      num_relevant = len(set(row[true_col]) & set(row[pred_col][:k]))
      num_true = min(k, len(row[true_col]))
      precision_values.append(num_relevant / num_true)
    return np.mean(precision_values)
def precision(df: pd.DataFrame, pred_col='preds', true_col='item_id', k=30) -> float:
    precision_values = []
    for _, row in df.iterrows():
      num_relevant = len(set(row[true_col]) & set(row[pred_col][:k]))
      num_true = min(k, len(row[true_col]))
      precision_values.append(num_relevant / num_true)
    return np.mean(precision_values)

In [37]:

Copied!





def mrr(df: pd.DataFrame, pred_col='preds', true_col='item_id', k=30) -> float:
    mrr_values = []
    for _, row in df.iterrows():
      intersection = set(row[true_col]) & set(row[pred_col][:k])
      user_mrr = 0
      if len(intersection) > 0:
          for item in intersection:
              user_mrr = max(user_mrr, 1 / (row[pred_col].index(item) + 1))
      mrr_values.append(user_mrr)
    return np.mean(mrr_values)
def mrr(df: pd.DataFrame, pred_col='preds', true_col='item_id', k=30) -> float:
    mrr_values = []
    for _, row in df.iterrows():
      intersection = set(row[true_col]) & set(row[pred_col][:k])
      user_mrr = 0
      if len(intersection) > 0:
          for item in intersection:
              user_mrr = max(user_mrr, 1 / (row[pred_col].index(item) + 1))
      mrr_values.append(user_mrr)
    return np.mean(mrr_values)

In [38]:

Copied!

recall(pred_bpr)
recall(pred_bpr)

Out[38]:

0.10971724310666944

In [39]:

Copied!

precision(pred_bpr)
precision(pred_bpr)

Out[39]:

0.1108938431397026

In [40]:

Copied!

mrr(pred_bpr)
mrr(pred_bpr)

Out[40]:

0.12739334967701704

In [41]:

Copied!

# генерируем предказания

candidates = pred_bpr[['user_id', 'preds']]
candidates = candidates.explode('preds').rename(columns={'preds': 'item_id'})
candidates['rank'] = candidates.groupby('user_id').cumcount() + 1

candidates.head()
# генерируем предказания

candidates = pred_bpr[['user_id', 'preds']]
candidates = candidates.explode('preds').rename(columns={'preds': 'item_id'})
candidates['rank'] = candidates.groupby('user_id').cumcount() + 1

candidates.head()

Out[41]:

user_id	item_id	rank
2	3166	1
2	8482	2
2	12965	3
2	4072	4
2	11749	5

6 | 2 этап: Обучаем Бустинг
¶

Ставим задачу как бинарную классификацию
Выбираем CatBoostClassifier, так как много категориальных признаков

Собираем train + target¶

target = 1 Позитивные взаимодействия = все что пользователь посмотрел из кандидатов bpr

In [42]:

Copied!





pos = candidates.merge(val,
                       on=['user_id', 'item_id'],
                       how='inner')

pos['target'] = 1
print(pos.shape)
pos.head()
pos = candidates.merge(val,
                       on=['user_id', 'item_id'],
                       how='inner')

pos['target'] = 1
print(pos.shape)
pos.head()

(60919, 7)

Out[42]:

	user_id	item_id	rank	last_watch_dt	total_dur	watched_pct	target
0	2	8482	2	2021-06-18	5886	100.0	1
1	2	9164	9	2021-06-23	6650	100.0	1
2	46	10440	13	2021-07-05	7449	20.0	1
3	60	15297	25	2021-07-24	14896	88.0	1
4	81	9996	7	2021-06-19	2939	10.0	1

target = 0 Негативные взаимодействия = все что пользователь НЕ посмотрел из кандидатов bpr

добавим сэмплирование, чтобы соблюсти баланс классов

In [43]:

Copied!

neg = candidates.set_index(['user_id', 'item_id'])\
        .join(val.set_index(['user_id', 'item_id']))

neg = neg[neg['watched_pct'].isnull()].reset_index()
neg = candidates.set_index(['user_id', 'item_id'])\
        .join(val.set_index(['user_id', 'item_id']))

neg = neg[neg['watched_pct'].isnull()].reset_index()

Формируем трейн и тест: ctb_train, ctb_test¶

Важно! Делим по пользователям, а не по дате.¶

Мотивация:

Для негативных взаимодействий нет даты
Проверяем устойчивость бустинга относительно новых (холодных) пользователей

In [44]:

Copied!

pos.shape[0] / neg.shape[0]
pos.shape[0] / neg.shape[0]

Out[44]:

0.03351468161905197

In [45]:

Copied!

# пусть негативных семплов будет в два раза больше, чем позитивных
neg = neg.sample(frac=0.07)
neg['target'] = 0

neg.shape
# пусть негативных семплов будет в два раза больше, чем позитивных
neg = neg.sample(frac=0.07)
neg['target'] = 0

neg.shape

Out[45]:

(127238, 7)

In [46]:

Copied!

ctb_train_users, ctb_test_users = train_test_split(val['user_id'].unique(),
                                                  random_state=1,
                                                  test_size=0.2)
ctb_train_users, ctb_test_users = train_test_split(val['user_id'].unique(),
                                                  random_state=1,
                                                  test_size=0.2)

In [47]:

Copied!





# выделяем 10% под механизм early stopping
ctb_train_users, ctb_eval_users = train_test_split(ctb_train_users,
                                                  random_state=1,
                                                  test_size=0.1)
# выделяем 10% под механизм early stopping
ctb_train_users, ctb_eval_users = train_test_split(ctb_train_users,
                                                  random_state=1,
                                                  test_size=0.1)

In [48]:

Copied!





select_col = ['user_id', 'item_id', 'rank', 'target']

# Catboost train
ctb_train = shuffle(
    pd.concat([
        pos[pos['user_id'].isin(ctb_train_users)],
        neg[neg['user_id'].isin(ctb_train_users)]
])[select_col]
)

# Catboost test
ctb_test = shuffle(
    pd.concat([
        pos[pos['user_id'].isin(ctb_test_users)],
        neg[neg['user_id'].isin(ctb_test_users)]
])[select_col]
)

# for early stopping
ctb_eval = shuffle(
    pd.concat([
        pos[pos['user_id'].isin(ctb_eval_users)],
        neg[neg['user_id'].isin(ctb_eval_users)]
])[select_col]
)
select_col = ['user_id', 'item_id', 'rank', 'target']

# Catboost train
ctb_train = shuffle(
    pd.concat([
        pos[pos['user_id'].isin(ctb_train_users)],
        neg[neg['user_id'].isin(ctb_train_users)]
])[select_col]
)

# Catboost test
ctb_test = shuffle(
    pd.concat([
        pos[pos['user_id'].isin(ctb_test_users)],
        neg[neg['user_id'].isin(ctb_test_users)]
])[select_col]
)

# for early stopping
ctb_eval = shuffle(
    pd.concat([
        pos[pos['user_id'].isin(ctb_eval_users)],
        neg[neg['user_id'].isin(ctb_eval_users)]
])[select_col]
)

Проверяем баланс классов¶

In [49]:

Copied!

ctb_train['target'].value_counts(normalize=True)
ctb_train['target'].value_counts(normalize=True)

Out[49]:

target
0    0.676539
1    0.323461
Name: proportion, dtype: float64

In [50]:

Copied!

ctb_test['target'].value_counts(normalize=True)
ctb_test['target'].value_counts(normalize=True)

Out[50]:

target
0    0.675174
1    0.324826
Name: proportion, dtype: float64

Формируем фичи¶

в этом ноутбуке рассмотрен бейзлайн с минимальной обработкой признаков - оставляем ее для Catboost
выбираем простые категориальные признаки, которые не нужно предобрабатывать
как признак также используем rank от модели 1 этапа
заполняем пропуски в данных

In [51]:

Copied!

user_col = ['user_id', 'age', 'income', 'sex', 'kids_flg']
item_col = ['item_id', 'content_type', 'countries', 'for_kids', 'age_rating', 'studios']
user_col = ['user_id', 'age', 'income', 'sex', 'kids_flg']
item_col = ['item_id', 'content_type', 'countries', 'for_kids', 'age_rating', 'studios']

In [52]:

Copied!





train_feat = (ctb_train
              .merge(users[user_col], on=['user_id'], how='left')
              .merge(items[item_col], on=['item_id'], how='left'))

eval_feat = (ctb_eval
             .merge(users[user_col], on=['user_id'], how='left')
             .merge(items[item_col], on=['item_id'], how='left'))
train_feat = (ctb_train
              .merge(users[user_col], on=['user_id'], how='left')
              .merge(items[item_col], on=['item_id'], how='left'))

eval_feat = (ctb_eval
             .merge(users[user_col], on=['user_id'], how='left')
             .merge(items[item_col], on=['item_id'], how='left'))

In [53]:

Copied!

train_feat.head()
train_feat.head()

Out[53]:

	user_id	item_id	rank	age	income	sex	kids_flg	content_type	countries	for_kids	age_rating	studios
0	458479	12485	23	age_55_64	income_20_40	М	0.0	film	США	NaN	18.0	NaN
1	4176	4880	1	age_35_44	income_0_20	Ж	0.0	series	Россия	NaN	18.0	NaN
2	1055166	5087	22	age_25_34	income_40_60	М	1.0	film	США	NaN	18.0	NaN
3	800395	2860	18	NaN	NaN	NaN	NaN	film	Канада	NaN	16.0	NaN
4	637341	509	30	age_25_34	income_0_20	Ж	1.0	film	Россия	0.0	18.0	NaN

X,y¶

In [54]:

Copied!

drop_col = ['user_id', 'item_id']
target_col = ['target']
cat_col = ['age', 'income', 'sex', 'content_type', 'countries', 'studios']
drop_col = ['user_id', 'item_id']
target_col = ['target']
cat_col = ['age', 'income', 'sex', 'content_type', 'countries', 'studios']

In [55]:

Copied!

X_train, y_train = train_feat.drop(drop_col + target_col, axis=1), train_feat[target_col]
X_val, y_val = eval_feat.drop(drop_col + target_col, axis=1), eval_feat[target_col]
X_train, y_train = train_feat.drop(drop_col + target_col, axis=1), train_feat[target_col]
X_val, y_val = eval_feat.drop(drop_col + target_col, axis=1), eval_feat[target_col]

In [56]:

Copied!

X_train.shape, y_train.shape, X_val.shape, y_val.shape
X_train.shape, y_train.shape, X_val.shape, y_val.shape

Out[56]:

((135296, 10), (135296, 1), (15090, 10), (15090, 1))

In [57]:

Copied!

# fillna for catboost with the most frequent value
X_train = X_train.fillna(X_train.mode().iloc[0])
# fillna for catboost with the most frequent value
X_train = X_train.fillna(X_train.mode().iloc[0])

In [58]:

Copied!

# fillna for catboost with the most frequent value
X_val = X_val.fillna(X_train.mode().iloc[0])
# fillna for catboost with the most frequent value
X_val = X_val.fillna(X_train.mode().iloc[0])

In [59]:

Copied!

X_train.head()
X_train.head()

Out[59]:

	rank	age	income	sex	kids_flg	content_type	countries	age_rating	studios
0	23	age_55_64	income_20_40	М	0.0	film	США	18.0	CBS All Access
1	1	age_35_44	income_0_20	Ж	0.0	series	Россия	18.0	CBS All Access
2	22	age_25_34	income_40_60	М	1.0	film	США	18.0	CBS All Access
3	18	age_35_44	income_20_40	М	0.0	film	Канада	16.0	CBS All Access
4	30	age_25_34	income_0_20	Ж	1.0	film	Россия	18.0	CBS All Access

Обучаем `CatBoostClassifier`¶

аналогично 1 этапу, предполагаем что оптимальные гиперпараметры уже выбраны
используем только механизм early_stopping для выбора оптимального количества деревьев на небольшой валидационной выборке

In [60]:

Copied!





# параметры для обучения
est_params = {
  'subsample': 0.9,
  'max_depth': 5,
  'n_estimators': 2000,
  'learning_rate': 0.01,
  'thread_count': 20,
  'random_state': 42,
  'verbose': 200,
}

ctb_model = CatBoostClassifier(**est_params)
# параметры для обучения
est_params = {
  'subsample': 0.9,
  'max_depth': 5,
  'n_estimators': 2000,
  'learning_rate': 0.01,
  'thread_count': 20,
  'random_state': 42,
  'verbose': 200,
}

ctb_model = CatBoostClassifier(**est_params)

In [61]:

Copied!





ctb_model.fit(X_train,
              y_train,
              eval_set=(X_val, y_val),
              early_stopping_rounds=100,
              cat_features=cat_col,
              plot=True)
ctb_model.fit(X_train,
              y_train,
              eval_set=(X_val, y_val),
              early_stopping_rounds=100,
              cat_features=cat_col,
              plot=True)

TBB Warning: The number of workers is currently limited to 3. The request for 19 workers is ignored. Further requests for more workers will be silently ignored until the limit changes.

0:	learn: 0.6900380	test: 0.6900920	best: 0.6900920 (0)	total: 175ms	remaining: 5m 50s
200:	learn: 0.5295179	test: 0.5325592	best: 0.5325592 (200)	total: 17.5s	remaining: 2m 36s
400:	learn: 0.5210290	test: 0.5236199	best: 0.5236199 (400)	total: 34.4s	remaining: 2m 17s
600:	learn: 0.5187233	test: 0.5211758	best: 0.5211758 (600)	total: 51.9s	remaining: 2m
800:	learn: 0.5173139	test: 0.5197243	best: 0.5197243 (800)	total: 1m 8s	remaining: 1m 43s
1000:	learn: 0.5160030	test: 0.5183970	best: 0.5183970 (1000)	total: 1m 26s	remaining: 1m 26s
1200:	learn: 0.5148874	test: 0.5173102	best: 0.5173096 (1198)	total: 1m 43s	remaining: 1m 9s
1400:	learn: 0.5139265	test: 0.5164289	best: 0.5164289 (1400)	total: 2m 1s	remaining: 52s
1600:	learn: 0.5133713	test: 0.5159957	best: 0.5159953 (1598)	total: 2m 18s	remaining: 34.6s
1800:	learn: 0.5127885	test: 0.5155605	best: 0.5155605 (1800)	total: 2m 36s	remaining: 17.3s
1999:	learn: 0.5123560	test: 0.5152680	best: 0.5152680 (1999)	total: 2m 54s	remaining: 0us

bestTest = 0.515267964
bestIteration = 1999

Out[61]:

<catboost.core.CatBoostClassifier at 0x7ca5a4b9e050>

Посмотрим на feature importance¶

In [62]:

Copied!

imp_catboost = pd.DataFrame(X_train.columns, columns = ['feature'])
imp_catboost['importance'] = ctb_model.feature_importances_
imp_catboost = pd.DataFrame(X_train.columns, columns = ['feature'])
imp_catboost['importance'] = ctb_model.feature_importances_

In [63]:

Copied!

sns.barplot(data=imp_catboost.sort_values(by='importance', ascending=False), x='importance', y='feature', palette="BuGn_r")
plt.title('Top feature importances');
sns.barplot(data=imp_catboost.sort_values(by='importance', ascending=False), x='importance', y='feature', palette="BuGn_r")
plt.title('Top feature importances');

Оценим метрики catboost модели¶

In [64]:

Copied!

test_feat = (ctb_test
             .merge(users[user_col], on=['user_id'], how='left')
             .merge(items[item_col], on=['item_id'], how='left'))
test_feat = (ctb_test
             .merge(users[user_col], on=['user_id'], how='left')
             .merge(items[item_col], on=['item_id'], how='left'))

In [65]:

Copied!

# fillna for catboost with the most frequent value
test_feat = test_feat.fillna(X_train.mode().iloc[0])
# fillna for catboost with the most frequent value
test_feat = test_feat.fillna(X_train.mode().iloc[0])

In [66]:

Copied!

X_test, y_test = test_feat.drop(drop_col + target_col, axis=1), test_feat['target']
X_test, y_test = test_feat.drop(drop_col + target_col, axis=1), test_feat['target']

In [67]:

Copied!

X_test.head()
X_test.head()

Out[67]:

	rank	age	income	sex	kids_flg	content_type	countries	age_rating	studios
0	2	age_35_44	income_20_40	М	1.0	film	Канада	18.0	CBS All Access
1	7	age_35_44	income_20_40	М	0.0	film	США, Великобритания, Франция	16.0	CBS All Access
2	29	age_45_54	income_40_60	М	1.0	film	Россия	18.0	CBS All Access
3	6	age_18_24	income_40_60	М	0.0	film	Россия	16.0	CBS All Access
4	7	age_18_24	income_20_40	Ж	0.0	film	США	12.0	CBS All Access

In [68]:

Copied!

y_pred = ctb_model.predict_proba(X_test)
y_pred = ctb_model.predict_proba(X_test)

In [69]:

Copied!

f"ROC AUC score = {roc_auc_score(y_test, y_pred[:, 1]):.2f}"
f"ROC AUC score = {roc_auc_score(y_test, y_pred[:, 1]):.2f}"

Out[69]:

'ROC AUC score = 0.77'

Метрики качества на глобальном test¶

Формируем рекомендации отдельно моделью 1 этапа - bpr
Отдельно двухэтапной моделью bpr + Catboost
Сравниваем

1) Метрики качества bpr¶

In [70]:

Copied!

# оставляем только теплых пользователей
test = test[test['user_id'].isin(val['user_id'].unique())]
# оставляем только теплых пользователей
test = test[test['user_id'].isin(val['user_id'].unique())]

In [71]:

Copied!

test_user_history = test.groupby('user_id')[['item_id']].agg(lambda x: list(x))
pred_bpr = predict(user_vecs, item_vecs, k=100)
pred_bpr = test_user_history.merge(pred_bpr, how='left', on='user_id')
test_user_history = test.groupby('user_id')[['item_id']].agg(lambda x: list(x))
pred_bpr = predict(user_vecs, item_vecs, k=100)
pred_bpr = test_user_history.merge(pred_bpr, how='left', on='user_id')

In [72]:

Copied!

recall(pred_bpr, k=20)
recall(pred_bpr, k=20)

Out[72]:

0.042217550721510705

In [73]:

Copied!

precision(pred_bpr, k=20)
precision(pred_bpr, k=20)

Out[73]:

0.04225740023000195

In [74]:

Copied!

mrr(pred_bpr, k=20)
mrr(pred_bpr, k=20)

Out[74]:

0.021039760800473414

2) Метрики качества двухэтапной модели: `bpr` + `Catboost`¶

In [75]:

Copied!

# генерируем предказания

pred_bpr = pred_bpr[['user_id', 'preds']]
pred_bpr = pred_bpr.explode('preds').rename(columns={'preds': 'item_id'})
pred_bpr['rank'] = pred_bpr.groupby('user_id').cumcount() + 1

pred_bpr.head()
# генерируем предказания

pred_bpr = pred_bpr[['user_id', 'preds']]
pred_bpr = pred_bpr.explode('preds').rename(columns={'preds': 'item_id'})
pred_bpr['rank'] = pred_bpr.groupby('user_id').cumcount() + 1

pred_bpr.head()

Out[75]:

user_id	item_id	rank
21	849	1
21	11237	2
21	24	3
21	1053	4
21	7713	5

In [76]:

Copied!





pred_bpr_ctb = pred_bpr.copy()

# фичи для теста
score_feat = (pred_bpr_ctb
              .merge(users[user_col], on=['user_id'], how='left')
              .merge(items[item_col], on=['item_id'], how='left'))

# fillna for catboost with the most frequent value
score_feat = score_feat.fillna(X_train.mode().iloc[0])
pred_bpr_ctb = pred_bpr.copy()

# фичи для теста
score_feat = (pred_bpr_ctb
              .merge(users[user_col], on=['user_id'], how='left')
              .merge(items[item_col], on=['item_id'], how='left'))

# fillna for catboost with the most frequent value
score_feat = score_feat.fillna(X_train.mode().iloc[0])

In [77]:

Copied!

score_feat.head()
score_feat.head()

Out[77]:

	user_id	item_id	rank	age	income	sex	content_type	countries	age_rating	studios
0	21	849	1	age_45_54	income_20_40	Ж	film	США	18.0	CBS All Access
1	21	11237	2	age_45_54	income_20_40	Ж	film	Россия	16.0	CBS All Access
2	21	24	3	age_45_54	income_20_40	Ж	series	Германия	16.0	CBS All Access
3	21	1053	4	age_45_54	income_20_40	Ж	film	США	18.0	CBS All Access
4	21	7713	5	age_45_54	income_20_40	Ж	film	Россия	16.0	CBS All Access

In [78]:

Copied!

# catboost predict_proba
ctb_prediction = ctb_model.predict_proba(score_feat.drop(drop_col, axis=1, errors='ignore'))

pred_bpr_ctb['ctb_pred'] = ctb_prediction[:, 1]
pred_bpr_ctb.head(3)
# catboost predict_proba
ctb_prediction = ctb_model.predict_proba(score_feat.drop(drop_col, axis=1, errors='ignore'))

pred_bpr_ctb['ctb_pred'] = ctb_prediction[:, 1]
pred_bpr_ctb.head(3)

Out[78]:

user_id	item_id	rank	ctb_pred
21	849	1	0.238199
21	11237	2	0.423691
21	24	3	0.182443

Проставляем итоговый ранк двухэтапной модели¶

In [79]:

Copied!





# сортируем по скору внутри одного пользователя и проставляем новый ранг
pred_bpr_ctb = pred_bpr_ctb.sort_values(
    by=['user_id', 'ctb_pred'], ascending=[True, False])
pred_bpr_ctb['rank_ctb'] = pred_bpr_ctb.groupby('user_id').cumcount() + 1
# сортируем по скору внутри одного пользователя и проставляем новый ранг
pred_bpr_ctb = pred_bpr_ctb.sort_values(
    by=['user_id', 'ctb_pred'], ascending=[True, False])
pred_bpr_ctb['rank_ctb'] = pred_bpr_ctb.groupby('user_id').cumcount() + 1

In [80]:

Copied!

pred_bpr_ctb.head(10)
pred_bpr_ctb.head(10)

Out[80]:

user_id	item_id	rank	ctb_pred	rank_ctb
21	4880	38	0.739261	1
21	13936	6	0.515807	2
21	7713	5	0.442528	3
21	11237	2	0.423691	4
21	7417	7	0.380741	5
21	12058	9	0.307638	6
21	496	10	0.290765	7
21	12701	45	0.269757	8
21	8252	67	0.269757	9
21	8636	11	0.258652	10

`1+2 этапы bpr + Сatboost`: Метрики¶

In [81]:

Copied!

true_items = test.groupby('user_id').agg(lambda x: list(x))[['item_id']].reset_index()
pred_items = pred_bpr_ctb.groupby('user_id').agg(lambda x: list(x))[['item_id']].reset_index().rename(columns={'item_id': 'preds'})
true_pred_items = true_items.merge(pred_items, how='left')
true_items = test.groupby('user_id').agg(lambda x: list(x))[['item_id']].reset_index()
pred_items = pred_bpr_ctb.groupby('user_id').agg(lambda x: list(x))[['item_id']].reset_index().rename(columns={'item_id': 'preds'})
true_pred_items = true_items.merge(pred_items, how='left')

In [82]:

Copied!

recall(true_pred_items, k=20)
recall(true_pred_items, k=20)

Out[82]:

0.05371511488902361

In [83]:

Copied!

precision(true_pred_items, k=20)
precision(true_pred_items, k=20)

Out[83]:

0.0537566192633558

In [84]:

Copied!

mrr(true_pred_items, k=20)
mrr(true_pred_items, k=20)

Out[84]:

0.03348700850344106