Simple DSSM
!pip install -q replay-rec rs_datasets --quiet
from datetime import datetime as dt
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from rs_datasets import MovieLens
from replay.splitters.time_splitter import TimeSplitter
from replay.preprocessing.filters import MinCountFilter, LowRatingFilter
from replay.metrics import NDCG, HitRate, Coverage, Experiment
from dataclasses import dataclass
@dataclass
class config:
    USER_COL: str = 'user_id'
    ITEM_COL: str = 'item_id'
    RATING_COL: str = 'rating'
    TIMESTAMP: str = 'timestamp'
    NUM_EPOCHS: int = 30
    K: int = 10
    SEED: int = 123

config = config()

random.seed(config.SEED)
torch.manual_seed(config.SEED)
np.random.seed(config.SEED)
Background¶
In this notebook we will look at how to use a neural network approach to recommendations

- Implicit feedback will be used
- The scalar product of the user_id and item_id embeddings will be our relevance scores
- User-film interactions will be our positive feedback; negative samples, created by random sampling, will be our negative feedback
- The dataset is split into two: train will be used to fit the model on historical user data, test will be used to evaluate the recommendations made for users
- The model will be taught to differentiate between items a user has interacted with (positive) and items they have not (negative); a minimal sketch of the scoring idea follows below
1 | Load Dataset¶
We will be using the MovieLens 1M benchmark dataset, which contains roughly one million interactions
class MovieLensPrepare:

    def __init__(self):
        rs = MovieLens('1m')
        self.data = rs.ratings
        self.u_features = rs.users
        self.i_features = rs.items

    def preprocess(self):
        data = self.data
        u_features = self.u_features
        i_features = self.i_features
        # keep only users with at least 20 interactions
        data = MinCountFilter(num_entries=20).transform(data)
        # interactions and user & item features must be synchronised
        data = data[data[config.USER_COL].isin(u_features[config.USER_COL].unique())]
        data = data[data[config.ITEM_COL].isin(i_features[config.ITEM_COL].unique())].copy()
        print(f"Number of unique users {data[config.USER_COL].nunique()}")
        print(f"Number of unique items {data[config.ITEM_COL].nunique()}")
        # the timestamp column is in unix seconds; convert it to datetime
        data[config.TIMESTAMP] = pd.to_datetime(data[config.TIMESTAMP], unit='s')
        self.data = data

    def split_data(self):
        data = self.data
        u_features = self.u_features
        i_features = self.i_features
        splitter = TimeSplitter(time_threshold=0.2,  # last 20% (in time) into the test subset
                                drop_cold_users=True,
                                drop_cold_items=True,
                                query_column=config.USER_COL)
        train, test = splitter.split(data)
        print('train size', train.shape[0])
        print('test size', test.shape[0])
        # keep only the user & item features that appear in the train interactions
        u_features = u_features[u_features[config.USER_COL].isin(train[config.USER_COL].unique())].copy()
        i_features = i_features[i_features[config.ITEM_COL].isin(train[config.ITEM_COL].unique())].copy()
        # encoder for users
        encoder_user = LabelEncoder()
        encoder_user.fit(train[config.USER_COL])
        # encoder for items
        encoder_item = LabelEncoder()
        encoder_item.fit(train[config.ITEM_COL])
        train[config.USER_COL] = encoder_user.transform(train[config.USER_COL])
        train[config.ITEM_COL] = encoder_item.transform(train[config.ITEM_COL])
        test[config.USER_COL] = encoder_user.transform(test[config.USER_COL])
        test[config.ITEM_COL] = encoder_item.transform(test[config.ITEM_COL])
        u_features[config.USER_COL] = encoder_user.transform(u_features[config.USER_COL])
        i_features[config.ITEM_COL] = encoder_item.transform(i_features[config.ITEM_COL])
        self.train = train
        self.test = test
        self.u_features = u_features
        self.i_features = i_features
        self.encoder_user = encoder_user
        self.encoder_item = encoder_item

    def filter_test(self):
        # keep only highly rated test interactions
        filter_rating = LowRatingFilter(value=4)
        self.test = filter_rating.transform(self.test)

study = MovieLensPrepare()
Extract the relevant subsets of the data: interactions, user features & item features. We should also note that the timestamp feature is in unix seconds, which we'll need to convert to datetime later
2 | Preprocessing¶
- Replay contains a handy & quick way of preprocessing interactions
- MinCountFilter can be used to filter out interactions from groups with fewer than num_entries entries
- Let's use this method to remove users with fewer than 20 interactions; a rough pandas equivalent is sketched below
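For intuition, here is a rough pandas equivalent of this step (an assumption about MinCountFilter's behaviour; the library call below is what we actually use).

# rough pandas equivalent of MinCountFilter(num_entries=20) on the user column
# (assumed behaviour: keep only users with at least 20 interactions)
ratings = study.data  # raw interactions loaded above
user_counts = ratings.groupby(config.USER_COL)[config.ITEM_COL].transform('count')
manual_filtered = ratings[user_counts >= 20]
print(len(ratings), len(manual_filtered))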
study.preprocess()
Number of unique users 6040 Number of unique items 3706
Let's check the user_id and item_id statistics after filtering and make sure our interactions are synchronised with both the user and item feature subsets.
Now let's convert the time feature to datetime, so we can more easily interpret how to split the dataset in time. In our problem, we assume that the test interactions have not happened yet
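The conversion itself is a single pandas call; a quick standalone check of what unit='s' does:

# unix seconds -> pandas datetime, the same conversion used inside preprocess()
ts = pd.Series([978307200, 978310800])
print(pd.to_datetime(ts, unit='s'))
# 0   2001-01-01 00:00:00
# 1   2001-01-01 01:00:00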
3 | Splitting Dataset in time¶
- The next step after preprocessing the dataset to our liking is to split it into subsets, so we can train the model on one subset and use another (20%) for model validation
- replay has a class named TimeSplitter, which we will use to create our subsets; its signature is shown below, followed by a rough manual equivalent
class TimeSplitter(replay.splitters.base_splitter.Splitter) | TimeSplitter(time_threshold: Union[datetime.datetime, str, float], query_column: str = 'query_id', drop_cold_users: bool = False, drop_cold_items: bool = False, item_column: str = 'item_id', timestamp_column: str = 'timestamp', session_id_column: Optional[str] = None, session_id_processing_strategy: str = 'test', time_column_format: str = '%Y-%m-%d %H:%M:%S')
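For intuition, a rough manual equivalent of a 20% time-based holdout (this assumes time_threshold=0.2 means "last 20% of interactions by time"; the cold-user/item dropping that TimeSplitter performs is skipped here).

# manual sketch of a time-based split: last 20% of interactions (by timestamp) go to test
data = study.data
threshold = data[config.TIMESTAMP].quantile(0.8)
manual_train = data[data[config.TIMESTAMP] <= threshold]
manual_test = data[data[config.TIMESTAMP] > threshold]
print(len(manual_train), len(manual_test))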
study.split_data()
train size 800164 test size 104452
4 | Rating Filter¶
We want to recommend only items that have been rated highly, so for the test subset we will use LowRatingFilter to remove interactions with low ratings
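The pandas equivalent is a one-liner, assuming LowRatingFilter(value=4) keeps interactions rated 4 and above:

# rough pandas equivalent of LowRatingFilter(value=4) on the test subset
# (assumed threshold semantics: keep rating >= 4)
manual_test = study.test[study.test[config.RATING_COL] >= 4]
print(len(study.test), len(manual_test))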
study.filter_test()
5 | Create Torch Dataset¶
We need to create a torch dataset from our matrix of interaction data

- For each index, the dataset TowerTrain's __getitem__ returns the user_id and a positive item_id (positive_item_id) taken from the interaction dataset; this is our positive feedback
- Additionally, for this user_id we generate a number of random item_id values, which serve as negative samples the user hasn't watched
- Both the positive and negative item ids are concatenated into a single array (items)
- Lastly we also return the labels, 1 for the positive sample and 0 for each negative sample
from torch.utils.data import Dataset, DataLoader

class TowerTrain(Dataset):

    def __init__(self,
                 data,
                 num_negatives=10,
                 i_features=None,
                 u_features=None):
        # keep only the (user, item) columns as a numpy array
        self.data = data[[config.USER_COL, config.ITEM_COL]].to_numpy()
        self.num_negatives = num_negatives
        self.num_items = len(np.unique(self.data[:, 1]))
        self.i_features = i_features
        self.u_features = u_features

    def __len__(self):
        return len(self.data)

    # get one row of interaction data
    def __getitem__(self, idx):
        # index -> user_id, positive item_id
        user_id, positive_item_id = self.data[idx, 0], self.data[idx, 1]
        # for each positive item_id create num_negatives random negative samples
        items = torch.tensor(np.hstack([positive_item_id,
                                        np.random.randint(
                                            low=0,
                                            high=self.num_items,
                                            size=self.num_negatives)]),
                             dtype=torch.int32)
        # first label marks the positive sample, the rest are negatives
        labels = torch.zeros(self.num_negatives + 1, dtype=torch.float32)
        labels[0] = 1.  # positive label
        return {'user_ids': torch.tensor([user_id], dtype=torch.int32),
                'item_ids': items,
                'labels': labels}
We create the dataset and a dataloader and inspect a sample batch. Here we use a batch size of 2 for illustration (training later uses 1024); each batch is built by collating the __getitem__ outputs for the sampled rows
# create dataset
ds_train = TowerTrain(study.train)
# create data loader
dl_train = DataLoader(ds_train,
batch_size=2,
shuffle=True,
num_workers=0)
batch = next(iter(dl_train))
batch
{'user_ids': tensor([[ 163], [3579]], dtype=torch.int32), 'item_ids': tensor([[1036, 3582, 3437, 3454, 1346, 1122, 1766, 3089, 2154, 1147, 1593], [1806, 3286, 1761, 96, 2161, 2686, 47, 73, 1568, 942, 2272]], dtype=torch.int32), 'labels': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])}
6 | Model Definition¶
We will create a subclass SimpleTower, which only contains an embedding layer for either user_id or item_id

- The forward method simply returns the rows of the embedding matrix for the given ids
- BaseTwoHead combines a user tower and an item tower and calculates the dot product between the user_id and item_id embeddings, which gives us the scores
# each tower contains only an embedding layer, but we can
# expand on this by adding user / item features
class SimpleTower(nn.Module):

    def __init__(self, num_embeddings, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, emb_dim)

    def forward(self, ids, features=None):
        return self.emb(ids)

class BaseTwoHead(nn.Module):

    def __init__(self,
                 emb_dim,
                 user_config=None,
                 item_config=None):
        super().__init__()
        self.emb_dim = emb_dim
        self.user_tower = SimpleTower(emb_dim=emb_dim, **user_config)  # (n_users, emb_dim)
        self.item_tower = SimpleTower(emb_dim=emb_dim, **item_config)  # (n_items, emb_dim)

    # the forward method runs the two 'towers'
    # and takes the scalar product of their outputs,
    # which gives us the scores
    def forward(self, batch):
        item_emb = self.item_tower(batch["item_ids"])  # (batch, 11, 16)
        user_emb = self.user_tower(batch["user_ids"])  # (batch, 1, 16)
        dot_product = (user_emb * item_emb).sum(dim=-1)  # (batch, 11)
        return dot_product

    # methods for extracting embeddings
    def infer_users(self, batch):
        return self.user_tower(batch["user_ids"])

    def infer_items(self, batch):
        return self.item_tower(batch["item_ids"])
Let's check the contents of the model, using an embedding size of 16
# model parameters
embed_config = {'emb_dim' : 16} # embedding dimension
user_config = {'num_embeddings' : study.train[config.USER_COL].max() + 1,} # number of users
item_config = {'num_embeddings' : study.train[config.ITEM_COL].max() + 1,} # number of items
# import the embedding dimension
model = BaseTwoHead(**embed_config,
user_config=user_config,
item_config=item_config)
model
BaseTwoHead( (user_tower): SimpleTower( (emb): Embedding(5400, 16) ) (item_tower): SimpleTower( (emb): Embedding(3662, 16) ) )
Model forward pass

- The output of the model gives us the logits for each of the 11 items (1 positive + 10 negatives) in every user row
# output for a single batch
output = model(batch)
output
tensor([[ 1.6632, 5.8888, 0.0997, 7.6885, 8.2156, 4.0495, 3.0272, 1.9775, -1.8750, 4.3952, 0.2714], [ 5.3873, -10.4797, -4.2230, -0.4488, 0.9215, -5.0823, -0.5018, 4.9579, 0.8251, -6.3608, -4.5723]], grad_fn=<SumBackward1>)
Output size
output.size()
torch.Size([2, 11])
7 | Extracting Embeddings¶
We can extract the embeddings for both users and items using the methods below and use them for inference. For such a simple model this is straightforward; below is an example for a batch of size two
# extract embeddings from model (here items)
i_embeddings = model.infer_items(batch)
i_embeddings.size()
torch.Size([2, 11, 16])
u_embeddings = model.infer_users(batch)
u_embeddings.size()
torch.Size([2, 1, 16])
8 | Training the network¶
We need to define an optimiser, a loss function and the datasets in the form of data loaders. As we are setting the problem up as binary classification, we'll use BCEWithLogitsLoss; a quick check of its behaviour is sketched below
optimizer = torch.optim.Adam(model.parameters(),
lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()
# create train dataset
ds_train = TowerTrain(study.train)
# create train data loader
dl_train = DataLoader(ds_train,
batch_size=1024,
shuffle=True,
num_workers=0)
# create test dataset
ds_test = TowerTrain(study.test)
# create test data loader
dl_test = DataLoader(ds_test,
batch_size=1024,
shuffle=True,
num_workers=0)
train_loss_per_epoch = []
test_loss_per_epoch = []

# loop over all epochs
for epoch in tqdm(range(config.NUM_EPOCHS)):

    # training loop over all batches
    model.train()
    train_loss = 0.0
    for iteration, batch in enumerate(dl_train):
        optimizer.zero_grad()
        preds = model(batch)
        loss = loss_fn(preds, batch['labels'])
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(dl_train)

    # evaluation loop over all batches (no gradients needed)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for iteration, batch in enumerate(dl_test):
            preds = model(batch)
            loss = loss_fn(preds, batch['labels'])
            test_loss += loss.item()
    # average test loss per batch
    test_loss /= len(dl_test)

    test_loss_per_epoch.append(test_loss)
    train_loss_per_epoch.append(train_loss)
Save our trained model for later use!
# save our model state
torch.save(model.state_dict(), f"/content/model_{config.NUM_EPOCHS}")
9 | Generating user recommendations¶
The time has come to use our trained model!

- We will make recommendations for the test users using the model that was fitted on the train subset
- To make predictions, we extract the embedding matrix weights for users and items, calculate the scores, and take the top K items for each user by score
9.1. Load Weights¶
First things first, we need to load the model weights
import warnings; warnings.filterwarnings('ignore')
model = BaseTwoHead(**embed_config,
user_config=user_config,
item_config=item_config)
model.load_state_dict(torch.load(f"/content/model_{config.NUM_EPOCHS}"))
model.eval()
BaseTwoHead( (user_tower): SimpleTower( (emb): Embedding(5400, 16) ) (item_tower): SimpleTower( (emb): Embedding(3662, 16) ) )
9.2. Get test users¶
Get the user identifiers that are in the test set, which was saved in study.test
test_users = study.test[[config.USER_COL]].drop_duplicates().reset_index(drop=True)
test_users.head()
|   | user_id |
|---|---------|
| 0 | 1238 |
| 1 | 0 |
| 2 | 2146 |
| 3 | 1380 |
| 4 | 3180 |
9.3. Extract Weights¶
Extract the embedding weights for all users and items, which are stored in the model
# extract the user / item embedding weights
user_embed = model.user_tower.emb.weight.detach().cpu().numpy()
item_embed = model.item_tower.emb.weight.detach().cpu().numpy()
user_embed.shape, item_embed.shape
((5400, 16), (3662, 16))
9.4. Scalar product¶
Calculate the score for each user & item combination by taking the scalar product of their embeddings
# calculate the scores, shape (num_test_users, num_items)
scores = user_embed[test_users[config.USER_COL].values] @ item_embed.T
print(scores)
[[-8.98283534e-03 -2.23088098e+00 -1.21248198e+00 ... -5.93518734e+00 -6.74669886e+00 -3.57267737e+00] [ 1.21680886e-01 -1.92409587e+00 -2.67904949e+00 ... -5.99316978e+00 -3.68449354e+00 -1.20745528e+00] [-3.68786573e-01 -3.50460958e+00 -3.39616084e+00 ... -4.76325846e+00 -2.48652792e+00 -9.43809271e-01] ... [ 5.35706937e-01 -3.02921438e+00 -2.88318610e+00 ... -3.14549780e+00 -3.66571522e+00 -1.52589762e+00] [-6.87072992e-01 -1.82784998e+00 -2.26169515e+00 ... -6.18041801e+00 -5.24198675e+00 -3.18532085e+00] [-7.03839183e-01 -1.79352736e+00 -2.59766245e+00 ... -9.07467270e+00 -7.98052073e+00 -4.78053665e+00]]
9.5. Get highest scores¶
Get the indices (ids) of the top K scores for each user & their corresponding values (scores). The indices are the encoded item ids produced by encoder_item, which we stored in the class instance study
# get top 10 idx by value & get its value
ids = np.argpartition(scores, -config.K)[:, -config.K:]
scores = np.take_along_axis(scores, ids, axis=1)
scores[:5]
array([[ 0.37186688, 0.3992171 , 0.4394618 , 0.4884671 , 0.5174091 , 0.87457657, 0.6083931 , 0.58975196, 0.8642359 , 0.78722566], [ 0.4257152 , 0.4830652 , 0.50204533, 0.50319844, 0.6602011 , 0.8257402 , 0.8278695 , 0.69933504, 0.88355327, 0.8429366 ], [ 0.9651828 , 0.9905893 , 1.058987 , 1.0769 , 1.2267288 , 1.4366896 , 1.284672 , 1.255043 , 1.2931157 , 1.2413594 ], [-0.11466098, -0.10546018, -0.1026658 , -0.09919392, -0.0560105 , -0.03253222, 0.02805819, -0.07913139, 0.00199754, -0.0307106 ], [ 0.5338577 , 0.558586 , 0.6305356 , 0.6141302 , 0.632113 , 0.7719943 , 0.6591141 , 0.8380035 , 0.65727115, 0.7047775 ]], dtype=float32)
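Note that np.argpartition only guarantees the top-K values end up in the last K positions; it does not sort them (which is why the scores above are not in ascending or descending order). Since the score is kept alongside each item id below, this is presumably fine for the metrics, but if an ordered top-10 is wanted, the selected columns can be sorted; a minimal sketch:

# sort the selected top-K columns of each row by score, highest first
order = np.argsort(-scores, axis=1)
ids_sorted = np.take_along_axis(ids, order, axis=1)
scores_sorted = np.take_along_axis(scores, order, axis=1)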
9.6. Recommendations Matrix¶
Prepare the usual format (user_id, item_id, rating), which will enable us to quickly evaluate the metrics using the Experiment class from replay. We attach the item and score lists to each user & then explode them together
# prepare recommendations matrix
def prepare_recs(test_users,
                 rec_item_ids,
                 rec_relevances):
    predict = test_users.copy()
    predict[config.ITEM_COL] = rec_item_ids.tolist()  # list of recommended item indices per user
    predict['rating'] = rec_relevances.tolist()       # list of corresponding scores per user
    predict = predict.explode(column=[config.ITEM_COL, 'rating']).reset_index(drop=True)  # expand both lists
    predict[config.ITEM_COL] = predict[config.ITEM_COL].astype(int)
    predict['rating'] = predict['rating'].astype("double")
    return predict
model_recommendations = prepare_recs(test_users,             # test user column
                                     rec_item_ids=ids,       # indices of the top 10 items
                                     rec_relevances=scores)  # scores of the top 10 items
model_recommendations
|   | user_id | item_id | rating |
|---|---------|---------|--------|
| 0 | 1238 | 2133 | 0.371867 |
| 1 | 1238 | 1198 | 0.399217 |
| 2 | 1238 | 346 | 0.439462 |
| 3 | 1238 | 1156 | 0.488467 |
| 4 | 1238 | 2620 | 0.517409 |
| ... | ... | ... | ... |
| 11215 | 5309 | 1115 | 0.197690 |
| 11216 | 5309 | 2130 | 0.171694 |
| 11217 | 5309 | 2480 | 0.256298 |
| 11218 | 5309 | 1505 | 0.161175 |
| 11219 | 5309 | 1795 | 0.279239 |
11220 rows × 3 columns
We'll evaluate the overlap between the predictions & the test set using HitRate, which measures how often the model places at least one relevant item in a user's recommendations; NDCG, which evaluates how well the model orders the relevant items; and Coverage, which measures what share of all available items the model recommends
metrics = Experiment(
[NDCG(config.K), HitRate(config.K), Coverage(config.K)],
study.test,
study.train,
query_column=config.USER_COL,
item_column=config.ITEM_COL,
)
metrics.add_result("dssm_model", model_recommendations)
metrics.results
|   | NDCG@10 | HitRate@10 | Coverage@10 |
|---|---------|------------|-------------|
| dssm_model | 0.058016 | 0.317291 | 0.231294 |
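As a sanity check, HitRate@10 can also be computed directly in pandas as the share of test users for whom at least one recommended item appears in their (rating-filtered) test interactions; a minimal sketch using the frames above:

# manual HitRate@10: users with at least one recommended item in their test interactions
hits = (model_recommendations
        .merge(study.test[[config.USER_COL, config.ITEM_COL]],
               on=[config.USER_COL, config.ITEM_COL],
               how='inner')
        [config.USER_COL].nunique())
n_test_users = model_recommendations[config.USER_COL].nunique()
print(hits / n_test_users)  # should roughly match the HitRate@10 reported above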