Simple DSSM
!pip install -q replay-rec rs_datasets --quiet
from datetime import datetime as dt
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from rs_datasets import MovieLens
from replay.splitters.time_splitter import TimeSplitter
from replay.preprocessing.filters import MinCountFilter, LowRatingFilter
from replay.metrics import NDCG, HitRate, Coverage, Experiment
from dataclasses import dataclass
@dataclass
class config:
    USER_COL: str = 'user_id'
    ITEM_COL: str = 'item_id'
    RATING_COL: str = 'rating'
    TIMESTAMP: str = 'timestamp'
    NUM_EPOCHS: int = 30
    K: int = 10
    SEED: int = 123

config = config()

random.seed(config.SEED)
torch.manual_seed(config.SEED)
np.random.seed(config.SEED)
Background¶
In this notebook we will look at how to use a neural network approach to recommendations

- Implicit feedback will be used
- The scalar product of the user_id and item_id embeddings will be our relevance scores
- User-film interactions will be our positive feedback; negative samples, created by random sampling, will be our negative feedback
- The dataset is split into two: train will be used to fit the model on historical user data, test will be used to evaluate the recommendations made for users
- The model will be taught to differentiate between items a user has interacted with (positive) and items they have not (negative); a minimal sketch of the scoring idea follows below
1 | Load Dataset¶
We will be using the MovieLens 1M benchmark dataset, which contains roughly one million interactions
class MovieLensPrepare:

    def __init__(self):
        rs = MovieLens('1m')
        self.data = rs.ratings
        self.u_features = rs.users
        self.i_features = rs.items

    def preprocess(self):
        data = self.data
        u_features = self.u_features
        i_features = self.i_features
        # keep only users with at least 20 interactions
        data = MinCountFilter(num_entries=20).transform(data)
        # interactions and user & item features must be synchronised
        data = data[data[config.USER_COL].isin(u_features[config.USER_COL].unique())]
        data = data[data[config.ITEM_COL].isin(i_features[config.ITEM_COL].unique())].copy()
        print(f"Number of unique users {data[config.USER_COL].nunique()}")
        print(f"Number of unique items {data[config.ITEM_COL].nunique()}")
        # the timestamp column is in unix seconds; convert it to datetime
        data[config.TIMESTAMP] = pd.to_datetime(data[config.TIMESTAMP], unit='s')
        self.data = data

    def split_data(self):
        data = self.data
        u_features = self.u_features
        i_features = self.i_features
        splitter = TimeSplitter(time_threshold=0.2,  # last 20% (in time) into the test subset
                                drop_cold_users=True,
                                drop_cold_items=True,
                                query_column=config.USER_COL)
        train, test = splitter.split(data)
        print('train size', train.shape[0])
        print('test size', test.shape[0])
        # keep only the user & item features that appear in the train interactions
        u_features = u_features[u_features[config.USER_COL].isin(train[config.USER_COL].unique())].copy()
        i_features = i_features[i_features[config.ITEM_COL].isin(train[config.ITEM_COL].unique())].copy()
        # encoder for users
        encoder_user = LabelEncoder()
        encoder_user.fit(train[config.USER_COL])
        # encoder for items
        encoder_item = LabelEncoder()
        encoder_item.fit(train[config.ITEM_COL])
        train[config.USER_COL] = encoder_user.transform(train[config.USER_COL])
        train[config.ITEM_COL] = encoder_item.transform(train[config.ITEM_COL])
        test[config.USER_COL] = encoder_user.transform(test[config.USER_COL])
        test[config.ITEM_COL] = encoder_item.transform(test[config.ITEM_COL])
        u_features[config.USER_COL] = encoder_user.transform(u_features[config.USER_COL])
        i_features[config.ITEM_COL] = encoder_item.transform(i_features[config.ITEM_COL])
        self.train = train
        self.test = test
        self.u_features = u_features
        self.i_features = i_features
        self.encoder_user = encoder_user
        self.encoder_item = encoder_item

    def filter_test(self):
        # keep only highly rated test interactions
        filter_rating = LowRatingFilter(value=4)
        self.test = filter_rating.transform(self.test)

study = MovieLensPrepare()
Extract the relevant subsets of the data: interactions, user features & item features. We should also note that the timestamp feature is in unix seconds, which we'll need to convert to datetime later
2 | Preprocessing¶
- Replay contains a handy & quick way of preprocessing interactions
- MinCountFilter can be used to filter out interactions from groups with fewer than num_entries entries
- Let's use this method to remove users with fewer than 20 interactions; a rough pandas equivalent is sketched below
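For intuition, here is a rough pandas equivalent of this step (an assumption about MinCountFilter's behaviour; the library call below is what we actually use).

# rough pandas equivalent of MinCountFilter(num_entries=20) on the user column
# (assumed behaviour: keep only users with at least 20 interactions)
ratings = study.data  # raw interactions loaded above
user_counts = ratings.groupby(config.USER_COL)[config.ITEM_COL].transform('count')
manual_filtered = ratings[user_counts >= 20]
print(len(ratings), len(manual_filtered))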
study.preprocess()
Number of unique users 6040 Number of unique items 3706
Let's check the user_id and item_id statistics after filtering and make sure our interactions are synchronised with both the user and item feature subsets.
Now let's convert the time feature to datetime, so we can more easily interpret how to split the dataset in time. In our problem, we assume that the test interactions have not happened yet
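The conversion itself is a single pandas call; a quick standalone check of what unit='s' does:

# unix seconds -> pandas datetime, the same conversion used inside preprocess()
ts = pd.Series([978307200, 978310800])
print(pd.to_datetime(ts, unit='s'))
# 0   2001-01-01 00:00:00
# 1   2001-01-01 01:00:00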
3 | Splitting Dataset in time¶
- The next step after preprocessing the dataset to our liking is to split it into subsets, so we can train the model on one subset and use another (20%) for model validation
- replay has a class named TimeSplitter, which we will use to create our subsets; its signature is shown below, followed by a rough manual equivalent
class TimeSplitter(replay.splitters.base_splitter.Splitter) | TimeSplitter(time_threshold: Union[datetime.datetime, str, float], query_column: str = 'query_id', drop_cold_users: bool = False, drop_cold_items: bool = False, item_column: str = 'item_id', timestamp_column: str = 'timestamp', session_id_column: Optional[str] = None, session_id_processing_strategy: str = 'test', time_column_format: str = '%Y-%m-%d %H:%M:%S')
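For intuition, a rough manual equivalent of a 20% time-based holdout (this assumes time_threshold=0.2 means "last 20% of interactions by time"; the cold-user/item dropping that TimeSplitter performs is skipped here).

# manual sketch of a time-based split: last 20% of interactions (by timestamp) go to test
data = study.data
threshold = data[config.TIMESTAMP].quantile(0.8)
manual_train = data[data[config.TIMESTAMP] <= threshold]
manual_test = data[data[config.TIMESTAMP] > threshold]
print(len(manual_train), len(manual_test))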
study.split_data()
train size 800164 test size 104452
4 | Rating Filter¶
We want to recommend only items that have been rated highly, so for the test subset we will use LowRatingFilter to remove interactions with low ratings
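The pandas equivalent is a one-liner, assuming LowRatingFilter(value=4) keeps interactions rated 4 and above:

# rough pandas equivalent of LowRatingFilter(value=4) on the test subset
# (assumed threshold semantics: keep rating >= 4)
manual_test = study.test[study.test[config.RATING_COL] >= 4]
print(len(study.test), len(manual_test))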
study.filter_test()
5 | Create Torch Dataset¶
We need to create a torch dataset from our matrix of interaction data

- For each index, the dataset TowerTrain's __getitem__ returns the user_id and a positive item_id (positive_item_id) taken from the interaction dataset; this is our positive feedback
- Additionally, for this user_id we generate a number of random item_id values, which serve as negative samples the user hasn't watched
- Both the positive and negative item ids are concatenated into a single array (items)
- Lastly we also return the labels, 1 for the positive sample and 0 for each negative sample
from torch.utils.data import Dataset, DataLoader

class TowerTrain(Dataset):

    def __init__(self,
                 data,
                 num_negatives=10,
                 i_features=None,
                 u_features=None):
        # keep only the (user, item) columns as a numpy array
        self.data = data[[config.USER_COL, config.ITEM_COL]].to_numpy()
        self.num_negatives = num_negatives
        self.num_items = len(np.unique(self.data[:, 1]))
        self.i_features = i_features
        self.u_features = u_features

    def __len__(self):
        return len(self.data)

    # get one row of interaction data
    def __getitem__(self, idx):
        # index -> user_id, positive item_id
        user_id, positive_item_id = self.data[idx, 0], self.data[idx, 1]
        # for each positive item_id create num_negatives random negative samples
        items = torch.tensor(np.hstack([positive_item_id,
                                        np.random.randint(
                                            low=0,
                                            high=self.num_items,
                                            size=self.num_negatives)]),
                             dtype=torch.int32)
        # first label marks the positive sample, the rest are negatives
        labels = torch.zeros(self.num_negatives + 1, dtype=torch.float32)
        labels[0] = 1.  # positive label
        return {'user_ids': torch.tensor([user_id], dtype=torch.int32),
                'item_ids': items,
                'labels': labels}
We create the dataset and a dataloader and inspect a sample batch. Here we use a batch size of 2 for illustration (training later uses 1024); each batch is built by collating the __getitem__ outputs for the sampled rows
# create dataset
ds_train = TowerTrain(study.train)
# create data loader
dl_train = DataLoader(ds_train,
batch_size=2,
shuffle=True,
num_workers=0)
batch = next(iter(dl_train))
batch
{'user_ids': tensor([[ 163], [3579]], dtype=torch.int32), 'item_ids': tensor([[1036, 3582, 3437, 3454, 1346, 1122, 1766, 3089, 2154, 1147, 1593], [1806, 3286, 1761, 96, 2161, 2686, 47, 73, 1568, 942, 2272]], dtype=torch.int32), 'labels': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])}
6 | Model Definition¶
We will create a subclass SimpleTower, which only contains an embedding layer for either user_id or item_id

- The forward method simply returns the rows of the embedding matrix for the given ids
- BaseTwoHead combines a user tower and an item tower and calculates the dot product between the user_id and item_id embeddings, which gives us the scores
# each tower contains only an embedding layer, but we can
# expand on this by adding user / item features
class SimpleTower(nn.Module):

    def __init__(self, num_embeddings, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, emb_dim)

    def forward(self, ids, features=None):
        return self.emb(ids)

class BaseTwoHead(nn.Module):

    def __init__(self,
                 emb_dim,
                 user_config=None,
                 item_config=None):
        super().__init__()
        self.emb_dim = emb_dim
        self.user_tower = SimpleTower(emb_dim=emb_dim, **user_config)  # (n_users, emb_dim)
        self.item_tower = SimpleTower(emb_dim=emb_dim, **item_config)  # (n_items, emb_dim)

    # the forward method runs the two 'towers'
    # and takes the scalar product of their outputs,
    # which gives us the scores
    def forward(self, batch):
        item_emb = self.item_tower(batch["item_ids"])  # (batch, 11, 16)
        user_emb = self.user_tower(batch["user_ids"])  # (batch, 1, 16)
        dot_product = (user_emb * item_emb).sum(dim=-1)  # (batch, 11)
        return dot_product

    # methods for extracting embeddings
    def infer_users(self, batch):
        return self.user_tower(batch["user_ids"])

    def infer_items(self, batch):
        return self.item_tower(batch["item_ids"])
Let's check the contents of the model, using an embedding size of 16
# model parameters
embed_config = {'emb_dim' : 16} # embedding dimension
user_config = {'num_embeddings' : study.train[config.USER_COL].max() + 1,} # number of users
item_config = {'num_embeddings' : study.train[config.ITEM_COL].max() + 1,} # number of items
# import the embedding dimension
model = BaseTwoHead(**embed_config,
user_config=user_config,
item_config=item_config)
model
BaseTwoHead( (user_tower): SimpleTower( (emb): Embedding(5400, 16) ) (item_tower): SimpleTower( (emb): Embedding(3662, 16) ) )
Model forward pass

- The output of the model gives us the logits for each of the 11 items (1 positive + 10 negatives) in every user row
# output for a single batch
output = model(batch)
output
tensor([[ 1.6632, 5.8888, 0.0997, 7.6885, 8.2156, 4.0495, 3.0272, 1.9775, -1.8750, 4.3952, 0.2714], [ 5.3873, -10.4797, -4.2230, -0.4488, 0.9215, -5.0823, -0.5018, 4.9579, 0.8251, -6.3608, -4.5723]], grad_fn=<SumBackward1>)
Output size
output.size()
torch.Size([2, 11])
7 | Extracting Embeddings¶
We can extract the embeddings for both users and items using the methods below and use them for inference. For such a simple model this is straightforward; below is an example for a batch of size two
# extract embeddings from model (here items)
i_embeddings = model.infer_items(batch)
i_embeddings.size()
torch.Size([2, 11, 16])
u_embeddings = model.infer_users(batch)
u_embeddings.size()
torch.Size([2, 1, 16])
8 | Training the network¶
We need to define an optimiser, a loss function and the datasets in the form of data loaders. As we are setting the problem up as binary classification, we'll use BCEWithLogitsLoss; a quick check of its behaviour is sketched below
optimizer = torch.optim.Adam(model.parameters(),
lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()
# create train dataset
ds_train = TowerTrain(study.train)
# create train data loader
dl_train = DataLoader(ds_train,
batch_size=1024,
shuffle=True,
num_workers=0)
# create test dataset
ds_test = TowerTrain(study.test)
# create test data loader
dl_test = DataLoader(ds_test,
batch_size=1024,
shuffle=True,
num_workers=0)
train_loss_per_epoch = []
test_loss_per_epoch = []

# loop over all epochs
for epoch in tqdm(range(config.NUM_EPOCHS)):

    # training loop over all batches
    model.train()
    train_loss = 0.0
    for iteration, batch in enumerate(dl_train):
        optimizer.zero_grad()
        preds = model(batch)
        loss = loss_fn(preds, batch['labels'])
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(dl_train)

    # evaluation loop over all batches (no gradients needed)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for iteration, batch in enumerate(dl_test):
            preds = model(batch)
            loss = loss_fn(preds, batch['labels'])
            test_loss += loss.item()
    # average test loss per batch
    test_loss /= len(dl_test)

    test_loss_per_epoch.append(test_loss)
    train_loss_per_epoch.append(train_loss)
Save our trained model for later use!
# save our model state
torch.save(model.state_dict(), f"/content/model_{config.NUM_EPOCHS}")
9 | Generating user recommendations¶
The time has come to use our trained model!

- We will make recommendations for the test users using the model that was fitted on the train subset
- To make predictions, we extract the embedding matrix weights for users and items, calculate the scores, and take the top K items for each user by score
9.1. Load Weights¶
First things first, we need to load the model weights
import warnings; warnings.filterwarnings('ignore')
model = BaseTwoHead(**embed_config,
user_config=user_config,
item_config=item_config)
model.load_state_dict(torch.load(f"/content/model_{config.NUM_EPOCHS}"))
model.eval()
BaseTwoHead( (user_tower): SimpleTower( (emb): Embedding(5400, 16) ) (item_tower): SimpleTower( (emb): Embedding(3662, 16) ) )
9.2. Get test users¶
Get the user identifiers that are in the test set, which was saved in study.test
test_users = study.test[[config.USER_COL]].drop_duplicates().reset_index(drop=True)
test_users.head()
|   | user_id |
|---|---------|
| 0 | 1238 |
| 1 | 0 |
| 2 | 2146 |
| 3 | 1380 |
| 4 | 3180 |
9.3. Extract Weights¶
Extract the embedding weights for all users and items, which are stored in the model
# extract the user / item embedding weights
user_embed = model.user_tower.emb.weight.detach().cpu().numpy()
item_embed = model.item_tower.emb.weight.detach().cpu().numpy()
user_embed.shape, item_embed.shape
((5400, 16), (3662, 16))
9.4. Scalar product¶
Calculate the score for each user & item combination by taking the scalar product of their embeddings
# calculate the scores, shape (num_test_users, num_items)
scores = user_embed[test_users[config.USER_COL].values] @ item_embed.T
print(scores)
[[-8.98283534e-03 -2.23088098e+00 -1.21248198e+00 ... -5.93518734e+00 -6.74669886e+00 -3.57267737e+00] [ 1.21680886e-01 -1.92409587e+00 -2.67904949e+00 ... -5.99316978e+00 -3.68449354e+00 -1.20745528e+00] [-3.68786573e-01 -3.50460958e+00 -3.39616084e+00 ... -4.76325846e+00 -2.48652792e+00 -9.43809271e-01] ... [ 5.35706937e-01 -3.02921438e+00 -2.88318610e+00 ... -3.14549780e+00 -3.66571522e+00 -1.52589762e+00] [-6.87072992e-01 -1.82784998e+00 -2.26169515e+00 ... -6.18041801e+00 -5.24198675e+00 -3.18532085e+00] [-7.03839183e-01 -1.79352736e+00 -2.59766245e+00 ... -9.07467270e+00 -7.98052073e+00 -4.78053665e+00]]
9.5. Get highest scores¶
Get the indices (ids) of the top K scores for each user & their corresponding values (scores). The indices are the encoded item ids produced by encoder_item, which we stored in the class instance study
# get top 10 idx by value & get its value
ids = np.argpartition(scores, -config.K)[:, -config.K:]
scores = np.take_along_axis(scores, ids, axis=1)
scores[:5]
array([[ 0.37186688, 0.3992171 , 0.4394618 , 0.4884671 , 0.5174091 , 0.87457657, 0.6083931 , 0.58975196, 0.8642359 , 0.78722566], [ 0.4257152 , 0.4830652 , 0.50204533, 0.50319844, 0.6602011 , 0.8257402 , 0.8278695 , 0.69933504, 0.88355327, 0.8429366 ], [ 0.9651828 , 0.9905893 , 1.058987 , 1.0769 , 1.2267288 , 1.4366896 , 1.284672 , 1.255043 , 1.2931157 , 1.2413594 ], [-0.11466098, -0.10546018, -0.1026658 , -0.09919392, -0.0560105 , -0.03253222, 0.02805819, -0.07913139, 0.00199754, -0.0307106 ], [ 0.5338577 , 0.558586 , 0.6305356 , 0.6141302 , 0.632113 , 0.7719943 , 0.6591141 , 0.8380035 , 0.65727115, 0.7047775 ]], dtype=float32)
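Note that np.argpartition only guarantees the top-K values end up in the last K positions; it does not sort them (which is why the scores above are not in ascending or descending order). Since the score is kept alongside each item id below, this is presumably fine for the metrics, but if an ordered top-10 is wanted, the selected columns can be sorted; a minimal sketch:

# sort the selected top-K columns of each row by score, highest first
order = np.argsort(-scores, axis=1)
ids_sorted = np.take_along_axis(ids, order, axis=1)
scores_sorted = np.take_along_axis(scores, order, axis=1)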
9.6. Recommendations Matrix¶
Prepare the usual format (user_id, item_id, rating), which will enable us to quickly evaluate the metrics using the Experiment class from replay. We attach the item and score lists to each user & then explode them together
# prepare recommendations matrix
def prepare_recs(test_users,
                 rec_item_ids,
                 rec_relevances):
    predict = test_users.copy()
    predict[config.ITEM_COL] = rec_item_ids.tolist()  # list of recommended item indices per user
    predict['rating'] = rec_relevances.tolist()       # list of corresponding scores per user
    predict = predict.explode(column=[config.ITEM_COL, 'rating']).reset_index(drop=True)  # expand both lists
    predict[config.ITEM_COL] = predict[config.ITEM_COL].astype(int)
    predict['rating'] = predict['rating'].astype("double")
    return predict
model_recommendations = prepare_recs(test_users,             # test user column
                                     rec_item_ids=ids,       # indices of the top 10 items
                                     rec_relevances=scores)  # scores of the top 10 items
model_recommendations
|   | user_id | item_id | rating |
|---|---------|---------|--------|
| 0 | 1238 | 2133 | 0.371867 |
| 1 | 1238 | 1198 | 0.399217 |
| 2 | 1238 | 346 | 0.439462 |
| 3 | 1238 | 1156 | 0.488467 |
| 4 | 1238 | 2620 | 0.517409 |
| ... | ... | ... | ... |
| 11215 | 5309 | 1115 | 0.197690 |
| 11216 | 5309 | 2130 | 0.171694 |
| 11217 | 5309 | 2480 | 0.256298 |
| 11218 | 5309 | 1505 | 0.161175 |
| 11219 | 5309 | 1795 | 0.279239 |
11220 rows × 3 columns
We'll evaluate the overlap between the predictions & the test set using HitRate, which measures how often the model places at least one relevant item in a user's recommendations; NDCG, which evaluates how well the model orders the relevant items; and Coverage, which measures what share of all available items the model recommends
metrics = Experiment(
[NDCG(config.K), HitRate(config.K), Coverage(config.K)],
study.test,
study.train,
query_column=config.USER_COL,
item_column=config.ITEM_COL,
)
metrics.add_result("dssm_model", model_recommendations)
metrics.results
|   | NDCG@10 | HitRate@10 | Coverage@10 |
|---|---------|------------|-------------|
| dssm_model | 0.058016 | 0.317291 | 0.231294 |
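As a sanity check, HitRate@10 can also be computed directly in pandas as the share of test users for whom at least one recommended item appears in their (rating-filtered) test interactions; a minimal sketch using the frames above:

# manual HitRate@10: users with at least one recommended item in their test interactions
hits = (model_recommendations
        .merge(study.test[[config.USER_COL, config.ITEM_COL]],
               on=[config.USER_COL, config.ITEM_COL],
               how='inner')
        [config.USER_COL].nunique())
n_test_users = model_recommendations[config.USER_COL].nunique()
print(hits / n_test_users)  # should roughly match the HitRate@10 reported above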