In [21]:
import os
import json
import pickle
from copy import deepcopy

import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from lightfm import LightFM
from implicit.nearest_neighbours import BM25Recommender

%matplotlib inline
In [2]:
DATA_PATH = "rekko_sand_rekko"
In [3]:
# limit BLAS/OpenMP threading (ideally these are set before numpy/implicit are imported)
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'
In [4]:
%load_ext Cython

In [22]:
%%cython

def average_precision(
        dict data_true,
        dict data_predicted,
        const unsigned long int k
) -> float:
    cdef:
        unsigned long int n_items_predicted
        unsigned long int n_items_true
        unsigned long int n_correct_items
        unsigned long int item_idx

        double average_precision_sum
        double precision

        set items_true
        list items_predicted

    if not data_true:
        raise ValueError('data_true is empty')

    average_precision_sum = 0.0

    for key, items_true in data_true.items():
        items_predicted = data_predicted.get(key, [])

        n_items_true = len(items_true)
        n_items_predicted = min(len(items_predicted), k)

        if n_items_true == 0 or n_items_predicted == 0:
            continue

        n_correct_items = 0
        precision = 0.0

        for item_idx in range(n_items_predicted):
            if items_predicted[item_idx] in items_true:
                n_correct_items += 1
                precision += <double>n_correct_items / <double>(item_idx + 1)

        average_precision_sum += <double>precision / <double>min(n_items_true, k)

    return average_precision_sum / <double>len(data_true)

def metric(true_data, predicted_data, k=20):
    true_data_set = {user: set(items) for user, items in true_data.items()}

    return average_precision(true_data_set, predicted_data, k=k)
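
A quick smoke test of the metric on toy data: the predictions for user 0 hit its two relevant items at positions 1 and 3, so AP@20 = (1/1 + 2/3) / 2 ≈ 0.8333.

In [ ]:
# Smoke test of the Cython metric on toy dictionaries
metric({0: [1, 2]}, {0: [1, 3, 2]})  # expected ≈ 0.8333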

Prepare data

In [6]:
transactions = pickle.load(open(os.path.join(DATA_PATH, "transactions.pkl"), "rb"))
transactions = transactions[["ts", "rating", "label"]]
transactions.reset_index([0, 1], inplace=True)
transactions.element_uid = transactions.element_uid.astype(np.int16)
transactions.user_uid = transactions.user_uid.astype(np.int32)
transactions.rating = transactions.rating.astype(float)
In [7]:
bookmarks = pd.read_pickle(os.path.join(DATA_PATH, "bookmarks.pkl"))
bookmarks.element_uid = bookmarks.element_uid.astype(np.int16)
bookmarks.user_uid = bookmarks.user_uid.astype(np.int32)
In [8]:
user_encoder = LabelEncoder()
user_encoder.fit(transactions.user_uid)
element_encoder = LabelEncoder()
element_encoder.fit(transactions.element_uid)
Out[8]:
LabelEncoder()
In [23]:
q8 = transactions.ts.quantile(0.8)
In [31]:
class DataPart:

    def __init__(self, transactions, bookmarks, qs, qe=1):
        q1 = transactions.ts.quantile(qs)
        q2 = transactions.ts.quantile(qe)
        # split the interactions by time into train and test
        transactions_train = transactions[transactions.ts <= q1]
        transactions_test = transactions[(transactions.ts > q1) & (transactions.ts <= q2)]
        
        bookmarks = bookmarks[bookmarks.ts <= q1][["user_uid", "element_uid"]]

        train_users = transactions_train.user_uid.unique()
        transactions_test = transactions_test[transactions_test.user_uid.isin(train_users)]


        transactions_train.user_uid = user_encoder.transform(transactions_train.user_uid).astype(np.int32)
        transactions_test.user_uid = user_encoder.transform(transactions_test.user_uid).astype(np.int32)
        bookmarks.user_uid = user_encoder.transform(bookmarks.user_uid).astype(np.int32)

        transactions_train.element_uid = element_encoder.transform(transactions_train.element_uid).astype(np.int16)
        transactions_test.element_uid = element_encoder.transform(transactions_test.element_uid).astype(np.int16)
        bookmarks.element_uid = element_encoder.transform(bookmarks.element_uid).astype(np.int16)
        # delete_interactions_df: used to filter seen interactions when validating the first-level models
        delete_interactions_df = transactions_train[transactions_train.label != -1].drop_duplicates()
        
        # build features: time of the first and of the last interaction
        min_time = transactions_train.ts.min()
        transactions_train.ts = transactions_train.ts.apply(lambda x: round((x - min_time) / (3600 * 12)))

        user_ts = pd.DataFrame()
        user_ts["min_ts"] = transactions_train.groupby("user_uid")["ts"].min()
        user_ts["max_ts"] = transactions_train.groupby("user_uid")["ts"].max()
        max_abs = user_ts.max_ts.max()
        user_ts["diff_ts"] = user_ts.max_ts - user_ts.min_ts
        user_ts = user_ts / max_abs   
        self.user_ts = user_ts.copy()
        
        transactions_train.drop(columns=["ts"], inplace=True)
        transactions_test.drop(columns=["ts"], inplace=True)
        # sum the rating over the different interaction types: for example, a film
        # could first be bookmarked, then watched and rated
        transactions_train.rating = transactions_train.groupby(["user_uid", "element_uid"],
                                                               as_index=True)["rating"].transform("sum")
        transactions_train.label = transactions_train.groupby(["element_uid", "user_uid"],
                                                               as_index=True)["label"].transform("max")
        transactions_train = transactions_train.drop_duplicates()
        

        transactions_test.rating = transactions_test.groupby(["user_uid", "element_uid"],
                                                               as_index=True)["rating"].transform("sum")
        transactions_test.label = transactions_test.groupby(["element_uid", "user_uid"],
                                                               as_index=True)["label"].transform("max")
        transactions_test = transactions_test.drop_duplicates()
        # keep only the bookmarks the user did not get to watch during the train period
        transactions_made = transactions_train[transactions_train.label != -1].set_index(["user_uid", "element_uid"])
        bookmarks.set_index(["user_uid", "element_uid"], inplace=True)
        bookmarks = bookmarks.loc[bookmarks.index.difference(transactions_made.index)]
        bookmarks_set = bookmarks.reset_index([0, 1]).groupby("user_uid")["element_uid"].apply(set).to_dict()
        # drop users with too few positives from the evaluation sample: users with zero
        # positives never enter users_test below, and those with exactly one are removed here
        user_cnt = transactions_test[transactions_test.label == 1].user_uid.value_counts()
        user_rear = set(user_cnt[user_cnt == 1].index)
        
        self.user_cnt = user_cnt.to_dict().copy()

        users_test = set(transactions_test[transactions_test.label == 1].user_uid.unique().astype(np.int32))
        users_test = sorted(list(users_test.difference(user_rear)))
        elements_test = list(range(len(element_encoder.classes_)))
        
        # drop content watched fewer than 100 times
        element_cnt = transactions_train.element_uid.value_counts()
        rear_train = set(element_cnt[element_cnt < 100].index)
        # user_test_filter: items to exclude from each user's top produced by the first-level models
        user_test_filter = delete_interactions_df[delete_interactions_df.user_uid.isin(users_test)] \
            .groupby("user_uid")["element_uid"].apply(set).to_dict()
        
        # filter = items already consumed in train, plus rare items, minus the user's unwatched bookmarks
        user_test_filter = {
            user: user_test_filter.get(user, set()).union(rear_train).difference(bookmarks_set.get(user, set()))
            for user in users_test
        }

        self.user_test_filter = user_test_filter.copy()
        # user_test_true: used to validate the second-level model
        user_test_true = transactions_test[transactions_test.label == 1].groupby("user_uid")["element_uid"].apply(set)
        user_test_true_df = transactions_test[transactions_test.label==1][["user_uid", "element_uid", "label"]].drop_duplicates()
        user_test_true_df.set_index(["user_uid", "element_uid"], inplace=True)
        self.user_test_true_df = user_test_true_df.copy()

        self.user_test_true = user_test_true.to_dict()
        # sparse matrices for training/validating the first-level models;
        # the +0.01 keeps zero-rating interactions from getting zero sample weight
        self.X_train = sp.coo_matrix((list(transactions_train.rating + 0.01),
                                (list(transactions_train.user_uid), list(transactions_train.element_uid))))

        self.X_del = sp.coo_matrix((list(delete_interactions_df.rating),
                                (list(delete_interactions_df.user_uid), list(delete_interactions_df.element_uid))))

        which_test = transactions_test.label == 1
        self.X_test = sp.coo_matrix((list(transactions_test[which_test].rating),
                                (list(transactions_test[which_test].user_uid),
                                 list(transactions_test[which_test].element_uid))))
        
        self.transactions_test = transactions_test.copy()
        self.transactions_train = transactions_train.copy()
        self.delete_interactions_df = delete_interactions_df.copy()
        self.users_test = users_test.copy()
        self.elements_test = elements_test.copy()
        self.bookmarks = bookmarks.copy()
        self.rear_train = rear_train.copy()
In [ ]:
data_q8 = DataPart(transactions, bookmarks, 0.8)
/opt/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:5159: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
/opt/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py:4167: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
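
These SettingWithCopyWarning messages come from assigning to columns of sliced frames inside DataPart; they are benign here, since boolean-mask indexing returns a copy. A quick look at the artifacts the class exposes (sizes depend on the split):

In [ ]:
# Shapes/sizes of the main DataPart artifacts
print(data_q8.X_train.shape)          # user x item training matrix
print(data_q8.X_test.shape)           # positive interactions in the test window
print(len(data_q8.users_test))        # users eligible for evaluation
print(len(data_q8.user_test_filter))  # per-user sets of items to drop from the top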

LightFM

In [ ]:
# Compute MAP@20 for LightFM
def compute_map(data):
    # ranks[u, i] is the 0-based rank of test item i in user u's predictions,
    # with train interactions excluded via data.X_del
    ranks = data.model_mf.predict_rank(data.X_test, data.X_del, num_threads=80, check_intersections=False)
    # mask keeps only items ranked inside the top 20
    mask = ranks.copy()
    mask.data = np.less(mask.data, 20, mask.data)
    ranks.data += 1  # switch to 1-based ranks
    ranks.data = ranks.data * mask.data
    ranks.eliminate_zeros()
    ranks = ranks.tolil().data  # per-user lists of 1-based ranks of the hits
    average_precision_sum = 0.0
    for x in data.indices:
        n_correct_items = 0
        precision = 0
        for y in sorted(ranks[x]):
            n_correct_items += 1
            precision += n_correct_items / y
        average_precision_sum += precision / min(data.total[x], 20)
    average_precision_sum /= len(data.indices)
    return average_precision_sum
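
To see what the loop above computes: a user whose two relevant items land at 1-based ranks 1 and 3 contributes AP@20 = (1/1 + 2/3) / 2. A minimal pure-Python replay of the inner loop:

In [ ]:
# Replay of compute_map's inner loop for a single user (illustration only)
ranks_for_user = [3, 1]   # 1-based ranks of the user's hits within the top 20
total_relevant = 2        # what data.total[x] would hold for this user
n_correct_items, precision = 0, 0.0
for y in sorted(ranks_for_user):
    n_correct_items += 1
    precision += n_correct_items / y
print(precision / min(total_relevant, 20))  # (1/1 + 2/3) / 2 ≈ 0.8333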
In [ ]:
def train_lfm(data):
    maps = []
    data.model_mf = LightFM(no_components=200, loss='warp', learning_schedule='adagrad', user_alpha=6e-5,
                       item_alpha=2e-5, learning_rate=0.01, max_sampled=150)
    data.total = data.X_test.getnnz(axis=1)
    data.indices = np.nonzero(data.total)[0]
    for i in tqdm(range(10)):
        data.model_mf.fit_partial(data.X_train, sample_weight=data.X_train, epochs=10, num_threads=80)
        maps.append(compute_map(data))
        print(maps)
In [ ]:
train_lfm(data_q8)

In [ ]:
def make_bm25(data):
    data.model_bm25 = BM25Recommender(K=150, B=0.8)
    data.model_bm25.fit(data.X_train.T.tocsr().astype(np.float32))
    
make_bm25(data_q8)
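
BM25Recommender reweights the item-user matrix with BM25 and then runs an item-item nearest-neighbour search; note that K=150 above is the number of neighbours to keep, not BM25's K1 parameter. The weighting step itself can be inspected with implicit's bm25_weight (a sketch):

In [ ]:
from implicit.nearest_neighbours import bm25_weight

# The same kind of reweighting BM25Recommender applies internally (B as above)
weighted = bm25_weight(data_q8.X_train.T.tocsr(), B=0.8)
print(weighted.shape, weighted.nnz)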

Collect the top recommendations from the first-level models

In [ ]:
def get_bm25_df(data, topk=200):
    data.X_train = data.X_train.tocsr()
    data.bm25_dict = dict()
    data.bm25_pairs = []
    data.bm25_user_sets = dict()
    for user in tqdm(data.users_test):
        rec_current = data.model_bm25.recommend(user, data.X_train, N=10000,
                                                filter_already_liked_items=False)

        current_extend = []
        current_set = set()
        filter_current = set(data.user_test_filter.get(user, []))

        for rank, rec in enumerate(rec_current):
            # additionally drop recommendations with a non-positive score
            if rec[0] not in filter_current and rec[1] > 0:
                data.bm25_dict[(user, rec[0])] = (rec[1], rank + 1)
                if len(current_extend) < topk:
                    current_extend.append((user, rec[0]))
                    current_set.add(rec[0])
                if len(current_extend) >= topk:
                    break
        data.bm25_pairs.extend(current_extend)
        data.bm25_user_sets[user] = current_set

get_bm25_df(data_q8, topk=200)
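
A quick look at the candidate pool this produces (counts depend on the data; roughly topk pairs per test user):

In [ ]:
print(len(data_q8.bm25_pairs))
sample_pair = data_q8.bm25_pairs[0]
print(sample_pair, data_q8.bm25_dict[sample_pair])  # (user, item) -> (score, rank)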
In [ ]:
def get_lfm_df(data, topk=200, test_mode=False):
    user_biases = data.model_mf.user_biases[data.users_test]
    item_biases = data.model_mf.item_biases[data.elements_test]

    user_embeddings = data.model_mf.user_embeddings[data.users_test]
    item_embeddings = data.model_mf.item_embeddings[data.elements_test]

    lightfm_dot_product = user_embeddings.dot(item_embeddings.T)
    lightfm_prediction = lightfm_dot_product + user_biases.reshape(-1, 1) + item_biases.reshape(1, -1)
    lightfm_prediction_elements = (-lightfm_prediction).argsort(axis=1)
    lightfm_prediction_values = -np.sort(-lightfm_prediction, axis=1)
    

    elements_lightfm = dict(zip(data.users_test, lightfm_prediction_elements))
    values_lightfm = dict(zip(data.users_test, lightfm_prediction_values))
    
    data.user_biases_series = pd.Series(user_biases, index=data.users_test, name="user_bias")
    data.item_biases_series = pd.Series(item_biases, index=data.elements_test, name="item_bias")
    
    data.lightfm_dict = dict()
    data.lightfm_pairs = []
    for user in tqdm(data.users_test):
        current_extend = []
        current_values = values_lightfm[user]

        filter_current = data.user_test_filter.get(user, set())

        for rank, (element, value) in enumerate(zip(elements_lightfm[user], current_values)):
            if element not in filter_current:
                if len(current_extend) < topk:
                    data.lightfm_dict[(user, element)] = (value, rank + 1)
                    current_extend.append((user, element))
                if len(current_extend) >= topk:
                    break
        
        data.lightfm_pairs.extend(current_extend)

    data.user_embeddings = pd.DataFrame(data.model_mf.user_embeddings[data.users_test], index=data.users_test)
In [ ]:
get_lfm_df(data_q8, topk=200)
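
Since the model was trained without side features, LightFM's predict should reduce to the dot product plus biases, which is exactly what get_lfm_df computes by hand. A quick spot-check (a sketch):

In [ ]:
# Manual scores vs. LightFM's own predict for one test user
u = data_q8.users_test[0]
items = np.arange(len(data_q8.model_mf.item_biases), dtype=np.int32)
scores = data_q8.model_mf.predict(u, items)
manual = (data_q8.model_mf.user_embeddings[u] @ data_q8.model_mf.item_embeddings.T
          + data_q8.model_mf.user_biases[u] + data_q8.model_mf.item_biases)
print(np.allclose(scores, manual, atol=1e-5))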
In [ ]:
with open(os.path.join("data", 'catalogue.json'), 'r') as f:
    catalogue = json.load(f)
    
catalog = pd.DataFrame({int(k): v for k, v in catalogue.items()}).transpose()

for feature in ["purchase", "rent", "subscription"]:
    catalog[feature] = catalog.availability.apply(lambda x: feature in x).astype(int)
catalog.drop(columns=["availability", "attributes"], inplace=True)

catalog.duration += 5


type_encoder = LabelEncoder()
catalog["type"] = type_encoder.fit_transform(catalog["type"])

for column in ["duration", "feature_1", "feature_2", "feature_3", "feature_4", "feature_5"]:
    catalog[column] = catalog[column].astype(float)
catalog.drop(columns=["feature_1"], inplace=True)
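
A sanity check on the resulting catalogue features (illustrative only):

In [ ]:
print(catalog.dtypes)
print(catalog[["purchase", "rent", "subscription"]].mean())  # availability shares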
In [ ]:
def final_join(data):
    all_pairs = set(data.bm25_pairs).union(set(data.lightfm_pairs))
    data.user_test_true_df = data.user_test_true_df.label.to_dict()
    data.bookmarks["bookmark"] = 1
    data.bookmarks = data.bookmarks.bookmark.to_dict()
    
    X = []
    for pair in tqdm(all_pairs):
        current = pair + data.bm25_dict.get(pair, (np.nan, np.nan)) + data.lightfm_dict.get(pair, (np.nan, np.nan))
        current = current + (data.user_test_true_df.get(pair, 0), ) + (data.bookmarks.get(pair, 0), )
        X.append(current)
    
    X = pd.DataFrame(X, columns=["user_uid", "element_uid", "bm25_v", "bm25_r",
                                 "lfm_v", "lfm_r", "label", "bookmark"])
    # pairs missing from a model's candidate top get a large sentinel rank
    X.bm25_r.fillna(2000, inplace=True)
    X.lfm_r.fillna(2000, inplace=True)
    
    catalog_copy = catalog.copy()
    catalog_copy = catalog_copy.loc[catalog_copy.index.intersection(pd.Index(element_encoder.classes_))]
    catalog_copy.index.names = ["element_uid"]
    catalog_copy.reset_index(0, inplace=True)
    catalog_copy.element_uid = element_encoder.transform(catalog_copy.element_uid).astype(np.int16)
    
    X = X.merge(catalog_copy, on="element_uid", how="left")
    X = X.merge(data.user_biases_series, left_on="user_uid", right_index=True)
    X = X.merge(data.item_biases_series, left_on="element_uid", right_index=True)
    X = X.merge(data.user_embeddings, left_on="user_uid", right_index=True, how="left")
    X = X.merge(data.user_ts, left_on="user_uid", right_index=True, how="left")

    user_with_pos = X[X.label == 1].user_uid.unique()
    element_with_pos = X[X.label == 1].element_uid.unique()
    X = X[X.user_uid.isin(user_with_pos)]
    X = X[X.element_uid.isin(element_with_pos)]
    X.set_index("element_uid", inplace=True)
    return X
In [ ]:
X = final_join(data_q8) 
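
Before splitting for the second-level model, it is worth checking the size and class balance of the candidate table (a sketch):

In [ ]:
print(X.shape)
print(X.label.mean())  # share of positives among the generated candidates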
In [ ]:
users_train, users_test = train_test_split(X.user_uid.unique(), random_state=42)
In [ ]:
X_train = X[X.user_uid.isin(users_train)]
X_test = X[X.user_uid.isin(users_test)]
y_train = X_train.pop("label")
y_test = X_test.pop("label")

X_train = X_train.reset_index(0).set_index(["user_uid", "element_uid"])
X_test = X_test.reset_index(0).set_index(["user_uid", "element_uid"])
In [ ]:
import lightgbm
train_data = lightgbm.Dataset(X_train, y_train)
test_data = lightgbm.Dataset(X_test, y_test)

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "min_data_in_leaf": 80,
    "feature_fraction": 0.8,
    'verbose': 1,
    "num_threads": 20,
    "lambda_l1": 0.5,
    "lambda_l2": 0.1,
}

# "type" is the only label-encoded categorical column (an assumption)
categorical_feature = ["type"]

model = lightgbm.train(parameters,
                       train_data,
                       categorical_feature=categorical_feature,
                       valid_sets=test_data,
                       num_boost_round=3000,
                       early_stopping_rounds=100,
                       verbose_eval=10)
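
To see what drives the reranker, one can inspect LightGBM's gain-based feature importances (a sketch using the standard Booster API):

In [ ]:
importance = pd.Series(model.feature_importance(importance_type="gain"),
                       index=model.feature_name())
print(importance.sort_values(ascending=False).head(10))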
In [ ]:
X_test["lgb_score"] = model.predict(X_test, num_iteration=model.best_iteration)
lgb_res = X_test.reset_index([0, 1])[["user_uid",
                                        "element_uid",
                                        "lgb_score"]].sort_values("lgb_score",
                                                                  ascending=False)
user_elements = dict()
for user, group in tqdm(lgb_res.groupby("user_uid")):
    user_elements[user] = list(group.element_uid)[:20]
In [ ]:
# MAP@20 of the reranked top-20 for the held-out users
test_true = {user: data_q8.user_test_true[user] for user in users_test}
In [ ]:
metric(test_true, user_elements)
In [ ]:
#model.save_model("model.txt")