import pickle
import scipy.sparse as sp
from tqdm import tqdm
import os
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
import pandas as pd
from lightfm import LightFM
from implicit.nearest_neighbours import BM25Recommender
from copy import deepcopy
from sklearn.model_selection import train_test_split
# Directory with the preprocessed rekko pickles.
DATA_PATH = "rekko_sand_rekko"
# Pin BLAS/OpenMP to one thread so implicit/LightFM's own thread pools
# are not oversubscribed.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'
%load_ext Cython
%%cython
# Compiled (Cython) Mean Average Precision at cutoff k.
def average_precision(
    dict data_true,
    dict data_predicted,
    const unsigned long int k
) -> float:
    """MAP@k averaged over all users in ``data_true``.

    data_true: {user -> set of relevant items}.
    data_predicted: {user -> ranked list of recommended items}.
    Users absent from ``data_predicted`` (or with empty truth/prediction)
    contribute 0, but the mean is still taken over len(data_true).
    Raises ValueError if ``data_true`` is empty.
    """
    cdef:
        unsigned long int n_items_predicted
        unsigned long int n_items_true
        unsigned long int n_correct_items
        unsigned long int item_idx
        double average_precision_sum
        double precision
        set items_true
        list items_predicted
    if not data_true:
        raise ValueError('data_true is empty')
    average_precision_sum = 0.0
    for key, items_true in data_true.items():
        items_predicted = data_predicted.get(key, [])
        n_items_true = len(items_true)
        n_items_predicted = min(len(items_predicted), k)
        if n_items_true == 0 or n_items_predicted == 0:
            continue
        n_correct_items = 0
        precision = 0.0
        for item_idx in range(n_items_predicted):
            if items_predicted[item_idx] in items_true:
                n_correct_items += 1
                # precision@(item_idx+1), accumulated only at hit positions
                # (standard AP definition).
                precision += <double>n_correct_items / <double>(item_idx + 1)
        # Normalise so a perfect top-k list scores exactly 1.0.
        average_precision_sum += <double>precision / <double>min(n_items_true, k)
    return average_precision_sum / <double>len(data_true)
def metric(true_data, predicted_data, k=20):
    """MAP@k wrapper: convert ground-truth item lists to sets first.

    Fix: the original dict comprehension used ``k, v`` as loop variables,
    shadowing the ``k`` cutoff parameter inside the comprehension — harmless
    in Python 3 (comprehensions have their own scope) but confusing to read.
    """
    true_data_set = {user: set(items) for user, items in true_data.items()}
    return average_precision(true_data_set, predicted_data, k=k)
# --- Load raw interaction data -------------------------------------------
# NOTE(review): pickle.load is only safe on trusted local files.
transactions = pickle.load(open(os.path.join(DATA_PATH, "transactions.pkl"), "rb"))
transactions = transactions[["ts", "rating", "label"]]
# Move the (presumably user_uid/element_uid) index levels back into columns.
transactions.reset_index([0, 1], inplace=True)
# Downcast ids to save memory — assumes element ids fit in int16; TODO confirm.
transactions.element_uid = transactions.element_uid.astype(np.int16)
transactions.user_uid = transactions.user_uid.astype(np.int32)
transactions.rating = transactions.rating.astype(float)
bookmarks = pd.read_pickle(os.path.join(DATA_PATH, "bookmarks.pkl"))
bookmarks.element_uid = bookmarks.element_uid.astype(np.int16)
bookmarks.user_uid = bookmarks.user_uid.astype(np.int32)
# Fit the id encoders on the FULL log so every time split shares one id space.
user_encoder = LabelEncoder()
user_encoder.fit(transactions.user_uid)
element_encoder = LabelEncoder()
element_encoder.fit(transactions.element_uid)
# 80th percentile of the timestamp; DataPart recomputes this internally,
# so q8 looks unused below — TODO confirm.
q8 = transactions.ts.quantile(0.8)
class DataPart:
    """One time-based split of the interaction log plus all derived artefacts.

    Builds sparse train/test matrices for the first-level recommenders,
    per-user candidate filters, ground truth for the second-level model,
    and simple per-user time features.
    """

    def __init__(self, transactions, bookmarks, qs, qe=1):
        """``qs``/``qe`` are timestamp quantiles: train is ts <= q(qs),
        test is q(qs) < ts <= q(qe)."""
        q1 = transactions.ts.quantile(qs)
        q2 = transactions.ts.quantile(qe)
        # split interactions by time into train and test
        transactions_train = transactions[transactions.ts <= q1]
        transactions_test = transactions[(transactions.ts > q1) & (transactions.ts <= q2)]
        # only bookmarks created before the split point are visible at train time
        bookmarks = bookmarks[bookmarks.ts <= q1][["user_uid", "element_uid"]]
        # evaluate only on users that exist in train (drop cold-start users)
        train_users = transactions_train.user_uid.unique()
        transactions_test = transactions_test[transactions_test.user_uid.isin(train_users)]
        # re-encode raw uids to dense 0-based ids (encoders fitted globally).
        # NOTE(review): assigning into these slice views can trigger
        # SettingWithCopyWarning; works here but a .copy() upstream would be safer.
        transactions_train.user_uid = user_encoder.transform(transactions_train.user_uid).astype(np.int32)
        transactions_test.user_uid = user_encoder.transform(transactions_test.user_uid).astype(np.int32)
        bookmarks.user_uid = user_encoder.transform(bookmarks.user_uid).astype(np.int32)
        transactions_train.element_uid = element_encoder.transform(transactions_train.element_uid).astype(np.int16)
        transactions_test.element_uid = element_encoder.transform(transactions_test.element_uid).astype(np.int16)
        bookmarks.element_uid = element_encoder.transform(bookmarks.element_uid).astype(np.int16)
        # delete_interactions_df — used to filter already-seen interactions
        # when validating the first-level models (label == -1 rows excluded)
        delete_interactions_df = transactions_train[transactions_train.label != -1].drop_duplicates()
        # user features: time of first and last interaction, in 12-hour steps
        min_time = transactions_train.ts.min()
        transactions_train.ts = transactions_train.ts.apply(lambda x: round((x - min_time) / (3600 * 12)))
        user_ts = pd.DataFrame()
        user_ts["min_ts"] = transactions_train.groupby("user_uid")["ts"].min()
        user_ts["max_ts"] = transactions_train.groupby("user_uid")["ts"].max()
        max_abs = user_ts.max_ts.max()
        user_ts["diff_ts"] = user_ts.max_ts - user_ts.min_ts
        # scale all three time features by the global maximum
        user_ts = user_ts / max_abs
        self.user_ts = user_ts.copy()
        transactions_train.drop(columns=["ts"], inplace=True)
        transactions_test.drop(columns=["ts"], inplace=True)
        # sum ratings over different interaction types for the same pair:
        # e.g. a film may be bookmarked first, then watched and rated
        transactions_train.rating = transactions_train.groupby(["user_uid", "element_uid"],
                                                               as_index=True)["rating"].transform("sum")
        transactions_train.label = transactions_train.groupby(["element_uid", "user_uid"],
                                                              as_index=True)["label"].transform("max")
        transactions_train = transactions_train.drop_duplicates()
        transactions_test.rating = transactions_test.groupby(["user_uid", "element_uid"],
                                                             as_index=True)["rating"].transform("sum")
        transactions_test.label = transactions_test.groupby(["element_uid", "user_uid"],
                                                            as_index=True)["label"].transform("max")
        transactions_test = transactions_test.drop_duplicates()
        # keep only the bookmarks the user has NOT yet consumed within train
        transactions_made = transactions_train[transactions_train.label != -1].set_index(["user_uid", "element_uid"])
        bookmarks.set_index(["user_uid", "element_uid"], inplace=True)
        bookmarks = bookmarks.loc[bookmarks.index.difference(transactions_made.index)]
        bookmarks_set = bookmarks.reset_index([0, 1]).groupby("user_uid")["element_uid"].apply(set).to_dict()
        # exclude users with no positive items in test.
        # NOTE(review): user_rear additionally removes users with EXACTLY ONE
        # positive, despite the original comment saying "no content" —
        # presumably to stabilise the metric; confirm intent.
        user_cnt = transactions_test[transactions_test.label == 1].user_uid.value_counts()
        user_rear = set(user_cnt[user_cnt == 1].index)
        self.user_cnt = user_cnt.to_dict().copy()
        users_test = set(transactions_test[transactions_test.label == 1].user_uid.unique().astype(np.int32))
        users_test = sorted(list(users_test.difference(user_rear)))
        elements_test = list(range(len(element_encoder.classes_)))
        # drop content watched fewer than 100 times in train
        element_cnt = transactions_train.element_uid.value_counts()
        rear_train = set(list(element_cnt[element_cnt < 100].index))
        # user_test_filter — items to strip from each user's candidate top
        # after the first-level models: seen items plus rare items, minus
        # the user's still-unwatched bookmarks
        user_test_filter = delete_interactions_df[delete_interactions_df.user_uid.isin(users_test)].\
            groupby("user_uid")["element_uid"].apply(set).to_dict()
        user_test_filter = {user: user_test_filter.get(user, set()).union(rear_train).difference(bookmarks_set.get(user, set())) \
                            for user in users_test}
        self.user_test_filter = user_test_filter.copy()
        # user_test_true — ground truth for validating the second-level model
        user_test_true = transactions_test[transactions_test.label == 1].groupby("user_uid")["element_uid"].apply(set)
        user_test_true_df = transactions_test[transactions_test.label==1][["user_uid", "element_uid", "label"]].drop_duplicates()
        user_test_true_df.set_index(["user_uid", "element_uid"], inplace=True)
        self.user_test_true_df = user_test_true_df.copy()
        self.user_test_true = user_test_true.to_dict()
        # sparse matrices for fitting/validating the first-level models
        # (the +0.01 keeps zero-rating interactions as explicit nonzeros)
        self.X_train = sp.coo_matrix((list(transactions_train.rating + 0.01),
                                      (list(transactions_train.user_uid), list(transactions_train.element_uid))))
        self.X_del = sp.coo_matrix((list(delete_interactions_df.rating),
                                    (list(delete_interactions_df.user_uid), list(delete_interactions_df.element_uid))))
        which_test = transactions_test.label == 1
        self.X_test = sp.coo_matrix((list(transactions_test[which_test].rating),
                                     (list(transactions_test[which_test].user_uid),
                                      list(transactions_test[which_test].element_uid))))
        self.transactions_test = transactions_test.copy()
        self.transactions_train = transactions_train.copy()
        self.delete_interactions_df = delete_interactions_df.copy()
        self.users_test = users_test.copy()
        self.elements_test = elements_test.copy()
        self.bookmarks = bookmarks.copy()
        self.rear_train = rear_train.copy()
data_q8 = DataPart(transactions, bookmarks, 0.8)
# Compute MAP@20 for the LightFM model.
def compute_map(data):
    """MAP@20 over ``data.indices`` using LightFM's ``predict_rank``.

    ``predict_rank`` returns, for every nonzero of ``data.X_test``, the
    0-based rank the model assigns to that item for that user, with
    interactions from ``data.X_del`` excluded from ranking.
    """
    ranks = data.model_mf.predict_rank(data.X_test, data.X_del, num_threads=80, check_intersections=False)
    # mask.data becomes 1.0 where rank < 20, else 0.0 (np.less writes into
    # its third argument, used here as the `out` array).
    mask = ranks.copy()
    mask.data = np.less(mask.data, 20, mask.data)
    # Convert to 1-based ranks, then zero out everything outside the top-20.
    ranks.data += 1
    ranks.data = ranks.data * mask.data
    ranks.eliminate_zeros()
    # LIL-format .data gives, per user row, the list of surviving 1-based ranks.
    ranks = ranks.tolil().data
    average_precision_sum = 0.0
    for x in data.indices:
        n_correct_items = 0
        precision = 0
        # Every surviving rank is a hit; AP needs them in ascending rank order.
        for y in sorted(ranks[x]):
            n_correct_items += 1
            precision += n_correct_items / y
        # Normalise by min(#true items, 20); data.total[x] = nnz in X_test row x.
        average_precision_sum += precision / min(data.total[x], 20)
    average_precision_sum /= len(data.indices)
    return average_precision_sum
def train_lfm(data):
    """Train a LightFM WARP model: 10 rounds of 10 epochs, tracking MAP@20."""
    maps = []
    data.model_mf = LightFM(no_components=200, loss='warp', learning_schedule='adagrad', user_alpha=6e-5,
                            item_alpha=2e-5, learning_rate=0.01, max_sampled=150)
    # Per-user positive counts in X_test, and the users with >= 1 positive.
    data.total = data.X_test.getnnz(axis=1)
    data.indices = np.nonzero(data.total)[0]
    for i in tqdm(range(10)):
        # X_train doubles as the sample-weight matrix (rating-weighted WARP).
        data.model_mf.fit_partial(data.X_train, sample_weight=data.X_train, epochs=10, num_threads=80)
        maps.append(compute_map(data))
        # NOTE(review): source indentation was lost; printing per round
        # (inside the loop) matches the usual monitoring pattern — confirm.
        print(maps)
train_lfm(data_q8)
def make_bm25(data):
    """Fit an item-item BM25 recommender on the transposed train matrix."""
    recommender = BM25Recommender(K=150, B=0.8)
    # implicit expects an (item x user) CSR matrix of float32.
    item_user_matrix = data.X_train.T.tocsr().astype(np.float32)
    recommender.fit(item_user_matrix)
    data.model_bm25 = recommender
make_bm25(data_q8)
def get_bm25_df(data, topk=200):
    """Generate BM25 candidates for every test user.

    Fills:
      data.bm25_dict: (user, item) -> (score, 1-based rank) for every
        candidate seen before the user's top-k was filled;
      data.bm25_pairs: flat list of each user's top-``topk`` (user, item) pairs;
      data.bm25_user_sets: user -> set of that user's kept items.
    """
    data.X_train = data.X_train.tocsr()
    data.bm25_dict = dict()
    data.bm25_pairs = []
    data.bm25_user_sets = dict()
    for user in tqdm(data.users_test):
        # NOTE(review): assumes implicit's old recommend() API returning a
        # list of (item_id, score) tuples — confirm against installed version.
        rec_current = data.model_bm25.recommend(user, data.X_train, N=10000,
                                                filter_already_liked_items=False)
        current_extend = []
        current_set = set()
        filter_current = set(data.user_test_filter.get(user, []))
        for rank, rec in enumerate(rec_current):
            # additionally drop recommendations with a non-positive score
            if not rec[0] in filter_current and rec[1] > 0:
                data.bm25_dict[(user, rec[0])] = (rec[1], rank + 1)
                if len(current_extend) < topk:
                    current_extend.append((user, rec[0]))
                    current_set.add(rec[0])
            if len(current_extend) >= topk:
                break
        data.bm25_pairs.extend(current_extend)
        data.bm25_user_sets[user] = current_set
get_bm25_df(data_q8, topk=200)
def get_lfm_df(data, topk=200, test_mode=False):
    """Generate LightFM candidates for every test user.

    Scores every item as embedding dot-product plus both biases, sorts per
    user, and keeps the top-``topk`` unfiltered items.  Fills
    ``data.lighfm_dict`` ((user, item) -> (score, 1-based rank); the typo in
    the attribute name is intentional-looking but read by final_join, so it
    must not be renamed here), ``data.lightfm_pairs``, and the per-user
    bias/embedding features for the second-level model.
    ``test_mode`` is never used in this body — TODO confirm it can be removed.
    """
    user_biases = data.model_mf.user_biases[data.users_test]
    item_biases = data.model_mf.item_biases[data.elements_test]
    user_embeddings = data.model_mf.user_embeddings[data.users_test]
    item_embeddings = data.model_mf.item_embeddings[data.elements_test]
    # Full (n_test_users x n_items) score matrix.
    lightfm_dot_product = user_embeddings.dot(item_embeddings.T)
    lightfm_prediction = lightfm_dot_product + user_biases.reshape(-1, 1) + item_biases.reshape(1, -1)
    # Per-user item ids sorted by descending score, and the sorted scores.
    lightfm_prediction_elements = (-lightfm_prediction).argsort(axis=1)
    lightfm_prediction_values = -np.sort(-lightfm_prediction, axis=1)
    elements_lightfm = dict(list(zip(data.users_test, lightfm_prediction_elements)))
    values_lightfm = dict(list(zip(data.users_test, lightfm_prediction_values)))
    data.user_biases_series = pd.Series(user_biases, index=data.users_test, name="user_bias")
    data.item_biases_series = pd.Series(item_biases, index=data.elements_test, name="item_bias")
    data.lighfm_dict = dict()
    data.lightfm_pairs = []
    # user_id (enumerate counter) is unused; kept for parity with the original.
    for user_id, user in tqdm(enumerate(data.users_test)):
        current_extend = []
        current_values = values_lightfm[user]
        filter_current = data.user_test_filter.get(user, set())
        for rank, (element, value) in enumerate(zip(elements_lightfm[user], current_values)):
            if not element in filter_current:
                if len(current_extend) < topk:
                    data.lighfm_dict[(user, element)] = (value, rank + 1)
                    current_extend.append((user, element))
            if len(current_extend) >= topk:
                break
        data.lightfm_pairs.extend(current_extend)
    data.user_embeddings = pd.DataFrame(data.model_mf.user_embeddings[data.users_test], index=data.users_test)
get_lfm_df(data_q8, topk=200)
# --- Item catalogue features ---------------------------------------------
with open(os.path.join("data", 'catalogue.json'), 'r') as f:
    catalogue = json.load(f)
# Rows = elements (keyed by int id), columns = catalogue attributes.
catalog = pd.DataFrame({int(k): v for k, v in catalogue.items()}).transpose()
# One binary column per monetisation model, derived from the availability list.
for feature in ["purchase", "rent", "subscription"]:
    catalog[feature] = catalog.availability.apply(lambda x: feature in x).astype(int)
catalog.drop(columns=["availability", "attributes"], inplace=True)
# NOTE(review): reason for the +5 duration offset is not visible here —
# presumably undoing dataset anonymisation noise; confirm.
catalog.duration += 5
type_encoder = LabelEncoder()
catalog["type"] = type_encoder.fit_transform(catalog["type"])
for column in ["duration", "feature_1", "feature_2", "feature_3", "feature_4", "feature_5"]:
    catalog[column] = catalog[column].astype(float)
catalog.drop(columns=["feature_1"], inplace=True)
def final_join(data):
    """Assemble the second-level (LightGBM) table from both candidate sets.

    Union of BM25 and LightFM candidate pairs, joined with model
    scores/ranks, catalogue features, LightFM biases/embeddings, user time
    features, bookmark flags and the binary label.  Returns a DataFrame
    indexed by element_uid.

    Fixes vs the original:
      * ``data.user_test_true_df`` and ``data.bookmarks`` are no longer
        destructively overwritten with dicts — the function was not
        re-runnable and silently changed the DataPart's state;
      * sentinel-rank fillna is done by column assignment instead of chained
        ``Series.fillna(inplace=True)`` (deprecated / SettingWithCopy-prone).
    """
    all_pairs = set(data.bm25_pairs).union(set(data.lightfm_pairs))
    # (user, element) -> 1 lookups, built locally instead of clobbering attrs.
    true_labels = data.user_test_true_df.label.to_dict()
    bookmark_flags = pd.Series(1, index=data.bookmarks.index).to_dict()
    rows = []
    for pair in tqdm(all_pairs):
        current = pair + data.bm25_dict.get(pair, (np.nan, np.nan)) + data.lighfm_dict.get(pair, (np.nan, np.nan))
        current = current + (true_labels.get(pair, 0), ) + (bookmark_flags.get(pair, 0), )
        rows.append(current)
    X = pd.DataFrame(rows, columns=["user_uid", "element_uid", "bm25_v", "bm25_r",
                                    "lfm_v", "lfm_r", "label", "bookmark"])
    # Pairs missing from one model's candidate list get a sentinel rank of 2000.
    X["bm25_r"] = X["bm25_r"].fillna(2000)
    X["lfm_r"] = X["lfm_r"].fillna(2000)
    # Re-encode catalogue ids into the shared dense element-id space and join.
    catalog_copy = catalog.copy()
    catalog_copy = catalog_copy.loc[catalog_copy.index.intersection(pd.Index(element_encoder.classes_))]
    catalog_copy.index.names = ["element_uid"]
    catalog_copy.reset_index(0, inplace=True)
    catalog_copy.element_uid = element_encoder.transform(catalog_copy.element_uid).astype(np.int16)
    X = X.merge(catalog_copy, on="element_uid", how="left")
    X = X.merge(data.user_biases_series, left_on="user_uid", right_index=True)
    X = X.merge(data.item_biases_series, left_on="element_uid", right_index=True)
    X = X.merge(data.user_embeddings, left_on="user_uid", right_index=True, how="left")
    X = X.merge(data.user_ts, left_on="user_uid", right_index=True, how="left")
    # Keep only users/items with at least one positive so the binary
    # classifier never sees an all-negative group.
    user_with_pos = X[X.label == 1].user_uid.unique()
    element_with_pos = X[X.label == 1].element_uid.unique()
    X = X[X.user_uid.isin(user_with_pos)]
    X = X[X.element_uid.isin(element_with_pos)]
    X.set_index("element_uid", inplace=True)
    return X
# Build the level-2 dataset and split it BY USER so one user's pairs never
# span train and test.
X = final_join(data_q8)
# NOTE(review): this rebinds the global `users_test` (previously the list of
# all evaluation users) to the GBM hold-out subset — confirm intentional.
users_train, users_test = train_test_split(X.user_uid.unique(), random_state = 42)
X_train = X[X.user_uid.isin(users_train)]
X_test = X[X.user_uid.isin(users_test)]
# pop() removes the target column from the feature frames in place.
y_train = X_train.pop("label")
y_test = X_test.pop("label")
X_train = X_train.reset_index(0).set_index(["user_uid", "element_uid"])
X_test = X_test.reset_index(0).set_index(["user_uid", "element_uid"])
import lightgbm
train_data = lightgbm.Dataset(X_train, y_train)
test_data = lightgbm.Dataset(X_test, y_test)
# LightGBM settings for the second-level binary reranker.
parameters = {
    'application': 'binary',     # alias of 'objective' (redundant but harmless)
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    "bagging_freq": 1,           # resample rows every iteration...
    "bagging_fraction": 0.8,     # ...keeping 80% of them
    "min_data_in_leaf": 80,
    "feature_fraction": 0.8,     # 80% of features considered per tree
    'verbose': 1,
    "num_threads": 20,
    "lambda_l1": 0.5,            # L1 regularisation
    "lambda_l2": 0.1,            # L2 regularisation
}
# Train the reranker with early stopping on the user-held-out set.
# Fix: the original passed `categorical_feature=categorical_feature`, but
# `categorical_feature` is never defined anywhere in this script, so the
# call raised NameError.  Dropping the argument falls back to LightGBM's
# default 'auto', which infers categorical columns from pandas dtypes.
model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=3000,
                       early_stopping_rounds=100,
                       verbose_eval=10)
# Score hold-out pairs with the best iteration and keep each user's 20
# highest-scored elements as the final recommendation list.
X_test["lgb_score"] = model.predict(X_test, num_iteration=model.best_iteration)
lgb_res = X_test.reset_index([0, 1])[["user_uid",
                                      "element_uid",
                                      "lgb_score"]].sort_values("lgb_score",
                                                                ascending=False)
user_elements = dict()
for user, group in tqdm(lgb_res.groupby("user_uid")):
    # groupby preserves the global descending-score order within each group,
    # so the first 20 rows are the user's top-20.
    user_elements[user] = list(group.element_uid)[:20]
# Offline MAP@20 evaluation and model export (left disabled):
#test_true = {user:data_q8.user_test_true[user] for user in users_test}
#metric(test_true, user_elements)
#model.save_model("model.txt")