import pandas as pd
import os
import numpy as np
import json
import math
from tqdm import tqdm
from scipy import sparse as sp
from sklearn.model_selection import train_test_split
%matplotlib inline
# Directory that holds the raw dataset files read below.
DATA_PATH = "rekko_sand_rekko"

# Read the catalogue JSON: a mapping from element id (string key) to a
# dictionary of element attributes.
catalogue_path = os.path.join(DATA_PATH, 'catalogue.json')
with open(catalogue_path, 'r') as catalogue_file:
    catalogue = json.load(catalogue_file)

# Build one row per element, indexed by the integer element id.
catalog = pd.DataFrame({int(uid): attrs for uid, attrs in catalogue.items()}).T

# Expand the `availability` collection into one 0/1 indicator column per mode.
for feature in ("purchase", "rent", "subscription"):
    has_feature = catalog["availability"].apply(lambda modes: feature in modes)
    catalog[feature] = has_feature.astype(int)

# The raw list-like columns are not needed once the indicators exist.
catalog = catalog.drop(columns=["availability", "attributes"])

# Every duration is shifted by 5.
# NOTE(review): the reason is not stated here; presumably it avoids zero
# durations in the later division -- confirm.
catalog["duration"] = catalog["duration"] + 5
catalog
# Load the transaction log with explicit compact dtypes.
transactions = pd.read_csv(
    os.path.join(DATA_PATH, 'transactions.csv'),
    dtype={
        'user_uid': np.uint32,
        'element_uid': np.uint16,
        'consumption_mode': 'category',
        'ts': np.float64,
        'watched_time': np.uint64,
        'device_type': np.uint8,
        'device_manufacturer': np.uint8,
    },
)
transactions

# Rescale watched_time by 1/60.
# NOTE(review): presumably seconds -> minutes so that it matches the unit of
# catalog.duration -- confirm against the dataset description.
transactions["watched_time"] = transactions["watched_time"] / 60

# Attach the catalogue attributes of every watched element (left join keeps
# transactions whose element is missing from the catalogue).
transactions = pd.merge(
    transactions,
    catalog,
    how="left",
    left_on="element_uid",
    right_index=True,
)

# Fraction of the element's duration that was watched.
transactions["percent_watched"] = transactions["watched_time"] / transactions["duration"]
transactions
# Load the bookmarks with explicit dtypes.
bookmarks = pd.read_csv(
    os.path.join(DATA_PATH, 'bookmarks.csv'),
    dtype={
        'user_uid': np.uint32,
        'element_uid': np.uint16,
        'ts': np.float64,
    },
)

# Attach catalogue attributes, then add the columns the transactions frame
# has so both frames can be stacked later: bookmarks get the pseudo
# consumption mode "B", a fixed percent_watched of 0.5 and the label -1.
bookmarks = pd.merge(
    bookmarks,
    catalog,
    how="left",
    left_on="element_uid",
    right_index=True,
).assign(
    consumption_mode="B",
    percent_watched=0.5,
    label=-1,
)
bookmarks
# Load the explicit ratings with compact dtypes.
ratings = pd.read_csv(
    os.path.join(DATA_PATH, 'ratings.csv'),
    dtype={
        'user_uid': np.uint32,
        'element_uid': np.uint16,
        'ts': np.float64,
        'rating': np.uint8,
    },
)

# Keep a pickled copy of the raw ratings.
# NOTE(review): this goes to the working directory, while the other pickles
# in this file are written under DATA_PATH -- confirm this is intended.
raw_rating_columns = ["user_uid", "element_uid", "ts", "rating"]
ratings[raw_rating_columns].to_pickle("ratings.pkl")

# Attach catalogue attributes and add the columns shared with transactions:
# pseudo consumption mode "R", a percent_watched derived from the rating as
# (rating + 2) / 10, and the label 1. The raw rating column is then dropped.
# (An earlier variant, left disabled in the original, kept only ratings > 4.)
ratings = (
    pd.merge(ratings, catalog, how="left", left_on="element_uid", right_index=True)
    .assign(
        consumption_mode="R",
        percent_watched=lambda frame: (frame.rating + 2) / 10,
        label=1,
    )
    .drop(columns=["rating"])
)
ratings
The transactions and bookmarks tables overlap in about 240k records.
Некоторые фильмы пользователь мог посмотреть несколько раз.
Неизвестно, сколько серий в каждом из сериалов и частей в многосерийном фильме.
# Normalise percent_watched separately for each content type.

# watched_time / duration may come out as a non-float column, so force a
# float dtype before doing arithmetic on it.
transactions["percent_watched"] = transactions["percent_watched"].astype('float')
transactions

# Movies: cap the watched fraction to the range [0, 4].
# A single .loc call is used instead of `df.col[mask] = ...`: chained
# assignment may write into a temporary copy and is a no-op under pandas
# Copy-on-Write.
movie_mask = transactions.type == "movie"
transactions.loc[movie_mask, "percent_watched"] = \
    transactions.loc[movie_mask, "percent_watched"].clip(0, 4)

# Series: per-title reference value of percent_watched, computed over series
# rows whose duration is not 1.
# NOTE: despite its name, series_q75 holds the 0.5-quantile (median).
series_mask = transactions.type == "series"
series_q75 = transactions[series_mask & (transactions.duration != 1)] \
    .groupby("element_uid")["percent_watched"].quantile(0.5)
series_q75 = dict(series_q75)


def func(x):
    """Return the row's percent_watched relative to its title's reference value, capped at 1."""
    return min(x.percent_watched / series_q75[x.element_uid], 1)


transactions.loc[series_mask, "percent_watched"] = \
    transactions.loc[series_mask, ["percent_watched", "element_uid"]].apply(func, axis=1)

# Multipart movies: same scheme, with the per-title 0.2-quantile as reference.
# NOTE: despite its name, multipart_q75 holds the 0.2-quantile.
multipart_mask = transactions.type == "multipart_movie"
multipart_q75 = transactions[multipart_mask] \
    .groupby("element_uid")["percent_watched"].quantile(0.2)


def func(x):
    """Return the row's percent_watched relative to its title's reference value, capped at 1."""
    return min(x.percent_watched / multipart_q75[x.element_uid], 1)


transactions.loc[multipart_mask, "percent_watched"] = \
    transactions.loc[multipart_mask, ["percent_watched", "element_uid"]].apply(func, axis=1)
Пользователь потребил контент, если он купил или арендовал его, посмотрел более половины (для фильмов) или более трети (для сериалов).
# Binary "consumed" label: 1 when any of the three conditions holds.
# 1) the element was obtained in consumption mode "P" or "R";
bought_or_rented = transactions["consumption_mode"].isin(["P", "R"])
# 2) a non-series element was watched for more than half;
watched_non_series = (transactions["percent_watched"] > 0.5) & (transactions["type"] != "series")
# 3) a series was watched for more than a third.
watched_series = (transactions["percent_watched"] > 1/3) & (transactions["type"] == "series")

transactions["label"] = (bought_or_rented | watched_non_series | watched_series).astype(int)

# Share of positive labels, as a quick sanity check.
print(transactions["label"].mean())
Объединим все интеракции
# Stack transactions, bookmarks and ratings into one interactions frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Row labels of the inputs are kept as they are
# (ignore_index is left at its default), exactly as the two appends did, so
# the resulting index may contain duplicates.
transactions = pd.concat([transactions, bookmarks, ratings])
Составим агрегированный рейтинг
# Aggregated rating per interaction:
#   movie / multipart_movie -> percent_watched * 5
#   series                  -> percent_watched * 10
#   any other type          -> 0
# and a fixed 15 for rows whose consumption_mode is "P" or "R".
#
# The column starts as float because the values written below are floats.
# Assignments go through a single `.loc[mask, "rating"]` instead of the
# chained `df.rating.loc[mask] = ...`, which may modify a temporary copy and
# is a no-op under pandas Copy-on-Write. `.values` is used on the right-hand
# side so that no index alignment takes place.
transactions["rating"] = 0.0
for content_type, scale in (("movie", 5), ("multipart_movie", 5), ("series", 10)):
    type_mask = transactions.type == content_type
    transactions.loc[type_mask, "rating"] = \
        transactions.loc[type_mask, "percent_watched"].values * scale

# NOTE(review): rows that come from the ratings table are also tagged with
# consumption_mode "R", so they receive 15 here as well -- confirm this is
# intended.
transactions.loc[transactions.consumption_mode.isin(["P", "R"]), "rating"] = 15
Выкинем из выборки неактивных пользователей
# Number of interaction rows per user in the current frame.
user_cnt = transactions["user_uid"].value_counts()

# Users with at least three rows are kept; everyone else is dropped.
user_good = user_cnt[user_cnt >= 3].index.tolist()
transactions = transactions.loc[transactions["user_uid"].isin(user_good)]

# Total number of users versus number of users kept.
len(user_cnt), len(user_good)
# Index the interactions by (element, user) and keep only the columns that
# are written out.
transactions = transactions.set_index(["element_uid", "user_uid"])
transactions = transactions.loc[:, [
    "device_type", "device_manufacturer",
    "feature_1", "feature_2", "feature_3", "feature_4", "feature_5",
    "type", "purchase", "rent", "subscription",
    "label", "rating", "ts",
]]
transactions

# Persist the prepared interactions next to the raw data.
transactions.to_pickle(os.path.join(DATA_PATH, "transactions.pkl"))
Сохраним закладки для оставшихся в выборке пользователей
# Persist the bookmarks of the users that survived the activity filter,
# restricted to the user, element and timestamp columns.
kept_bookmarks = bookmarks.loc[
    bookmarks["user_uid"].isin(user_good),
    ["user_uid", "element_uid", "ts"],
]
kept_bookmarks.to_pickle(os.path.join(DATA_PATH, "bookmarks.pkl"))