Подготовим агрегированный рейтинг по всем взаимодействиям пользователя с контентом: транзакциям, добавлениям в закладки и проставленным оценкам.

import pandas as pd
import os
import numpy as np
import json
import math
from tqdm import tqdm
from scipy import sparse as sp

from sklearn.model_selection import train_test_split

%matplotlib inline

# Directory that holds the raw input files read below (catalogue.json,
# transactions.csv, bookmarks.csv, ratings.csv) and receives the
# transactions.pkl / bookmarks.pkl outputs.
DATA_PATH = "rekko_sand_rekko"

Catalog¶

# Load the item catalogue: a JSON object keyed by element id (as a string)
# with one record per item.  JSON is UTF-8 by specification, so the encoding
# is pinned instead of being left to the platform's locale default.
with open(os.path.join(DATA_PATH, "catalogue.json"), "r", encoding="utf-8") as f:
    catalogue = json.load(f)

# One row per item, indexed by the integer element id.
catalog = pd.DataFrame({int(uid): record for uid, record in catalogue.items()}).transpose()

# Turn the ``availability`` value into one 0/1 indicator column per mode.
# ``mode=feature`` binds the loop variable explicitly so the lambda never
# depends on late binding.
for feature in ["purchase", "rent", "subscription"]:
    catalog[feature] = catalog["availability"].apply(lambda modes, mode=feature: mode in modes).astype(int)
catalog = catalog.drop(columns=["availability", "attributes"])

# NOTE(review): the +5 offset is not explained anywhere in this notebook.
# Presumably durations are stored rounded down to a multiple of 10, and the
# shift centres the bucket and keeps the later division by ``duration`` away
# from zero — confirm against the dataset description.
catalog["duration"] = catalog["duration"] + 5

catalog

Transactions¶

# Read the raw viewing transactions.  Dtypes are pinned up front so the large
# table is parsed into compact integer / categorical columns.
transactions_path = os.path.join(DATA_PATH, "transactions.csv")
transactions_dtypes = {
    "element_uid": "uint16",
    "user_uid": "uint32",
    "consumption_mode": "category",
    "ts": "float64",
    "watched_time": "uint64",
    "device_type": "uint8",
    "device_manufacturer": "uint8",
}
transactions = pd.read_csv(transactions_path, dtype=transactions_dtypes)

transactions

# Rescale watched_time by 1/60 (presumably seconds -> minutes, to match the
# catalogue's ``duration`` unit — TODO confirm), then attach the catalogue
# attributes of every watched item.
transactions["watched_time"] = transactions["watched_time"].div(60)
transactions = transactions.merge(
    catalog,
    how="left",
    left_on="element_uid",
    right_index=True,
)

# Share of the item's duration that was watched; may exceed 1.
transactions["percent_watched"] = transactions["watched_time"] / transactions["duration"]

transactions

Bookmarks¶

# Read the bookmark events with compact, explicit dtypes.
bookmarks_path = os.path.join(DATA_PATH, "bookmarks.csv")
bookmarks = pd.read_csv(
    bookmarks_path,
    dtype={"element_uid": "uint16", "user_uid": "uint32", "ts": "float64"},
)

# Attach the catalogue attributes and give every bookmark the fixed values
# used for this interaction type: mode "B", percent_watched 0.5, label -1.
bookmarks = bookmarks.merge(
    catalog,
    how="left",
    left_on="element_uid",
    right_index=True,
).assign(
    consumption_mode="B",
    percent_watched=0.5,
    label=-1,
)

bookmarks

Ratings¶

# Read the explicit ratings with compact, explicit dtypes.
ratings_path = os.path.join(DATA_PATH, "ratings.csv")
ratings = pd.read_csv(
    ratings_path,
    dtype={
        "element_uid": "uint16",
        "user_uid": "uint32",
        "ts": "float64",
        "rating": "uint8",
    },
)

# Keep a copy of the raw ratings before they are reshaped.
# NOTE(review): this pickle goes to the current working directory, while the
# transactions / bookmarks pickles are written under DATA_PATH — confirm that
# is intentional.
ratings[["user_uid", "element_uid", "ts", "rating"]].to_pickle("ratings.pkl")

# Attach the catalogue attributes and express each rating in the columns the
# transactions table uses: percent_watched = (rating + 2) / 10, label = 1.
# NOTE(review): the marker "R" is the same string that the label/rating cells
# treat as "bought or rented" via isin(["P", "R"]); verify the overlap is
# intended.
ratings = (
    ratings.merge(catalog, how="left", left_on="element_uid", right_index=True)
    .assign(
        consumption_mode="R",
        percent_watched=lambda frame: (frame["rating"] + 2) / 10,
        label=1,
    )
    .drop(columns=["rating"])
)

ratings

transactions intersect bookmarks 240k times

prepare label¶

Некоторые фильмы пользователь мог посмотреть несколько раз.
Неизвестно сколько серий в каждом из сериалов и частей в многосерийном фильме.

# ``percent_watched`` was produced by dividing by the catalogue's ``duration``
# column; cast it to a plain float column before any further arithmetic.
transactions["percent_watched"] = transactions["percent_watched"].astype("float")

transactions


def _share_of_reference(frame, rows, reference):
    """Return ``percent_watched`` of the selected rows relative to a per-title reference.

    Args:
        frame: DataFrame with ``percent_watched`` and ``element_uid`` columns.
        rows: Boolean mask selecting the rows to normalise.
        reference: Mapping (dict or Series) from ``element_uid`` to the
            reference ``percent_watched`` value of that title.

    Returns:
        Series indexed like ``frame.loc[rows]`` holding
        ``percent_watched / reference`` capped at 1.  Titles missing from
        ``reference`` yield NaN.
    """
    watched = frame.loc[rows, "percent_watched"]
    baseline = frame.loc[rows, "element_uid"].map(reference).astype(float)
    share = watched / baseline
    # With a zero reference, x / 0 for x > 0 is +inf (capped to 1 below) but
    # 0 / 0 is NaN; nothing was watched on those rows, so count them as 0.
    share = share.mask((baseline == 0) & (watched == 0), 0.0)
    return share.clip(upper=1)


# Movies can be re-watched, so the ratio may exceed 1; cap it at 4 full views.
# A single .loc write replaces the chained assignment that raised
# SettingWithCopyWarning.
is_movie = transactions["type"] == "movie"
transactions.loc[is_movie, "percent_watched"] = transactions.loc[is_movie, "percent_watched"].clip(0, 4)

# Series: the number of episodes is unknown, so each title is normalised by
# the median ``percent_watched`` observed for it.
# NOTE(review): despite its name, ``series_q75`` holds the 0.5 quantile —
# confirm which quantile was intended.
is_series = transactions["type"] == "series"
series_q75 = dict(
    transactions[is_series & (transactions["duration"] != 1)]
    .groupby("element_uid")["percent_watched"]
    .quantile(0.5)
)
transactions.loc[is_series, "percent_watched"] = _share_of_reference(
    transactions, is_series, series_q75
).to_numpy()

# Multipart movies: same idea, normalised by the 0.2 quantile per title.
# NOTE(review): ``multipart_q75`` is likewise a misnomer for the 0.2 quantile.
is_multipart = transactions["type"] == "multipart_movie"
multipart_q75 = transactions[is_multipart].groupby("element_uid")["percent_watched"].quantile(0.2)
transactions.loc[is_multipart, "percent_watched"] = _share_of_reference(
    transactions, is_multipart, multipart_q75
).to_numpy()

/opt/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/opt/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:16: RuntimeWarning: divide by zero encountered in double_scalars
  app.launch_new_instance()
/opt/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:16: RuntimeWarning: invalid value encountered in double_scalars
  app.launch_new_instance()

Пользователь потребил контент, если он:

- посмотрел больше половины фильма;
- посмотрел больше трети сериала;
- купил контент или взял его в аренду.

# A transaction counts as "consumed" (label 1) when the item was bought or
# rented, or when enough of it was watched: more than 1/3 for series and more
# than 1/2 for every other type.
bought_or_rented = transactions["consumption_mode"].isin(["P", "R"])
watch_threshold = np.where(transactions["type"] == "series", 1 / 3, 0.5)
watched_enough = transactions["percent_watched"] > watch_threshold

transactions["label"] = (bought_or_rented | watched_enough).astype(int)
print(transactions["label"].mean())

0.675946270729519

Объединим все интеракции

# Stack transactions, bookmarks and ratings into one interaction table.
# ``DataFrame.append`` is deprecated and was removed in pandas 2.0;
# ``pd.concat`` gives the same result (row order and original index labels
# are kept, columns are unioned) with a single copy instead of two.
transactions = pd.concat([transactions, bookmarks, ratings])

Составим агрегированный рейтинг

# Aggregated rating = percent_watched scaled by a per-type multiplier.
# Rows whose type is missing or not listed keep the default rating of 0.
# The column is built in one vectorised step instead of chained
# ``transactions.rating.loc[...] = ...`` writes, which raised
# SettingWithCopyWarning and do not reach the DataFrame under copy-on-write.
rating_scale = transactions["type"].map({"movie": 5, "multipart_movie": 5, "series": 10})
scaled_rating = transactions["percent_watched"] * rating_scale
transactions["rating"] = scaled_rating.where(rating_scale.notna(), 0)

# Rows with mode "P" or "R" always get the top rating.
# NOTE(review): rows that came from the ratings table are also tagged "R", so
# they are overwritten with 15 here and their (rating + 2) / 10 score is
# discarded — confirm this is intended.
transactions.loc[transactions["consumption_mode"].isin(["P", "R"]), "rating"] = 15

/opt/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)

Выкинем из выборки неактивных пользователей

# Count interactions per user and keep only users with at least three.
user_cnt = transactions["user_uid"].value_counts()
user_good = user_cnt[user_cnt >= 3].index.tolist()
is_active_user = transactions["user_uid"].isin(user_good)
transactions = transactions[is_active_user]
len(user_cnt), len(user_good)

(500000, 500000)

# Index the interactions by (item, user) and keep only the columns that the
# saved dataset should carry.
kept_columns = [
    "device_type",
    "device_manufacturer",
    "feature_1",
    "feature_2",
    "feature_3",
    "feature_4",
    "feature_5",
    "type",
    "purchase",
    "rent",
    "subscription",
    "label",
    "rating",
    "ts",
]
transactions = transactions.set_index(["element_uid", "user_uid"])[kept_columns]

transactions

# Persist the prepared interaction table next to the raw data.
transactions_pickle = os.path.join(DATA_PATH, "transactions.pkl")
transactions.to_pickle(transactions_pickle)

Сохраним закладки для оставшихся в выборке пользователей

# Persist the bookmarks of the users that survived the activity filter,
# keeping only the identifying columns and the timestamp.
kept_bookmarks = bookmarks[bookmarks["user_uid"].isin(user_good)]
bookmarks_pickle = os.path.join(DATA_PATH, "bookmarks.pkl")
kept_bookmarks[["user_uid", "element_uid", "ts"]].to_pickle(bookmarks_pickle)

	type	duration	feature_1	feature_2	feature_3	feature_4	feature_5	purchase	rent	subscription
1983	movie	145	1.65722e+06	0.75361	39	1.11941	0	1	1	1
3783	movie	115	3.55652e+07	0.766254	41	1.1386	0.654707	1	1	1
5208	movie	95	1.32707e+07	0.765425	27	1.13181	0.592716	1	1	1
9744	movie	125	2.17499e+07	0.757874	26	1.13353	0.654707	1	1	1
1912	movie	115	9.21296e+06	0.759566	7	1.11013	0.654707	1	1	0
...	...	...	...	...	...	...	...	...	...	...
6643	series	55	4.25704e+07	0.766254	9	1.13353	0.654707	1	0	1
166	series	35	4.30677e+07	0.692949	7	1.14193	0.68041	0	0	1
9242	series	45	2.54234e+07	0.65145	3	1.13523	0.654707	1	0	0
6427	series	35	1.34526e+07	0.766254	16	1.11941	-1	0	0	1
8452	series	45	3.84502e+07	0.692949	11	1.14027	0.654707	0	0	1

	element_uid	user_uid	consumption_mode	ts	watched_time	device_type	device_manufacturer
0	3336	5177	S	4.430518e+07	4282	0	50
1	481	593316	S	4.430518e+07	2989	0	11
2	4128	262355	S	4.430518e+07	833	0	50
3	6272	74296	S	4.430518e+07	2530	0	99
4	5543	340623	P	4.430518e+07	6282	0	50
...	...	...	...	...	...	...	...
9643007	2252	180823	S	4.173063e+07	2503	0	11
9643008	8436	458827	S	4.173063e+07	8360	0	50
9643009	8888	50431	S	4.173063e+07	5763	0	11
9643010	6099	59148	S	4.173063e+07	6831	0	50
9643011	6189	283774	S	4.173063e+07	19586	0	11

	element_uid	user_uid	consumption_mode	ts	watched_time	device_type	device_manufacturer	type	duration	feature_1	feature_2	feature_3	feature_4	feature_5	purchase	rent	subscription	percent_watched
0	3336	5177	S	4.430518e+07	71.366667	0	50	movie	95	4.16611e+07	0.739609	45	1.14193	0.654707	1	1	1	0.751228
1	481	593316	S	4.430518e+07	49.816667	0	11	movie	55	4.29342e+07	0.750161	11	1.11941	0.592716	1	0	1	0.905758
2	4128	262355	S	4.430518e+07	13.883333	0	50	movie	105	2.77773e+07	0.750161	12	1.13008	0.654707	1	1	1	0.132222
3	6272	74296	S	4.430518e+07	42.166667	0	99	movie	105	4.04156e+07	0.675218	34	1.14027	0.68041	1	1	1	0.401587
4	5543	340623	P	4.430518e+07	104.700000	0	50	movie	75	9.21296e+06	0.783234	14	1.11388	0	1	1	0	1.396
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9643007	2252	180823	S	4.173063e+07	41.716667	0	11	movie	105	4.01637e+07	0.680525	19	1.14027	0.654707	1	1	1	0.397302
9643008	8436	458827	S	4.173063e+07	139.333333	0	50	movie	95	3.91147e+07	0.699002	21	1.14027	0.654707	1	1	1	1.46667
9643009	8888	50431	S	4.173063e+07	96.050000	0	11	movie	115	3.62738e+07	0.646848	10	1.14027	0.68041	1	1	0	0.835217
9643010	6099	59148	S	4.173063e+07	113.850000	0	50	movie	125	2.96135e+07	0.754467	19	1.13692	0.592716	1	1	1	0.9108
9643011	6189	283774	S	4.173063e+07	326.433333	0	11	movie	125	5.98079e+06	0.758298	5	1.10437	0	1	1	0	2.61147

	user_uid	element_uid	ts	type	duration	feature_1	feature_2	feature_3	feature_4	feature_5	purchase	rent	subscription	consumption_mode	percent_watched	label
0	301135	7185	4.430516e+07	movie	115	2.91937e+07	0.744484	33	1.13692	0.592716	1	1	1	B	0.5	-1
1	301135	4083	4.430516e+07	movie	95	4.27383e+07	0.666559	22	1.14193	0.68041	1	1	1	B	0.5	-1
2	301135	10158	4.430516e+07	movie	115	3.9665e+07	0.722234	22	1.14027	0.68041	1	1	1	B	0.5	-1
3	301135	2693	4.430516e+07	movie	135	6.40055e+06	0.823858	21	1.09847	0.654707	1	1	1	B	0.5	-1
4	301135	2181	4.430515e+07	movie	115	3.49453e+07	0.668198	8	1.1386	0.654707	1	1	1	B	0.5	-1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
948211	524752	2557	4.173079e+07	movie	135	4.00098e+07	0.73557	16	1.14027	0.68041	1	1	0	B	0.5	-1
948212	524752	8919	4.173077e+07	movie	115	4.11851e+07	0.79027	20	1.14027	0.68041	1	1	0	B	0.5	-1
948213	5174	3637	4.173076e+07	series	55	4.03679e+07	0.692949	10	1.14193	0.68041	0	0	1	B	0.5	-1
948214	161137	9700	4.173076e+07	movie	155	4.60956e+06	0.800194	9	1.09445	0.592716	1	1	0	B	0.5	-1
948215	26252	8460	4.173068e+07	movie	135	4.26264e+07	0.759566	27	1.14193	0.592716	1	0	0	B	0.5	-1

	user_uid	element_uid	ts	type	duration	feature_1	feature_2	feature_3	feature_4	feature_5	purchase	rent	subscription	consumption_mode	percent_watched	label
0	571252	1364	4.430517e+07	movie	135	1.67122e+06	0.795648	40	1.11574	0.592716	1	1	1	R	1.2	1
1	63140	3037	4.430514e+07	movie	195	6.34458e+06	0.806167	20	1.11574	0.592716	1	1	0	R	1.2	1
2	443817	4363	4.430514e+07	movie	115	6.68039e+06	0.755748	31	1.12833	0.592716	1	1	1	R	1.0	1
3	359870	1364	4.430506e+07	movie	135	1.67122e+06	0.795648	40	1.11574	0.592716	1	1	1	R	1.2	1
4	359870	3578	4.430506e+07	movie	105	9.05905e+06	0.783234	28	1.13008	0.654707	1	1	1	R	1.1	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
438785	170707	1539	4.173086e+07	movie	95	4.13672e+07	0.698501	25	1.14193	0.592716	1	1	1	R	1.0	1
438786	32659	6127	4.173085e+07	movie	135	4.14658e+07	0.726889	29	1.14193	0.68041	1	1	1	R	1.0	1
438787	353752	3336	4.173079e+07	movie	95	4.16611e+07	0.739609	45	1.14193	0.654707	1	1	1	R	1.0	1
438788	492350	7984	4.173078e+07	movie	85	1.51133e+07	0.658805	13	1.12833	0.654707	1	1	1	R	0.9	1
438789	374752	8919	4.173065e+07	movie	115	4.11851e+07	0.79027	20	1.14027	0.68041	1	1	0	R	0.4	1