Подготовим агрегированный рейтинг по всем интеракциям: транзакциям, добавлениям в закладки, проставлению рейтинга.

In [1]:
import pandas as pd
import os
import numpy as np
import json
import math
from tqdm import tqdm
from scipy import sparse as sp

from sklearn.model_selection import train_test_split

%matplotlib inline
In [2]:
# Directory holding the input files this notebook reads (catalogue.json,
# transactions.csv, bookmarks.csv, ratings.csv); the transactions and
# bookmarks pickles produced below are written into it as well.
DATA_PATH = "rekko_sand_rekko"

Catalog

In [37]:
# Load the per-title metadata; the JSON object is keyed by element uid
# (as a string) and each value is a dict of fields for that title.
catalogue_path = os.path.join(DATA_PATH, 'catalogue.json')
with open(catalogue_path, 'r') as f:
    catalogue = json.load(f)

# One row per title, indexed by the integer element uid.
catalog = pd.DataFrame({int(uid): fields for uid, fields in catalogue.items()}).T

# Expand `availability` into one 0/1 indicator column per consumption option.
for feature in ("purchase", "rent", "subscription"):
    is_available = catalog["availability"].map(lambda options: feature in options)
    catalog[feature] = is_available.astype(int)
catalog = catalog.drop(columns=["availability", "attributes"])

# NOTE(review): the reason for the +5 offset is not visible in this file —
# presumably it recentres a duration that was bucketed in the raw data; confirm.
catalog["duration"] = catalog["duration"] + 5
In [38]:
catalog
Out[38]:
type duration feature_1 feature_2 feature_3 feature_4 feature_5 purchase rent subscription
1983 movie 145 1.65722e+06 0.75361 39 1.11941 0 1 1 1
3783 movie 115 3.55652e+07 0.766254 41 1.1386 0.654707 1 1 1
5208 movie 95 1.32707e+07 0.765425 27 1.13181 0.592716 1 1 1
9744 movie 125 2.17499e+07 0.757874 26 1.13353 0.654707 1 1 1
1912 movie 115 9.21296e+06 0.759566 7 1.11013 0.654707 1 1 0
... ... ... ... ... ... ... ... ... ... ...
6643 series 55 4.25704e+07 0.766254 9 1.13353 0.654707 1 0 1
166 series 35 4.30677e+07 0.692949 7 1.14193 0.68041 0 0 1
9242 series 45 2.54234e+07 0.65145 3 1.13523 0.654707 1 0 0
6427 series 35 1.34526e+07 0.766254 16 1.11941 -1 0 0 1
8452 series 45 3.84502e+07 0.692949 11 1.14027 0.654707 0 0 1

10200 rows × 10 columns

Transactions

In [39]:
# Explicit narrow dtypes for the transactions log, passed to read_csv so the
# columns are not left to type inference.
transactions_dtypes = {
    'element_uid': np.uint16,
    'user_uid': np.uint32,
    'consumption_mode': 'category',
    'ts': np.float64,
    'watched_time': np.uint64,
    'device_type': np.uint8,
    'device_manufacturer': np.uint8,
}
transactions_path = os.path.join(DATA_PATH, 'transactions.csv')
transactions = pd.read_csv(transactions_path, dtype=transactions_dtypes)
In [40]:
transactions
Out[40]:
element_uid user_uid consumption_mode ts watched_time device_type device_manufacturer
0 3336 5177 S 4.430518e+07 4282 0 50
1 481 593316 S 4.430518e+07 2989 0 11
2 4128 262355 S 4.430518e+07 833 0 50
3 6272 74296 S 4.430518e+07 2530 0 99
4 5543 340623 P 4.430518e+07 6282 0 50
... ... ... ... ... ... ... ...
9643007 2252 180823 S 4.173063e+07 2503 0 11
9643008 8436 458827 S 4.173063e+07 8360 0 50
9643009 8888 50431 S 4.173063e+07 5763 0 11
9643010 6099 59148 S 4.173063e+07 6831 0 50
9643011 6189 283774 S 4.173063e+07 19586 0 11

9643012 rows × 7 columns

In [41]:
# Rescale watched_time by 1/60 so it can be compared with the catalogue
# duration (presumably seconds -> minutes; the units are not stated here).
transactions["watched_time"] = transactions["watched_time"] / 60

# Attach the catalogue columns of the watched title to every transaction;
# a left join keeps transactions whose title is absent from the catalogue.
transactions = pd.merge(
    transactions,
    catalog,
    how="left",
    left_on="element_uid",
    right_index=True,
)

# Share of the title's duration that was watched (may exceed 1).
transactions["percent_watched"] = transactions["watched_time"] / transactions["duration"]
In [42]:
transactions
Out[42]:
element_uid user_uid consumption_mode ts watched_time device_type device_manufacturer type duration feature_1 feature_2 feature_3 feature_4 feature_5 purchase rent subscription percent_watched
0 3336 5177 S 4.430518e+07 71.366667 0 50 movie 95 4.16611e+07 0.739609 45 1.14193 0.654707 1 1 1 0.751228
1 481 593316 S 4.430518e+07 49.816667 0 11 movie 55 4.29342e+07 0.750161 11 1.11941 0.592716 1 0 1 0.905758
2 4128 262355 S 4.430518e+07 13.883333 0 50 movie 105 2.77773e+07 0.750161 12 1.13008 0.654707 1 1 1 0.132222
3 6272 74296 S 4.430518e+07 42.166667 0 99 movie 105 4.04156e+07 0.675218 34 1.14027 0.68041 1 1 1 0.401587
4 5543 340623 P 4.430518e+07 104.700000 0 50 movie 75 9.21296e+06 0.783234 14 1.11388 0 1 1 0 1.396
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9643007 2252 180823 S 4.173063e+07 41.716667 0 11 movie 105 4.01637e+07 0.680525 19 1.14027 0.654707 1 1 1 0.397302
9643008 8436 458827 S 4.173063e+07 139.333333 0 50 movie 95 3.91147e+07 0.699002 21 1.14027 0.654707 1 1 1 1.46667
9643009 8888 50431 S 4.173063e+07 96.050000 0 11 movie 115 3.62738e+07 0.646848 10 1.14027 0.68041 1 1 0 0.835217
9643010 6099 59148 S 4.173063e+07 113.850000 0 50 movie 125 2.96135e+07 0.754467 19 1.13692 0.592716 1 1 1 0.9108
9643011 6189 283774 S 4.173063e+07 326.433333 0 11 movie 125 5.98079e+06 0.758298 5 1.10437 0 1 1 0 2.61147

9643012 rows × 18 columns

Bookmarks

In [43]:
# Explicit dtypes for the bookmarks log.
bookmarks_dtypes = {
    'element_uid': np.uint16,
    'user_uid': np.uint32,
    'ts': np.float64,
}
bookmarks_path = os.path.join(DATA_PATH, 'bookmarks.csv')
bookmarks = pd.read_csv(bookmarks_path, dtype=bookmarks_dtypes)
In [44]:
# Give bookmarks the same shape as transactions: catalogue columns plus
# constant stand-ins for the fields a bookmark does not have.
bookmarks = pd.merge(
    bookmarks,
    catalog,
    how="left",
    left_on="element_uid",
    right_index=True,
)
bookmarks = bookmarks.assign(
    consumption_mode="B",   # marker for "bookmark" rows
    percent_watched=0.5,    # fixed pseudo watch share for a bookmark
    label=-1,               # fixed label value for bookmark rows
)
In [58]:
bookmarks
Out[58]:
user_uid element_uid ts type duration feature_1 feature_2 feature_3 feature_4 feature_5 purchase rent subscription consumption_mode percent_watched label
0 301135 7185 4.430516e+07 movie 115 2.91937e+07 0.744484 33 1.13692 0.592716 1 1 1 B 0.5 -1
1 301135 4083 4.430516e+07 movie 95 4.27383e+07 0.666559 22 1.14193 0.68041 1 1 1 B 0.5 -1
2 301135 10158 4.430516e+07 movie 115 3.9665e+07 0.722234 22 1.14027 0.68041 1 1 1 B 0.5 -1
3 301135 2693 4.430516e+07 movie 135 6.40055e+06 0.823858 21 1.09847 0.654707 1 1 1 B 0.5 -1
4 301135 2181 4.430515e+07 movie 115 3.49453e+07 0.668198 8 1.1386 0.654707 1 1 1 B 0.5 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
948211 524752 2557 4.173079e+07 movie 135 4.00098e+07 0.73557 16 1.14027 0.68041 1 1 0 B 0.5 -1
948212 524752 8919 4.173077e+07 movie 115 4.11851e+07 0.79027 20 1.14027 0.68041 1 1 0 B 0.5 -1
948213 5174 3637 4.173076e+07 series 55 4.03679e+07 0.692949 10 1.14193 0.68041 0 0 1 B 0.5 -1
948214 161137 9700 4.173076e+07 movie 155 4.60956e+06 0.800194 9 1.09445 0.592716 1 1 0 B 0.5 -1
948215 26252 8460 4.173068e+07 movie 135 4.26264e+07 0.759566 27 1.14193 0.592716 1 0 0 B 0.5 -1

948216 rows × 16 columns

Ratings

In [45]:
# Explicit dtypes for the ratings log.
ratings_dtypes = {
    'element_uid': np.uint16,
    'user_uid': np.uint32,
    'ts': np.float64,
    'rating': np.uint8,
}
ratings_path = os.path.join(DATA_PATH, 'ratings.csv')
ratings = pd.read_csv(ratings_path, dtype=ratings_dtypes)
In [46]:
# Persist the raw ratings before they are transformed below.
# NOTE(review): this is written to the working directory, whereas the other
# pickles in this notebook go under DATA_PATH — confirm this is intended.
raw_rating_columns = ["user_uid", "element_uid", "ts", "rating"]
ratings.loc[:, raw_rating_columns].to_pickle("ratings.pkl")
In [47]:
# Give ratings the same shape as transactions: catalogue columns plus a
# pseudo watch share derived from the explicit rating.
# (An earlier variant that kept only ratings above 4 is disabled.)
ratings = pd.merge(
    ratings,
    catalog,
    how="left",
    left_on="element_uid",
    right_index=True,
)
# NOTE(review): "R" is also one of the codes matched by the
# `consumption_mode.isin(["P", "R"])` filters applied to transactions in this
# notebook, so any such filter run after the frames are combined will match
# these rating rows too — confirm that is intended.
ratings = ratings.assign(
    consumption_mode="R",
    percent_watched=(ratings["rating"] + 2) / 10,  # rating r -> (r + 2) / 10
    label=1,
).drop(columns=["rating"])
In [48]:
ratings
Out[48]:
user_uid element_uid ts type duration feature_1 feature_2 feature_3 feature_4 feature_5 purchase rent subscription consumption_mode percent_watched label
0 571252 1364 4.430517e+07 movie 135 1.67122e+06 0.795648 40 1.11574 0.592716 1 1 1 R 1.2 1
1 63140 3037 4.430514e+07 movie 195 6.34458e+06 0.806167 20 1.11574 0.592716 1 1 0 R 1.2 1
2 443817 4363 4.430514e+07 movie 115 6.68039e+06 0.755748 31 1.12833 0.592716 1 1 1 R 1.0 1
3 359870 1364 4.430506e+07 movie 135 1.67122e+06 0.795648 40 1.11574 0.592716 1 1 1 R 1.2 1
4 359870 3578 4.430506e+07 movie 105 9.05905e+06 0.783234 28 1.13008 0.654707 1 1 1 R 1.1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
438785 170707 1539 4.173086e+07 movie 95 4.13672e+07 0.698501 25 1.14193 0.592716 1 1 1 R 1.0 1
438786 32659 6127 4.173085e+07 movie 135 4.14658e+07 0.726889 29 1.14193 0.68041 1 1 1 R 1.0 1
438787 353752 3336 4.173079e+07 movie 95 4.16611e+07 0.739609 45 1.14193 0.654707 1 1 1 R 1.0 1
438788 492350 7984 4.173078e+07 movie 85 1.51133e+07 0.658805 13 1.12833 0.654707 1 1 1 R 0.9 1
438789 374752 8919 4.173065e+07 movie 115 4.11851e+07 0.79027 20 1.14027 0.68041 1 1 0 R 0.4 1

438790 rows × 16 columns

transactions intersect bookmarks 240k times

prepare label

Некоторые фильмы пользователь мог посмотреть несколько раз.
Неизвестно, сколько серий в каждом из сериалов и частей в многосерийном фильме.

In [49]:
# Force a float64 column so the clipping and quantile steps that follow
# operate on numeric data (the ratio was computed against a catalogue column
# that is presumably object-typed after the transpose — confirm).
transactions["percent_watched"] = transactions["percent_watched"].astype(np.float64)
In [59]:
transactions
Out[59]:
device_type device_manufacturer feature_1 feature_2 feature_3 feature_4 feature_5 type purchase rent subscription label rating ts
element_uid user_uid
3336 5177 0.0 50.0 4.16611e+07 0.739609 45 1.14193 0.654707 movie 1 1 1 1 3.756140 4.430518e+07
481 593316 0.0 11.0 4.29342e+07 0.750161 11 1.11941 0.592716 movie 1 0 1 1 4.528788 4.430518e+07
4128 262355 0.0 50.0 2.77773e+07 0.750161 12 1.13008 0.654707 movie 1 1 1 0 0.661111 4.430518e+07
6272 74296 0.0 99.0 4.04156e+07 0.675218 34 1.14027 0.68041 movie 1 1 1 0 2.007937 4.430518e+07
5543 340623 0.0 50.0 9.21296e+06 0.783234 14 1.11388 0 movie 1 1 0 1 15.000000 4.430518e+07
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1539 170707 NaN NaN 4.13672e+07 0.698501 25 1.14193 0.592716 movie 1 1 1 1 15.000000 4.173086e+07
6127 32659 NaN NaN 4.14658e+07 0.726889 29 1.14193 0.68041 movie 1 1 1 1 15.000000 4.173085e+07
3336 353752 NaN NaN 4.16611e+07 0.739609 45 1.14193 0.654707 movie 1 1 1 1 15.000000 4.173079e+07
7984 492350 NaN NaN 1.51133e+07 0.658805 13 1.12833 0.654707 movie 1 1 1 1 15.000000 4.173078e+07
8919 374752 NaN NaN 4.11851e+07 0.79027 20 1.14027 0.68041 movie 1 1 0 1 15.000000 4.173065e+07

11030018 rows × 14 columns

In [50]:
def _scale_by_title_reference(frame, rows, per_title_reference):
    """Rescale ``percent_watched`` of the selected rows by a per-title reference.

    Each selected row's ``percent_watched`` is divided by the reference value of
    its ``element_uid`` and capped at 1. The update is written through
    ``frame.loc`` so it modifies ``frame`` itself rather than a temporary copy.

    Args:
        frame: DataFrame with ``element_uid`` and ``percent_watched`` columns;
            modified in place.
        rows: Boolean mask (aligned with ``frame``) selecting the rows to rescale.
        per_title_reference: Mapping or Series from ``element_uid`` to the
            reference watch share of that title.
    """
    watched = frame.loc[rows, "percent_watched"].astype(float)
    reference = frame.loc[rows, "element_uid"].map(per_title_reference).astype(float)

    # x / 0 with x > 0 yields inf, which the cap turns into 1.
    scaled = (watched / reference).clip(upper=1)
    # 0 / 0 would yield NaN and later propagate into the rating; a title that
    # was not watched at all counts as 0 instead.
    scaled = scaled.mask((watched == 0) & (reference == 0), 0.0)

    frame.loc[rows, "percent_watched"] = scaled.to_numpy()


# Movies: cap the watched share at 4x the duration.
is_movie = transactions["type"] == "movie"
transactions.loc[is_movie, "percent_watched"] = (
    transactions.loc[is_movie, "percent_watched"].clip(0, 4).to_numpy()
)

# Series: express the watched share relative to the per-title median.
# NOTE: despite its name, `series_q75` holds the 0.5 quantile; the name is
# kept so other cells that reference it keep working.
is_series = transactions["type"] == "series"
series_q75 = transactions[is_series & (transactions["duration"] != 1)] \
    .groupby("element_uid")["percent_watched"].quantile(0.5)
series_q75 = dict(series_q75)
_scale_by_title_reference(transactions, is_series, series_q75)

# Multipart movies: same idea, relative to the per-title 0.2 quantile
# (again, the `_q75` suffix does not match the quantile actually used).
is_multipart = transactions["type"] == "multipart_movie"
multipart_q75 = transactions[is_multipart] \
    .groupby("element_uid")["percent_watched"].quantile(0.2)
_scale_by_title_reference(transactions, is_multipart, multipart_q75)
/opt/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/opt/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:16: RuntimeWarning: divide by zero encountered in double_scalars
  app.launch_new_instance()
/opt/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:16: RuntimeWarning: invalid value encountered in double_scalars
  app.launch_new_instance()

Пользователь потребил контент, если он

  • Посмотрел больше половины фильма
  • Больше трети сериала
  • Купил или взял в аренду
In [51]:
# A transaction counts as "consumed" (label 1) when any of these holds:
#   * the consumption mode is "P" or "R";
#   * the title is not a series and more than half of it was watched;
#   * the title is a series and more than a third of it was watched.
bought_or_rented = transactions["consumption_mode"].isin(["P", "R"])
is_series_row = transactions["type"] == "series"
watched_share = transactions["percent_watched"]

watched_enough = (~is_series_row & (watched_share > 0.5)) | (is_series_row & (watched_share > 1/3))

transactions["label"] = (bought_or_rented | watched_enough).astype(int)
print(transactions["label"].mean())
0.675946270729519

Объединим все интеракции

In [52]:
# Stack transactions, bookmarks and ratings into one interaction table.
# `DataFrame.append` is deprecated since pandas 1.4 and removed in 2.0;
# `pd.concat` gives the same result (original row labels are kept, columns are
# the union in first-seen order) and copies the large frame only once.
transactions = pd.concat([transactions, bookmarks, ratings])

Составим агрегированный рейтинг

In [53]:
# Aggregate rating = watched share scaled by a per-type factor; rows whose
# type is none of the three below get 0.
# The whole column is computed and assigned in one step instead of through
# chained `transactions.rating.loc[...] = ...`, which writes via an
# intermediate Series (SettingWithCopyWarning) and is not guaranteed to reach
# the frame.
rating_scale_by_type = {"movie": 5, "multipart_movie": 5, "series": 10}
rating_scale = transactions["type"].map(rating_scale_by_type)
scaled_rating = transactions["percent_watched"] * rating_scale
transactions["rating"] = scaled_rating.where(rating_scale.notna(), 0)

# Rows with consumption mode "P" or "R" get the fixed top score.
# NOTE(review): rows that came from `ratings` were tagged with
# consumption_mode "R" as well, so they are also overwritten with 15 here and
# their (rating + 2) / 10 value is discarded — confirm this is intended.
transactions.loc[transactions["consumption_mode"].isin(["P", "R"]), "rating"] = 15
/opt/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)

Выкинем из выборки неактивных пользователей

In [54]:
# Number of rows in `transactions` per user.
user_cnt = transactions["user_uid"].value_counts()

# Users with at least 3 rows are kept; everyone else is dropped.
is_active_user = user_cnt >= 3
user_good = user_cnt[is_active_user].index.tolist()
rows_of_active_users = transactions["user_uid"].isin(user_good)
transactions = transactions.loc[rows_of_active_users]

# Cell output: (users before filtering, users kept).
len(user_cnt), len(user_good)
Out[54]:
(500000, 500000)
In [55]:
# Index the table by (element_uid, user_uid) and keep only the columns that
# go into the exported dataset.
output_columns = [
    "device_type", "device_manufacturer",
    "feature_1", "feature_2", "feature_3", "feature_4", "feature_5",
    "type", "purchase", "rent", "subscription",
    "label", "rating", "ts",
]
transactions = transactions.set_index(["element_uid", "user_uid"])
transactions = transactions[output_columns]
In [60]:
transactions
Out[60]:
device_type device_manufacturer feature_1 feature_2 feature_3 feature_4 feature_5 type purchase rent subscription label rating ts
element_uid user_uid
3336 5177 0.0 50.0 4.16611e+07 0.739609 45 1.14193 0.654707 movie 1 1 1 1 3.756140 4.430518e+07
481 593316 0.0 11.0 4.29342e+07 0.750161 11 1.11941 0.592716 movie 1 0 1 1 4.528788 4.430518e+07
4128 262355 0.0 50.0 2.77773e+07 0.750161 12 1.13008 0.654707 movie 1 1 1 0 0.661111 4.430518e+07
6272 74296 0.0 99.0 4.04156e+07 0.675218 34 1.14027 0.68041 movie 1 1 1 0 2.007937 4.430518e+07
5543 340623 0.0 50.0 9.21296e+06 0.783234 14 1.11388 0 movie 1 1 0 1 15.000000 4.430518e+07
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1539 170707 NaN NaN 4.13672e+07 0.698501 25 1.14193 0.592716 movie 1 1 1 1 15.000000 4.173086e+07
6127 32659 NaN NaN 4.14658e+07 0.726889 29 1.14193 0.68041 movie 1 1 1 1 15.000000 4.173085e+07
3336 353752 NaN NaN 4.16611e+07 0.739609 45 1.14193 0.654707 movie 1 1 1 1 15.000000 4.173079e+07
7984 492350 NaN NaN 1.51133e+07 0.658805 13 1.12833 0.654707 movie 1 1 1 1 15.000000 4.173078e+07
8919 374752 NaN NaN 4.11851e+07 0.79027 20 1.14027 0.68041 movie 1 1 0 1 15.000000 4.173065e+07

11030018 rows × 14 columns

In [56]:
# Persist the combined interaction table next to the input data.
transactions_pickle_path = os.path.join(DATA_PATH, "transactions.pkl")
transactions.to_pickle(transactions_pickle_path)

Сохраним закладки для оставшихся в выборке пользователей

In [57]:
# Persist the bookmarks of the users that survived the activity filter,
# keeping only the three raw columns.
bookmark_columns = ["user_uid", "element_uid", "ts"]
kept_bookmarks = bookmarks.loc[bookmarks["user_uid"].isin(user_good), bookmark_columns]
kept_bookmarks.to_pickle(os.path.join(DATA_PATH, "bookmarks.pkl"))
In [ ]: