Mounting Google Drive:
from google.colab import drive
drive.mount('/content/gdrive')
Importing the libraries:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics.pairwise import pairwise_distances
Loading the data:
movies = pd.read_csv('/content/gdrive/MyDrive/recomend/movies.csv')
ratings = pd.read_csv('/content/gdrive/MyDrive/recomend/ratings.csv')
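These are the standard MovieLens files: ratings.csv holds one (userId, movieId, rating, timestamp) row per rating, and movies.csv maps each movieId to its title and genres. A quick sanity check of the loaded frames:
print(ratings.head())
print(movies.head())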
Counting the total number of users and items:
n_users = len(ratings['userId'].unique())
n_movies = len(ratings['movieId'].unique())
Remapping movieId to a contiguous range (from 1 to n_movies):
movie_ids = ratings['movieId'].unique()
# Map each raw movieId to its 1-based position among the unique ids
movie_id_map = {movie_id: index + 1 for index, movie_id in enumerate(movie_ids)}
ratings['movieId'] = ratings['movieId'].map(movie_id_map)
P.S. At this point the rating values could also be normalized.
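For instance, a min-max scaling to [0, 1] could look like the sketch below; it writes to a separate rating_scaled column and is not used in the steps that follow:
# Optional min-max normalization of the ratings (not applied further in this walkthrough)
r_min, r_max = ratings['rating'].min(), ratings['rating'].max()
ratings['rating_scaled'] = (ratings['rating'] - r_min) / (r_max - r_min)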
Splitting the data into train and test sets:
train_data, test_data = train_test_split(ratings, test_size=0.2)
Defining the metric:
def rmse(prediction, ground_truth):
    # Keep only the predicted ratings at positions the user actually rated
    prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten()
    # Keep only the ratings the user actually gave
    ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten()
    mse = mean_squared_error(prediction, ground_truth)
    return sqrt(mse)
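Only positions where the ground truth is non-zero (i.e. the user actually rated the movie) contribute to the error. A tiny check with made-up values:
# Only the first position is non-zero in the ground truth, so only 4.2 vs 4.0 is compared
rmse(np.array([[4.2, 3.0]]), np.array([[4.0, 0.0]]))  # ≈ 0.2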
Converting the data into matrices:
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    train_data_matrix[line.userId - 1, line.movieId - 1] = line.rating
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_matrix[line.userId - 1, line.movieId - 1] = line.rating
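As an aside, the same matrices can be built without an explicit loop; a sketch using a pivot table, assuming userId values are contiguous from 1 (true for MovieLens) and movieId has been remapped as above:
# Loop-free equivalent: pivot, then pad users/movies missing from the split with zeros
train_data_matrix_alt = (train_data
                         .pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
                         .reindex(index=range(1, n_users + 1), columns=range(1, n_movies + 1), fill_value=0)
                         .to_numpy())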
Computing cosine similarities:
# pairwise_distances returns cosine distances; the predictors below use these
# values as weights, so convert them with similarity = 1 - distance
user_similarity = 1 - pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')
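The same matrices can be obtained in one step with sklearn's cosine_similarity, which is equivalent to 1 - pairwise_distances(..., metric='cosine'):
from sklearn.metrics.pairwise import cosine_similarity
# Equivalent to the two lines above
user_similarity = cosine_similarity(train_data_matrix)
item_similarity = cosine_similarity(train_data_matrix.T)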
Prediction:
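Both predictors implement the mean-centered k-nearest-neighbours formula. For user u and movie m:

pred(u, m) = mean(u) + sum_v[ sim(u, v) * (r(v, m) - mean_N) ] / sum_v |sim(u, v)|

where the sums run over the k most similar users v, mean(u) is the average of the ratings u actually gave, and mean_N is the centering term (here, the overall mean of the neighbours' rating rows). The item-based variant applies the same formula to the transposed rating matrix.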
def k_fract_mean_predict(top):
    # For every user, indexes of the `top` most similar users (position 0 is the user itself)
    top_similar = np.zeros((n_users, top), dtype=int)
    for i in range(n_users):
        top_similar[i] = np.argsort(-user_similarity[i])[1:top + 1]
    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_movies))
    for i in range(n_users):
        indexes = top_similar[i]
        # Similarity weights of the selected neighbours
        weights = user_similarity[i][indexes]
        # Mean of the ratings the user actually gave (0 if none fell into the train set)
        rated = train_data_matrix[i][train_data_matrix[i] > 0]
        mean_rating = rated.mean() if rated.size > 0 else 0
        # Deviations of the neighbours' ratings from the neighbour block's overall mean
        diff_ratings = train_data_matrix[indexes] - train_data_matrix[indexes].mean()
        numerator = weights.dot(diff_ratings)
        denominator = abs_sim[i][indexes].sum()
        denominator = denominator if denominator != 0 else 1
        pred[i] = np.round(mean_rating + numerator / denominator, 1)
    return pred
def k_fract_mean_predict_item(top):
    # For every movie, indexes of the `top` most similar movies (position 0 is the movie itself)
    top_similar = np.zeros((n_movies, top), dtype=int)
    for i in range(n_movies):
        top_similar[i] = np.argsort(-item_similarity[i])[1:top + 1]
    abs_sim = np.abs(item_similarity)
    pred = np.zeros((n_movies, n_users))
    for i in range(n_movies):
        indexes = top_similar[i]
        weights = item_similarity[i][indexes]
        diff_ratings = train_data_matrix.T[indexes] - train_data_matrix.T[indexes].mean()
        numerator = weights.dot(diff_ratings)
        denominator = abs_sim[i][indexes].sum()
        denominator = denominator if denominator != 0 else 1
        # Mean of the ratings the movie actually received (0 if none fell into the train set)
        rated = train_data_matrix.T[i][train_data_matrix.T[i] > 0]
        mean_rating = rated.mean() if rated.size > 0 else 0
        pred[i] = np.round(mean_rating + numerator / denominator, 1)
    return pred.T
k_predict = k_fract_mean_predict(7)
print('User-based CF RMSE: ', rmse(k_predict, test_data_matrix))
k_predict_item = k_fract_mean_predict_item(7)
print('Item-based CF RMSE: ', rmse(k_predict_item, test_data_matrix))
Extracting the recommendations and loading them into a Series:
num_rec = 5
d = dict()
for i in range(k_predict.shape[0]):
    # Column indexes of the num_rec movies with the highest predicted rating for user i
    d[i] = list(np.argpartition(k_predict[i], -num_rec)[-num_rec:])
pred_data = pd.Series(d)
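To turn these column indexes back into titles, note that column j of the prediction matrix corresponds to the remapped movieId j + 1, i.e. to the raw id movie_ids[j]. A sketch for the first user (matrix row 0, i.e. userId 1):
# Resolve the recommended column indexes for userId 1 back to movie titles
for j in pred_data[0]:
    raw_id = movie_ids[j]
    title = movies.loc[movies['movieId'] == raw_id, 'title'].iloc[0]
    print(title)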