Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Movielens20m dataset #1336

Merged
merged 49 commits into from
Dec 27, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
eabaebe
wip movielnes parsing
zkid18 Oct 26, 2021
3908dd2
stuck with format
zkid18 Oct 27, 2021
aee834b
train-test splitter
zkid18 Nov 7, 2021
0e759d0
add comments
zkid18 Nov 8, 2021
02318a7
fix MovieLens100k error
zkid18 Nov 8, 2021
8b42c24
remove tmp_data
zkid18 Nov 8, 2021
ea10abb
add tests
zkid18 Nov 20, 2021
0b2e5ac
add tests; add user/item filtering algorithm
zkid18 Nov 21, 2021
86a1bff
codestyle wip
zkid18 Nov 21, 2021
7aa388c
codestyle fix
zkid18 Nov 21, 2021
eb2e55c
movielines codestyle
zkid18 Nov 23, 2021
37564c2
movielnes codestyle
zkid18 Nov 23, 2021
dadc98c
movielnes codestyle
zkid18 Nov 23, 2021
109302c
merge with master
zkid18 Nov 23, 2021
fd9359e
fixed tests
zkid18 Nov 25, 2021
fa5331f
codestyle minors
zkid18 Nov 25, 2021
e1da558
codestyle minors
zkid18 Nov 25, 2021
04ace76
minor fixes
zkid18 Nov 25, 2021
1d9151e
codestyle minor fix
zkid18 Nov 26, 2021
b4643bb
fix movielens tests
zkid18 Nov 29, 2021
3652af8
minor codestyle fix
zkid18 Nov 29, 2021
b841131
upgarde pandas version in the requirements
zkid18 Nov 29, 2021
e1f57db
upgarde pandas version in the requirements
zkid18 Nov 29, 2021
e40494c
change pandas to 1.1.1
zkid18 Nov 29, 2021
b5e3c6c
change scipy requerements
zkid18 Dec 16, 2021
bf5d067
update changelog
zkid18 Dec 16, 2021
31d5432
changed data
zkid18 Dec 16, 2021
13c3566
change the serrialization for pytorch
zkid18 Dec 16, 2021
34b59f1
change scipy to 1.4.1
zkid18 Dec 16, 2021
cddb12d
changed pytorch version
zkid18 Dec 16, 2021
81c2e17
removed serrialization param
zkid18 Dec 16, 2021
c9d482c
torchvision 0.5.0
zkid18 Dec 16, 2021
69a3b18
update torch 1.6.0
zkid18 Dec 16, 2021
b2fe8f8
update torch 1.7.0
zkid18 Dec 16, 2021
3e415c0
updated torchvision 0.8.0
zkid18 Dec 16, 2021
3516bbe
add version validation
zkid18 Dec 20, 2021
fd49c75
version check
zkid18 Dec 20, 2021
72dbf32
check requerements
zkid18 Dec 20, 2021
13e3bf8
cganged torchvisionm version
zkid18 Dec 21, 2021
a4f0b07
Changelog
zkid18 Dec 21, 2021
4bae487
pandas server
zkid18 Dec 21, 2021
7b10d4d
changed init
zkid18 Dec 23, 2021
0f8d8de
changed init
zkid18 Dec 23, 2021
195b12e
change import logic
zkid18 Dec 23, 2021
d5c984e
codestyle
zkid18 Dec 24, 2021
4c2de99
removed parse
zkid18 Dec 24, 2021
c5c2fe2
removed parse
zkid18 Dec 24, 2021
bb65567
Update requirements-ml.txt
Scitator Dec 27, 2021
15d5c20
Update test_movielens_20m.py
Scitator Dec 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion catalyst/contrib/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from catalyst.contrib.datasets.cifar import CIFAR10, CIFAR100

if SETTINGS.ml_required:
from catalyst.contrib.datasets.movielens import MovieLens
from catalyst.contrib.datasets.movielens import MovieLens, MovieLens20M

if SETTINGS.cv_required:
from catalyst.contrib.datasets.cv import *
332 changes: 327 additions & 5 deletions catalyst/contrib/datasets/movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import numpy as np
import scipy.sparse as sp

import pandas as pd


import torch
from torch.utils.data import Dataset

def _read_raw_movielens_data(self):
    """Return the raw lines of the train and test files.

    Reads the pre-split rating files (``ua.base`` / ``ua.test``) plus the
    item and genre metadata from the extracted archive under
    ``self.raw_folder`` and returns each file as a list of lines.

    Returns:
        tuple: ``(ua_base, ua_test, u_item, u_genre)`` — each a list of
        raw text lines (the trailing element may be an empty string
        because the files end with a newline).
    """
    path = self.raw_folder

    # NOTE(review): this method belongs to the original MovieLens (100k)
    # class, but this change repoints it at the ml-20m archive, whose
    # layout is csv-based and has no ua.base/ua.test/u.item/u.genre
    # files — this likely breaks MovieLens100k. TODO confirm intent;
    # probably these paths should stay "/ml-100k/...".
    with open(path + "/ml-20m/ua.base") as datafile:
        ua_base = datafile.read().split("\n")

    with open(path + "/ml-20m/ua.test") as datafile:
        ua_test = datafile.read().split("\n")

    # u.item contains Latin-1 encoded movie titles, hence the explicit
    # encoding.
    with open(path + "/ml-20m/u.item", encoding="ISO-8859-1") as datafile:
        u_item = datafile.read().split("\n")

    # BUG FIX: original path was "/ml-20,/u.genre" (comma typo).
    with open(path + "/ml-20m/u.genre") as datafile:
        u_genre = datafile.read().split("\n")

    return (ua_base, ua_test, u_item, u_genre)
Expand Down Expand Up @@ -259,4 +262,323 @@ def _fetch_movies(self):
torch.save(test, f)


__all__ = ["MovieLens"]
class MovieLens20M(Dataset):
    """MovieLens 20M dataset (ml-20m), collected by the GroupLens Research
    Project at the University of Minnesota.

    The data set consists of:
    * 20,000,263 ratings (1-5) and 465,564 tag applications
      from 138,493 users on 27,278 movies.
    * Each user has rated at least 20 movies.

    Users were selected at random for inclusion. No demographic information
    is included; each user is represented only by an id.

    The data was collected through the MovieLens web site
    (movielens.umn.edu) between January 09, 1995 and March 31, 2015, and is
    contained in six csv files:

    1. genome-scores.csv — movieId, tagId, relevance
    2. genome-tags.csv   — tagId, tag
    3. links.csv         — movieId, imdbId, tmdbId
    4. movies.csv        — movieId, title, genres
    5. ratings.csv       — userId, movieId, rating, timestamp
    6. tags.csv          — userId, movieId, tag, timestamp

    The data set may be used for research purposes only, under the
    conditions (attribution, no redistribution, no commercial use, no
    warranty) described in the official README:
    https://files.grouplens.org/datasets/movielens/ml-20m-README.html

    Questions/comments: GroupLens <grouplens-info@umn.edu>.
    """

    resources = (
        "https://files.grouplens.org/datasets/movielens/ml-20m.zip",
        "b2116463d890a6a9f1d9b66d91558ed2",
    )
    filename = "ml-20m.zip"
    # BUG FIX: these were referenced (``self.training_file``/``self.test_file``)
    # but never defined anywhere in the original class.
    # NOTE(review): names mirror the MovieLens (100k) class layout — confirm.
    training_file = "training.pt"
    test_file = "test.pt"

    def __init__(
        self,
        root,
        train=True,
        download=False,
        min_rating=0.0,
        min_items_per_user=5.0,
        min_users_per_item=0.0,
        n_rows=1000,
    ):
        """
        Args:
            root (string): Root directory of dataset where
                ``MovieLens20M/processed/training.pt``
                and ``MovieLens20M/processed/test.pt`` exist.
            train (bool, optional): If True, creates dataset from
                ``training.pt``, otherwise from ``test.pt``.
            download (bool, optional): If true, downloads the dataset from
                the internet and puts it in root directory. If dataset
                is already downloaded, it is not downloaded again.
            min_rating (float, optional): Minimum rating to include an
                interaction in the interaction matrix.
            min_items_per_user (float, optional): Drop users who rated
                fewer items than this threshold (0 disables the filter).
            min_users_per_item (float, optional): Drop items rated by
                fewer users than this threshold (0 disables the filter).
            n_rows (int, optional): Number of rows to read from each raw
                csv file (bounds memory for the 20M-rating files).

        Raises:
            RuntimeError: If the processed files are absent and
                ``download`` is False.
        """
        # BUG FIX: ``torch._six.string_classes`` was removed from modern
        # PyTorch; on Python 3 it was just ``str``.
        if isinstance(root, str):
            root = os.path.expanduser(root)

        self.root = root
        self.train = train
        self.min_rating = min_rating
        # BUG FIX: original read these from an undefined ``args`` object.
        self.min_items_per_user = min_items_per_user
        self.min_users_per_item = min_users_per_item
        self.n_rows = n_rows

        if download:
            self._download()
            # Build the processed .pt files right after extraction;
            # the original left this call commented out, so
            # ``_check_exists`` below could never succeed.
            self._fetch_movies()

        if not self._check_exists():
            raise RuntimeError("Dataset not found. You can use download=True to download it")

        data_file = self.training_file if self.train else self.test_file
        # BUG FIX: the load was commented out, leaving ``self.data``
        # (used by __getitem__/__len__) undefined.
        self.data = torch.load(os.path.join(self.processed_folder, data_file))

    def __getitem__(self, user_index):
        """Get item.

        Args:
            user_index (int): User index [0, 138493]

        Returns:
            tensor: (items) item's ranking for the user with index user_index
        """
        return self.data[user_index]

    def __len__(self):
        """The length of the loader (number of users in the matrix)."""
        # BUG FIX: ``self.dimensions`` was never defined; the row count of
        # the interaction matrix is the number of users.
        return self.data.size(0)

    @property
    def raw_folder(self):
        """Create raw folder for data download"""
        return os.path.join(self.root, self.__class__.__name__, "raw")

    @property
    def processed_folder(self):
        """Create the folder for the processed files"""
        return os.path.join(self.root, self.__class__.__name__, "processed")

    def _check_exists(self):
        """Check if the paths for training and testing data exist in the processed folder."""
        return os.path.exists(
            os.path.join(self.processed_folder, self.training_file)
        ) and os.path.exists(os.path.join(self.processed_folder, self.test_file))

    def _download(self):
        """Download and extract the archive files."""
        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)
        os.makedirs(self.processed_folder, exist_ok=True)
        url, md5 = self.resources

        download_and_extract_archive(
            url=url,
            download_root=self.raw_folder,
            filename=self.filename,
            md5=md5,
            remove_finished=True,
        )

    def _read_raw_movielens_data(self):
        """Return the raw dataframes of the six ml-20m csv files.

        Returns:
            tuple: ``(movies, ratings, genome_scores, genome_tags, tags)``
            pandas dataframes, each truncated to ``self.n_rows`` rows.
        """
        path = self.raw_folder
        # BUG FIXES vs original: use os.path.join (string concatenation
        # dropped the path separator), ``self.n_rows`` (``self.nrows`` was
        # undefined) and balanced parentheses (four lines had a trailing
        # extra ``)``).
        movies = pd.read_csv(os.path.join(path, "ml-20m", "movies.csv"), nrows=self.n_rows)
        ratings = pd.read_csv(os.path.join(path, "ml-20m", "ratings.csv"), nrows=self.n_rows)
        genome_scores = pd.read_csv(
            os.path.join(path, "ml-20m", "genome-scores.csv"), nrows=self.n_rows
        )
        genome_tags = pd.read_csv(
            os.path.join(path, "ml-20m", "genome-tags.csv"), nrows=self.n_rows
        )
        tags = pd.read_csv(os.path.join(path, "ml-20m", "tags.csv"), nrows=self.n_rows)

        return (movies, ratings, genome_scores, genome_tags, tags)

    def _build_interaction_matrix(self, rows, cols, data):
        """Builds the sparse user-to-item interaction matrix.

        Args:
            rows (int): number of users (rows of the matrix)
            cols (int): number of items (columns of the matrix)
            data (iterable): iterable of zero-based
                ``(user_index, item_index, rating)`` triples

        Returns:
            torch.Tensor: sparse COO ``rows x cols`` user-to-item
            interaction matrix with ratings as values
        """
        # BUG FIX: the original body referenced undefined names
        # (``user_id_map``, ``dataframe``, ``coo_matrix``) and had dead
        # code after its return statement.
        matrix = sp.lil_matrix((rows, cols), dtype=np.float32)
        for user_index, item_index, rating in data:
            matrix[user_index, item_index] = rating

        coo = matrix.tocoo()
        indices = torch.LongTensor(np.vstack((coo.row, coo.col)))
        values = torch.FloatTensor(coo.data)
        return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

    def _get_dimensions(self, rating, user_count=None, item_count=None):
        """Count interactions per user or per item.

        Args:
            rating (pd.DataFrame): ratings dataframe with
                ``userId``/``movieId`` columns
            user_count (bool, optional): if True, count users per movie
                (grouped by ``movieId``)
            item_count (bool, optional): if True, count items per user
                (grouped by ``userId``)

        Returns:
            pd.Series: group sizes indexed by the grouping id

        Raises:
            ValueError: unless exactly one of ``user_count``/``item_count``
                is truthy
        """
        # BUG FIXES vs original: added the missing ``self`` parameter,
        # removed the unbalanced ``)`` and the contradictory/unreachable
        # validation branches.
        if user_count and item_count:
            raise ValueError("Only one of user_count/item_count may be set")
        if not user_count and not item_count:
            raise ValueError("One of user_count/item_count must be set")

        id_col = "movieId" if user_count else "userId"
        return rating.groupby(id_col).size()

    def _parse(self, data, name="ratings"):
        """Parses and pre-processes the raw ratings data.

        Filter order: rating threshold first, then users-per-item,
        then items-per-user.

        Args:
            data (pd.DataFrame): raw ratings dataframe
            name (str, optional): which raw table to parse; only
                ``"ratings"`` is supported

        Returns:
            tuple: ``(ratings, num_users, num_items)`` — the filtered
            dataframe, the per-user interaction counts and the per-item
            interaction counts

        Raises:
            ValueError: for any ``name`` other than ``"ratings"``
        """
        # BUG FIX: original raised nothing (bare ``ValueError(...)``
        # expression) and referenced ``ratings``/``itemcount`` before
        # assignment.
        if name != "ratings":
            raise ValueError("Only ratings parsing is available")

        ratings = data[data["rating"] > self.min_rating].sort_values(["userId", "timestamp"])

        if self.min_users_per_item:
            user_cnt = self._get_dimensions(ratings, user_count=True)
            keep_items = user_cnt.index[user_cnt >= self.min_users_per_item]
            ratings = ratings[ratings["movieId"].isin(keep_items)]

        if self.min_items_per_user:
            item_cnt = self._get_dimensions(ratings, item_count=True)
            keep_users = item_cnt.index[item_cnt >= self.min_items_per_user]
            ratings = ratings[ratings["userId"].isin(keep_users)]

        num_users = self._get_dimensions(ratings, item_count=True)
        num_items = self._get_dimensions(ratings, user_count=True)
        return ratings, num_users, num_items

    def _split(self, ratings, num_users, num_items, by="users", test_prop=0.2):
        """Split the ratings into train and test events by user.

        Args:
            ratings (pd.DataFrame): filtered ratings dataframe
            num_users (pd.Series): per-user interaction counts
            num_items (pd.Series): per-item interaction counts (unused for
                the user split, kept for interface symmetry)
            by (str, optional): split strategy; only ``"users"`` supported
            test_prop (float, optional): fraction of users held out

        Returns:
            tuple: ``(train_events, test_events)`` dataframes

        Raises:
            ValueError: for any ``by`` other than ``"users"``
        """
        # BUG FIXES vs original: the signature placed non-default
        # parameters after defaults (SyntaxError), used ``=`` instead of
        # ``==`` in the condition, ignored ``test_prop`` (the test set was
        # always empty) and returned nothing.
        if by != "users":
            raise ValueError("Only splitting by user is available")

        unique_uid = num_users.index
        idx_perm = np.random.permutation(unique_uid.size)
        unique_uid = unique_uid[idx_perm]

        n_test_users = int(test_prop * unique_uid.size)
        n_train_users = unique_uid.size - n_test_users

        train_users = unique_uid[:n_train_users]
        test_users = unique_uid[n_train_users:]

        train_events = ratings.loc[ratings["userId"].isin(train_users)]
        test_events = ratings.loc[ratings["userId"].isin(test_users)]
        return train_events, test_events

    def _fetch_movies(self):
        """
        Fetch data and save in the pytorch format
        1. Read the raw csv tables from the extracted archive
        2. Parse/filter the ratings
        3. Split ratings into train/test events by user
        4. Save both interaction matrices as .pt with torch.save
        """
        (movies, ratings, genome_scores, genome_tags, tags) = self._read_raw_movielens_data()

        ratings, num_users, num_items = self._parse(ratings)
        train_raw, test_raw = self._split(
            ratings, num_users, num_items, by="users", test_prop=0.2
        )

        # Shared zero-based id maps so that train and test matrices have
        # identical shapes (the original asserted this but built them from
        # incompatible inputs).
        user_id_map = {user: idx for idx, user in enumerate(ratings["userId"].unique())}
        item_id_map = {item: idx for idx, item in enumerate(ratings["movieId"].unique())}

        def _triples(events):
            # Yield (user_index, item_index, rating) for the matrix builder.
            for user, item, rating in zip(
                events["userId"], events["movieId"], events["rating"]
            ):
                yield user_id_map[user], item_id_map[item], rating

        rows, cols = len(user_id_map), len(item_id_map)
        train = self._build_interaction_matrix(rows, cols, _triples(train_raw))
        test = self._build_interaction_matrix(rows, cols, _triples(test_raw))
        assert train.shape == test.shape

        with open(os.path.join(self.processed_folder, self.training_file), "wb") as f:
            torch.save(train, f)

        with open(os.path.join(self.processed_folder, self.test_file), "wb") as f:
            torch.save(test, f)


# Public API of this module: the 100k and 20M MovieLens dataset loaders.
__all__ = ["MovieLens", "MovieLens20M"]
Loading