-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathals.py
148 lines (118 loc) · 4.54 KB
/
als.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""Perform ALS collaborative filtering on the cars/dogs dataset."""
from collections import defaultdict
import numpy
import scipy
from implicit.als import AlternatingLeastSquares
from carroperro import util
from carroperro.cars import Cars
from carroperro.dogs import Dogs
from carroperro import counting
class ALS(object):
    """Build an ALS model from car/dog co-occurrence data.

    Attributes:
        cars (Cars): car names and hashtags
        dogs (Dogs): dog names and hashtags
        counts: dict of dict of co-occurrence counts, keyed first by car
            and then by dog
        car_lookup (dict): mapping from ids to car names and vice versa
        dog_lookup (dict): mapping from ids to dog names and vice versa
        matrix: scipy.sparse.coo_matrix of the counts; rows are indexed by
            car id and columns by dog id
        model (AlternatingLeastSquares): the model fitted on ``matrix``
    """

    def __init__(self, cooc_path, factors=5):
        """Load the counts and build the lookups, matrix and model.

        Args:
            cooc_path (str): the path for the CSV co-occurrence count file
            factors (int): the number of latent factors for the ALS model
        """
        self.cars = Cars()
        self.dogs = Dogs()
        self.car_lookup = None
        self.dog_lookup = None
        self.matrix = None
        self.model = None
        self.counts = counting.read_count_csv(cooc_path)
        # Order matters: the matrix needs the lookups, the model needs the
        # matrix.
        self.lookups_from_counts()
        self.matrix_from_counts()
        self.model_from_matrix(factors)

    def recommend_cars(self, dog_ids, k=5):
        """Recommend cars for a user whose profile is a list of liked dogs.

        The model's recommended cars for each dog are collected, the scores
        of cars recommended for several dogs are summed, and every total is
        divided by the number of dogs in the profile. The k highest scoring
        cars are returned.

        Args:
            dog_ids (list): of int, the dogs the user likes
            k (int): the maximum number of cars to recommend

        Returns:
            list: of dict, each ``{'car': int, 'score': float}``, ordered
                by descending score
        """
        # The CSR conversion does not depend on the dog, so do it once
        # rather than once per loop iteration.
        # NOTE(review): the matrix is built with cars as rows and dogs as
        # columns; confirm this is the orientation the installed implicit
        # version expects for the second argument of ``recommend``.
        user_items = self.matrix.tocsr()
        scored_cars = defaultdict(float)
        for dog_id in dog_ids:
            # NOTE(review): ``recommend`` is called with its default result
            # count; confirm that it yields enough candidates for large k.
            for car_id, score in self.model.recommend(dog_id, user_items):
                scored_cars[car_id] += score
        sorted_cars = sorted(
            scored_cars.items(), key=lambda x: x[1],
            reverse=True,
        )
        normalized_cars = [
            {
                'car': int(car_id),
                'score': float(score) / len(dog_ids),
            }
            for car_id, score in sorted_cars
        ]
        return normalized_cars[:k]

    def name_from_hashtag(self, hashtag):
        """Find the full canonical name for a car or a dog from its hashtag.

        Cars are checked before dogs, so this is not safe if car and dog
        hashtags can collide (e.g. if there is a car model that is the same
        string as a dog breed), but this is not the case in the data used.

        Args:
            hashtag: string, the hashtag (without #)

        Returns:
            tuple: ``(kind, name)`` where kind is either 'car' or 'dog',
                depending on which type was found, and name is the name
                recovered

        Raises:
            ValueError: if there is neither a dog nor a car matching that
                hashtag
        """
        car_name = self.cars.name_from_hashtag(hashtag)
        if car_name:
            return 'car', car_name
        dog_name = self.dogs.name_from_hashtag(hashtag)
        if dog_name:
            return 'dog', dog_name
        raise ValueError('No name found for hashtag: ' + hashtag)

    def lookups_from_counts(self):
        """Build bi-directional lookup tables from car/dog names -> ids and
        ids -> car/dog names, using all cars and dogs in the counts.
        """
        # The outer keys of the counts are the cars.
        self.car_lookup = util.two_way_index(sorted(self.counts))
        # A dog can co-occur with many cars, so deduplicate before indexing.
        dogs = {
            dog
            for dog_counts in self.counts.values()
            for dog in dog_counts
        }
        self.dog_lookup = util.two_way_index(sorted(dogs))

    def matrix_from_counts(self):
        """Build a scipy.sparse.coo_matrix from the counts.

        Rows are indexed by car id and columns by dog id, as given by the
        lookup tables. The matrix shape is inferred by scipy from the
        largest row and column index present.
        """
        # Import the submodule explicitly: a bare ``import scipy`` is not
        # guaranteed to expose ``scipy.sparse`` on older SciPy releases.
        from scipy.sparse import coo_matrix

        cars = []
        dogs = []
        counts = []
        for car, dog_counts in self.counts.items():
            for dog, count in dog_counts.items():
                cars.append(self.car_lookup[car])
                dogs.append(self.dog_lookup[dog])
                counts.append(count)
        row = numpy.array(cars)
        col = numpy.array(dogs)
        data = numpy.array(counts)
        self.matrix = coo_matrix((data, (row, col)))

    def model_from_matrix(self, factors):
        """Fit an ALS model on the matrix.

        Args:
            factors (int): the number of latent factors to use
        """
        self.model = AlternatingLeastSquares(
            factors=factors, dtype=numpy.float32,
            use_gpu=False,
        )
        self.model.fit(self.matrix)