forked from manasRK/word2vec-recommender
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrecommender_context.py
99 lines (80 loc) · 3.74 KB
/
recommender_context.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Manas Ranjan Kar
# @Date: 2016-06-21
# @Email: [email protected]
# @Github username: @manasRK
# @Last Modified by: Manas Ranjan Kar
# @Last Modified time: 2016-06-26
import os
import sys
import ast
import gensim
import json
from gensim import utils
from gensim.models import Word2Vec
import logging
import argparse
# Module-level logger named after this module; the script entry point at the
# bottom of the file writes its progress messages through it.
logger = logging.getLogger(__name__)
import redis
# Shared Redis client for localhost:6379, database 2. train() wraps this
# object in a ContextCorpus, so this database is the training-data source.
data_obj = redis.Redis("localhost", port=6379, db=2)
class ContextCorpus(object):
    """Restartable iterable of word2vec "sentences" read from a Redis-like store.

    Every key in the store is treated as one context (e.g. one client), and
    the value stored under it must be the text of a Python literal collection
    (list, set, tuple, ...) holding the items observed in that context, such
    as ``"[0, 1, 2, 3]"``.

    Parameters
    ----------
    redis_obj : object
        Any object exposing ``keys(pattern)`` and ``get(key)`` the way a
        ``redis.Redis`` client does.
    context_prefix : str, optional
        This string will be prepended to all context keys.
    example_prefix : str, optional
        This string will be prepended to all non-context words.

    Examples
    --------
    # If the store maps "1" -> "[0, 1, 2, 3]" and "2" -> "[3, 4, 5]",
    # iterating yields (in store order):
    #   ['C1', 'I0', 'I1', 'I2', 'I3']
    #   ['C2', 'I3', 'I4', 'I5']
    cc = ContextCorpus(redis_obj)
    """

    def __init__(self, redis_obj, context_prefix='C', example_prefix='I'):
        self.redis_obj = redis_obj
        self.context_prefix = context_prefix
        self.example_prefix = example_prefix

    @staticmethod
    def _to_text(value):
        """Return *value* as text, decoding raw ``bytes`` as UTF-8.

        A client that hands back bytes would otherwise turn ``b'7'`` into the
        token ``"b'7'"`` via ``str()`` and make ``ast.literal_eval`` reject
        the value outright.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return str(value)

    def __iter__(self):
        """Create 'sentences' that start with the context as a word
        and then also have items as words on the same line.

        A fresh pass over the store is made on every call, so the corpus can
        be iterated more than once.
        """
        for key in self.redis_obj.keys("*"):
            raw = self.redis_obj.get(key)
            if raw is None:
                # The key disappeared between keys() and get(); there is
                # nothing to learn from it, so skip instead of crashing.
                logging.getLogger(__name__).warning(
                    "no value stored for key %r, skipping", key)
                continue
            # literal_eval only evaluates Python literals, never arbitrary code.
            items = ast.literal_eval(self._to_text(raw))
            line = [self.context_prefix + self._to_text(key)]
            line.extend(self.example_prefix + str(item) for item in items)
            yield line
def train(model_file):
    """Fit a Word2Vec model on the module-level Redis corpus and save it.

    Parameters
    ----------
    model_file : str
        Path handed to ``model.save`` for the trained model.
    """
    # Previously tried settings (all with min_count=5, workers=4, size=300,
    # window=5):
    #   a1: negative=10, sg=1, sample=1e-3, hs=1
    #   a2: negative=3,  sg=0, sample=1e-5, hs=0
    #   a3: negative=5,  sg=0, sample=1e-3, hs=1
    #   a4: negative=10, sg=1, sample=1e-3, hs=0
    #   a5: negative=10, sg=1, sample=1e-5, hs=0
    # Reference invocation of the original word2vec tool:
    #   ./word2vec -train train100B.txt -read-vocab voc -output vectors.bin
    #       -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5
    #       -threads 12 -binary 1 -min-count 10
    hyperparams = dict(
        min_count=5,
        workers=4,
        negative=3,
        sg=0,
        size=300,
        sample=1e-4,
        hs=1,
        window=5,
    )  # a6 -- the configuration currently in use

    corpus = ContextCorpus(data_obj)
    model = gensim.models.Word2Vec(corpus, **hyperparams)

    # NOTE(review): with replace=True gensim is expected to keep only the
    # normalized vectors, so the saved model cannot be trained further --
    # confirm against the installed gensim version.
    model.init_sims(replace=True)
    model.save(model_file)
if __name__ == "__main__":
    # Script entry point: parse -m/--model_output, train on the module-level
    # Redis data and save the model to the requested path.

    # Install a root handler; without one the INFO-level progress messages
    # logged below have nowhere to go. basicConfig is a no-op when a handler
    # is already configured.
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model_output", required=True,
        help="Output model file, in word2vec format")

    # check and process cmdline input
    if len(sys.argv) < 2:
        # This module has no docstring, so the former
        # `globals()['__doc__'] % locals()` evaluated `None % dict` and raised
        # TypeError. Show the argparse usage text and keep exit status 1.
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    model_file = args.model_output

    logger.info('Picking data from %s...', data_obj)
    train(model_file)
    logger.info("Saving model file %s in %s", model_file, os.path.abspath(model_file))
    logger.info("Finished running %s", program)