-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path1_wordcount_mapper.py
40 lines (31 loc) · 950 Bytes
/
1_wordcount_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/python2
# coding=utf-8
"""WORDCOUNT MAPPER
Read all files from standard input
Split and clean all words to map as list of key-values
Ki, Vi => List(Ki, Vi)
key = (word, doc_id)
value = 1
Input: stdin
Output: ((word, doc_id), 1)
"""
import sys
import os
import re
# Newline-separated English stop-word list, resolved relative to the working
# directory the mapper is launched from.
STOPWORDS_PATH = 'utils/stopwords_en.txt'

# Matches every character that is neither a word character nor whitespace;
# compiled once because it is applied to every input line.
PUNCTUATION_RE = re.compile(r"[^\w\s]")


def load_stopwords(path=STOPWORDS_PATH):
    """Return the stop words stored one per line in *path* as a frozenset.

    A set is used because every candidate word is tested for membership.
    The file is closed as soon as it has been read.

    Raises IOError/OSError if *path* cannot be opened.
    """
    with open(path) as handle:
        return frozenset(handle.read().split("\n"))


def extract_words(line, stopwords):
    """Return the words of *line* that should be emitted, in input order.

    The line is lowercased, stripped, cleared of punctuation and split on
    whitespace. A word is kept only if it is longer than three characters,
    contains no digit and is not a member of *stopwords*.
    """
    cleaned = PUNCTUATION_RE.sub("", line.lower().strip())
    return [
        word for word in cleaned.split()
        if len(word) > 3
        and not any(char.isdigit() for char in word)
        and word not in stopwords
    ]


def main(lines=None, stopwords_path=STOPWORDS_PATH, out=None):
    """Run the mapper: write one ``(("word", doc_id), 1)`` line per kept word.

    lines          -- iterable of input lines; defaults to sys.stdin.
    stopwords_path -- file holding the stop words; defaults to STOPWORDS_PATH.
    out            -- object with a ``write`` method; defaults to sys.stdout.

    doc_id starts at 0 and is incremented every time the value of the
    ``map_input_file`` environment variable differs from the one seen on the
    previous line.
    """
    if lines is None:
        lines = sys.stdin
    if out is None:
        out = sys.stdout

    stopwords = load_stopwords(stopwords_path)
    current_filename = ""
    doc_id = 0

    for line in lines:
        # Set file ID as collection ID.
        # NOTE(review): presumably Hadoop Streaming sets map_input_file once
        # per map task; if so the value never changes while this process runs
        # and doc_id stays at 1 for every record -- confirm how documents are
        # meant to be told apart across input files.
        file_name = os.getenv('map_input_file')
        if current_filename != file_name:
            current_filename = file_name
            doc_id += 1

        for word in extract_words(line, stopwords):
            # write() instead of the print statement: same bytes on stdout,
            # and valid syntax on both Python 2 and Python 3.
            out.write('(("%s", %d), 1)\n' % (word, doc_id))


if __name__ == "__main__":
    main()