In lesson 4 of Udacity's "Intro to Hadoop and MapReduce" course there was an inverted index exercise. You can find the code below. Mapper:
import sys
import csv
import re
def mapper():
    """Emit one ``word<TAB>node`` record per word found in a row's body.

    Reads tab-delimited rows from ``sys.stdin``. Column 0 is taken as the
    node id and column 4 as the body text. The body is lower-cased and split
    on a fixed set of punctuation/whitespace delimiters; every non-empty
    piece is written to ``sys.stdout`` as ``word\\tnode``.

    The header row (column 8 equal to ``"added_at"``) is skipped. Rows with
    fewer than nine columns (for example blank lines) are skipped as well,
    because the header check needs column 8 to exist.
    """
    reader = csv.reader(sys.stdin, delimiter='\t')
    delimiters = ['[', ']', '#', '$', '-', '=', '/', ' ', '\t', '\n', '.',
                  '!', '?', ':', ';', '"', '(', ')', '<', '>', ',']
    # Compile once: the same pattern is applied to every row.
    splitter = re.compile('|'.join(map(re.escape, delimiters)))
    write = sys.stdout.write
    for line in reader:
        if len(line) < 9:
            # Blank or truncated row: nothing usable to index.
            continue
        if line[8] == "added_at":
            # Header row.
            continue
        node = line[0]
        body = line[4]
        for word in splitter.split(body.lower()):
            if word:
                # Write the record explicitly so that the only separator is
                # a single tab (a comma-separated print statement would pad
                # the tab with spaces, polluting both key and value).
                write(word + '\t' + node + '\n')
def main():
    """Run the inverted-index mapper over standard input."""
    # The former function-local ``import StringIO`` was never used.
    mapper()


if __name__ == "__main__":
    main()
Reducer:
#!/usr/bin/python
import sys
def reducer():
    """Collapse sorted ``key<TAB>value`` lines into one line per key.

    Reads lines from ``sys.stdin``, which are expected to arrive grouped by
    key (as after the shuffle/sort phase). For each run of identical keys a
    single line ``key<TAB>value1<TAB>value2...`` is written to
    ``sys.stdout``.

    Each line is split at its first tab. Surrounding whitespace is stripped
    from both key and value, so records emitted with padding around the tab
    are grouped and printed cleanly. Lines without a tab, or with an empty
    key or value, are skipped instead of aborting the job.
    """
    write = sys.stdout.write
    current_key = None
    values = []
    for line in sys.stdin:
        key, sep, value = line.partition("\t")
        key = key.strip()
        value = value.strip()
        if not sep or not key or not value:
            # Blank or malformed line: not a key/value record.
            continue
        if current_key is not None and key != current_key:
            # Key changed: flush everything collected for the previous key.
            write(current_key + "\t" + "\t".join(values) + "\n")
            values = []
        current_key = key
        values.append(value)
    if current_key is not None:
        # Flush the final group.
        write(current_key + "\t" + "\t".join(values) + "\n")
def main():
    """Run the reducer over standard input."""
    # The former function-local ``import StringIO`` was never used.
    reducer()


if __name__ == "__main__":
    main()
In the final project there was an exercise to compute the top 10 tags, which required reading the whole export.
If we want to find the top 10 contributors, we apply the same pattern and read the whole file again. This is not efficient.
Instead, we could use slightly modified code and create an index for posts or tags.
Mapper for user activity (the reducer is not modified):
#!/usr/bin/python
import sys
import csv
import re
def mapper():
    """Emit one ``user<TAB>post`` record per input row.

    Reads tab-delimited rows from ``sys.stdin``. Column 3 is taken as the
    user and column 0 as the post id; each row is written to ``sys.stdout``
    as ``user\\tpost``.

    The header row (column 8 equal to ``"added_at"``) is skipped. Rows with
    fewer than nine columns (for example blank lines) are skipped as well,
    because the header check needs column 8 to exist.
    """
    reader = csv.reader(sys.stdin, delimiter='\t')
    write = sys.stdout.write
    for line in reader:
        if len(line) < 9 or line[8] == "added_at":
            # Blank/truncated row, or the header row.
            continue
        user = line[3]
        post = line[0]
        # Single tab separator, no padding spaces around it.
        write(user + '\t' + post + '\n')
def main():
    """Run the user-activity mapper over standard input."""
    # The former function-local ``import StringIO`` was never used.
    mapper()


if __name__ == "__main__":
    main()
Given that index, we can count posts very quickly:
#!/usr/bin/python
import sys
def mapper():
    """Emit ``<zero-padded count> <key>`` for every index line on stdin.

    Each input line is expected to look like ``key<TAB>v1<TAB>v2...``. The
    count is the number of tab-separated fields after the key, left-padded
    with zeros to ten digits so that a plain text sort orders the records
    numerically. Count and key are separated by a single space.

    Blank lines are skipped (they would otherwise produce a zero-count
    record with an empty key), and whitespace around the key is stripped.
    """
    write = sys.stdout.write
    for line in sys.stdin:
        data = line.strip().split("\t")
        key = data[0].strip()
        if not key:
            # Blank line: nothing to count.
            continue
        write(str(len(data) - 1).zfill(10) + " " + key + "\n")
def main():
    """Run the post-counting mapper over standard input."""
    # The former function-local ``import StringIO`` was never used.
    mapper()


if __name__ == "__main__":
    main()
The mapper adds leading zeros so that the counts sort correctly as text, the MR job sorts the data, and there is no reducer (identity). As a result we get all the users with their post counts, sorted by count. Below you can find the top 10 contributors:
0000000954 100008240
0000001015 100005156
0000001021 100008306
0000001064 100007518
0000001416 100008230
0000001419 100005396
0000001448 100000461
0000001494 100008518
0000001660 100008283
0000001793 100005361
0000001910 100001071
The same code applies to counting tags, but instead of users we emit tags.
Brak komentarzy:
Prześlij komentarz