-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrank_urls.py
More file actions
executable file
·79 lines (73 loc) · 2.67 KB
/
rank_urls.py
File metadata and controls
executable file
·79 lines (73 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python3
from argparse import ArgumentParser
from collections import defaultdict
import shelve
import sys
# Command-line interface of the script; the description is what -h/--help
# prints as the summary.
argparser = ArgumentParser(
    description='''
Creates db/urlrank, by distributing user scores (db/userrank)
to the urls they mentioned (in db/slice).
''',
)
# Size of the report printed to stdout.
argparser.add_argument(
    '-n', '--toprint',
    dest='toprint',
    type=int,
    default=10,
    help='how many urls to report to stdout',
)
# Switch: list, next to each reported url, the authors who mentioned it.
argparser.add_argument(
    '-e', '--endorsers',
    dest='endorsers',
    action='store_true',
    help='report which twitter authors mentioned the url',
)
# Urls containing this substring are left out; defaults to 'twitter.com'.
argparser.add_argument(
    '-f', '--filter',
    dest='filter',
    default='twitter.com',
    help='do not include urls containing a certain substring',
)
# Switch: print one line per user with every url that user mentioned.
argparser.add_argument(
    '-d', '--dump',
    dest='dump',
    action='store_true',
    help='for each user, all urls they mention',
)
def _collect_urls(filter_substring):
    """Return a dict mapping each tweet author to the list of urls they mentioned.

    Reads every tweet stored in db/slice (each value must expose ``author``
    and ``mention.urls``).  Urls containing ``filter_substring`` are skipped;
    an empty ``filter_substring`` disables filtering, because every string
    contains the empty string and a plain substring test would drop all urls.
    """
    urls_of_user = defaultdict(list)
    # flag='r': this database is an input; never create or modify it.
    with shelve.open('db/slice', flag='r') as tweets:
        for tweet in tweets.values():
            for url in tweet.mention.urls:
                if filter_substring and filter_substring in url:
                    continue
                urls_of_user[tweet.author].append(url)
    return urls_of_user


def _dump_urls(urls_of_user, users):
    """Write one line per author to stdout: screen name, then their urls."""
    for author, urls in urls_of_user.items():
        mentioned = ''.join(' {}'.format(url) for url in urls)
        sys.stdout.write(users[author].screen_name + mentioned + '\n')


def _collect_endorsers(urls_of_user, users):
    """Return a dict mapping each url to the set of screen names mentioning it."""
    endorsers_of_url = defaultdict(set)
    for author, urls in urls_of_user.items():
        # One shelf lookup per author instead of one per mentioned url.
        screen_name = users[author].screen_name
        for url in urls:
            endorsers_of_url[url].add(screen_name)
    return endorsers_of_url


def _score_urls(urls_of_user):
    """Return a dict mapping each url to its score.

    Each author's score from db/userrank (0 when the author has no entry) is
    split evenly among the urls that author mentioned, and the shares a url
    receives from all authors are summed.
    """
    score_of_url = defaultdict(float)
    with shelve.open('db/userrank', flag='r') as userrank:
        for author, urls in urls_of_user.items():
            if not urls:
                continue  # guards the division below
            share = userrank.get(author, 0) / len(urls)
            for url in urls:
                score_of_url[url] += share
    return score_of_url


def _store_url_scores(score_of_url):
    """Write the url scores to db/urlrank, replacing any previous content."""
    # flag 'n' always starts from a fresh, empty database.
    with shelve.open('db/urlrank', 'n') as urlrank:
        urlrank.update(score_of_url)


def _report_top_urls(score_of_url, endorsers_of_url, limit):
    """Write the ``limit`` best-scored urls to stdout, one per line.

    Urls are ordered by decreasing score, ties broken by url.  Each line holds
    the score, the url and, when known, the sorted screen names of the
    endorsers.  A negative ``limit`` is treated as 0 so that it cannot turn
    into a "drop the last entries" slice.
    """
    ranked = sorted(score_of_url.items(), key=lambda item: (-item[1], item[0]))
    for url, score in ranked[:max(limit, 0)]:
        sys.stdout.write('{:9.6f} {}'.format(score, url))
        for screen_name in sorted(endorsers_of_url.get(url, ())):
            sys.stdout.write(' {}'.format(screen_name))
        sys.stdout.write('\n')


def main():
    """Rank the urls mentioned in db/slice and store the result in db/urlrank.

    Steps, driven by the command-line options parsed by ``argparser``:

    1. gather the urls mentioned by each author (honouring ``--filter``);
    2. with ``--dump``, print every author's urls to stdout;
       with ``--endorsers``, remember which authors mentioned each url;
    3. distribute each author's db/userrank score over their urls;
    4. save the url scores to db/urlrank and report the count on stderr;
    5. print the ``--toprint`` best urls to stdout.

    The input databases (db/slice, db/users, db/userrank) are opened
    read-only, so a missing input raises an error instead of being silently
    created empty.  db/users is only opened when ``--dump`` or ``--endorsers``
    asks for screen names.
    """
    args = argparser.parse_args()

    urls_of_user = _collect_urls(args.filter)

    endorsers_of_url = {}
    if args.dump or args.endorsers:
        with shelve.open('db/users', flag='r') as users:
            if args.dump:
                _dump_urls(urls_of_user, users)
            if args.endorsers:
                endorsers_of_url = _collect_endorsers(urls_of_user, users)

    score_of_url = _score_urls(urls_of_user)
    _store_url_scores(score_of_url)
    sys.stderr.write('ranked {} urls\n'.format(len(score_of_url)))

    _report_top_urls(score_of_url, endorsers_of_url, args.toprint)
# Script entry point: run the ranking only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()