-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathinitial_upload.py
More file actions
54 lines (45 loc) · 1.71 KB
/
initial_upload.py
File metadata and controls
54 lines (45 loc) · 1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from itertools import islice
import xml.etree.ElementTree as ET
from elasticsearch import Elasticsearch
from dateutil.parser import parse as parse_date
import html
import re
from bs4 import BeautifulSoup
from io import BytesIO
from tokenize import tokenize, NAME
# Module-level Elasticsearch client used by the upload loop below to index
# documents. It is created with no arguments, so whatever connection
# defaults the elasticsearch library applies are in effect.
es = Elasticsearch()
def tokenize_code(code):
    """Collect the Python NAME tokens found in the ``<code>`` blocks of *code*.

    The markup is parsed with BeautifulSoup's ``html.parser`` and the text of
    every ``<code>`` element is fed to the standard ``tokenize`` module.
    Snippets need not be valid Python, so tokenizing is best-effort: when a
    snippet fails, the names gathered before the failure are kept and the
    rest of that snippet is skipped.

    Args:
        code: HTML markup to scan. ``None`` or an empty value yields an
            empty list.

    Returns:
        A sorted list of the distinct NAME token strings, so the result is
        reproducible from run to run. ``tokenize`` classifies keywords as
        NAME tokens as well, so they appear in the result.
    """
    if not code:
        return []

    names = set()
    for block in BeautifulSoup(code, 'html.parser').find_all('code'):
        try:
            source = BytesIO(block.text.encode('utf-8'))
            for token in tokenize(source.readline):
                if token.type == NAME:
                    names.add(token.string)
        except Exception:
            # Deliberately broad: arbitrary snippets can fail to encode or
            # tokenize in several ways, and one bad snippet must not abort
            # the scan. Names already collected from it are retained.
            continue
    return sorted(names)
# Dump to read, and the cap on how many element-end events one run consumes.
POSTS_XML_PATH = '/mnt/sopython-db/Posts.xml'
MAX_END_EVENTS = 1000000

# Extracts the individual tag names from a '<tag1><tag2>' style string.
TAG_PATTERN = re.compile(r'<(.*?)>')


def _question_document(elem):
    """Build the document to index for a ``row`` element, or None to skip it.

    A row is kept only when its ``PostTypeId`` is ``'1'``, its
    ``LastActivityDate`` falls in 2015 or later, and at least one of its tags
    contains the substring ``python``.

    Args:
        elem: a parsed ``row`` element; its attributes are not modified.

    Returns:
        A dict of the row's attributes in which values of keys ending in
        ``Id`` or ``Count`` are ints, values of keys ending in ``Date`` are
        parsed dates, ``Tags`` is a list of tag names and ``code_names`` holds
        the result of ``tokenize_code`` on the row's ``Body``; or ``None``
        when the row does not qualify.
    """
    # Cheapest test first, so rows of other types skip the date parsing.
    if elem.get('PostTypeId') != '1':
        return None

    doc = {}
    for key, value in elem.attrib.items():
        if key.endswith(('Id', 'Count')):
            doc[key] = int(value)
        elif key.endswith('Date'):
            doc[key] = parse_date(value)
        else:
            doc[key] = value

    last_activity = doc.get('LastActivityDate')
    if last_activity is None or last_activity.year < 2015:
        return None

    # A missing Tags attribute is treated as "no tags" instead of crashing.
    tags = TAG_PATTERN.findall(html.unescape(elem.get('Tags') or ''))
    if not any('python' in tag for tag in tags):
        return None

    doc['Tags'] = tags
    doc['code_names'] = tokenize_code(elem.get('Body'))
    return doc


inserted = 0
with open(POSTS_XML_PATH, 'rb') as source:
    # 'start' events must be requested explicitly: by default iterparse only
    # reports 'end' events, in which case the first item is the first finished
    # child instead of the document root. That would skip the first post and
    # leave processed rows attached to the real root.
    context = ET.iterparse(source, events=('start', 'end'))
    _, root = next(context)

    # Only completed elements are processed and counted against the cap.
    finished = (elem for event, elem in context if event == 'end')
    for idx, elem in enumerate(islice(finished, MAX_END_EVENTS)):
        if idx % 1000 == 0:
            print(idx, 'processed and', inserted, 'inserted')
        if elem.tag == 'row':
            doc = _question_document(elem)
            if doc is not None:
                es.index(index='python', doc_type='q', id=doc['Id'], body=doc)
                inserted += 1
        # Release the finished element and detach it from the root so the
        # in-memory tree does not grow while streaming the dump.
        elem.clear()
        root.clear()