forked from zhanghe06/python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_re.py
More file actions
125 lines (108 loc) · 3.17 KB
/
test_re.py
File metadata and controls
125 lines (108 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# coding=utf-8
__author__ = 'zhanghe'
import re
import json
# Sample HTML fragment used as input for re_html(): every <dt> holds a label
# ending in ":" and every <div class="inptext_fl "> holds a value.  The last
# <dl> carries two label/value pairs, and its final value is a single space.
test_html = '''<dl class="lineDl">
<dt>工作年限:</dt>
<dd>
<div class="inptext_fl ">应届毕业生</div>
<input type="hidden" id="WorkYear" value="2"/>
</dd>
</dl>
<dl class="lineDl">
<dt>居住地:</dt>
<dd>
<div class="inptext_fl ">清远</div>
</dd>
</dl>
<dl class="lineDl">
<dt>求职状态:</dt>
<dd>
<div class="inptext_fl ">目前正在找工作</div>
</dd>
</dl>
<dl class="lineDl">
<dt>身高:</dt>
<dd class="dd_textW">
<div class="inptext_fl ">180cm </div>
</dd>
<dt>婚姻状况:</dt>
<dd>
<div class="inptext_fl "> </div>
</dd>
</dl>'''
def re_html(html):
    """
    Extract label/value pairs from an HTML fragment with regular expressions.

    Labels are the text of every ``<dt>...:</dt>`` element (without the
    trailing colon); values are the contents of every
    ``<div class="inptext_fl ">...</div>`` element.  The two ordered lists
    are paired by position (n-th label with n-th value) into a dictionary.

    :param html: HTML text to scan.
    :return: the pairs serialised as a JSON object string
        (``ensure_ascii=False``, 4-space indent).
    """
    labels = re.findall(r'<dt>(.+?):</dt>', html)
    # ``.*?`` rather than ``.+?``: an empty <div></div> must still produce an
    # (empty) value, otherwise every later value is paired with the wrong
    # label when the two lists are zipped.
    values = re.findall(r'<div class="inptext_fl ">(.*?)</div>', html)
    return json.dumps(dict(zip(labels, values)), ensure_ascii=False, indent=4)
def get_email(html=None):
    """
    Extract e-mail addresses from a piece of text.

    :param html: text to scan; ``None`` means there is nothing to scan.
    :return: list of the addresses found, in order of appearance, each
        converted to lower case.
    """
    if html is None:
        return []
    # Local part: one word character other than "_", then one or more of
    # [\w_.-]; domain: one or more dot-terminated labels followed by one or
    # two groups of 2-4 letters (e.g. "com" or "com.cn").
    email_rule = r'[^\_\@\s\W][\w\_\-\.]{1,}\@(?:[^\s\.]{1,}\.){1,}(?:[a-z]{2,4}\.?){1,2}'
    # re.I: the results are lower-cased below, so upper-case input such as
    # "USER@EXAMPLE.COM" has to match as well (the final class is [a-z]).
    email_list = re.compile(email_rule, re.I).findall(html)
    # The optional "\.?" at the end of the pattern also swallows a
    # sentence-ending full stop ("me@example.com."); drop it from the result.
    return [item.lower().rstrip('.') for item in email_list]
def test_has_key(html=u''):
    """
    Report whether the text contains one of the agency-posting keywords.

    Prints the matched keywords joined by single spaces, then prints
    '代招职位' (posting made on behalf of another company) when at least one
    keyword matched, or '正常职位' (ordinary posting) otherwise.

    :param html: unicode text to scan.
    :return: None
    """
    # Plain u'' instead of ur'': the pattern contains no backslashes, and the
    # ``ur`` prefix is a syntax error on Python 3.
    rule = u'代招|某知名|猎头职位|某互联网|某.*公司'
    # re.S lets the "某.*公司" alternative span line breaks.
    key_list = re.compile(rule, re.S).findall(html)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(' '.join(key_list))  # the matched keywords
    if key_list:
        print('代招职位')
    else:
        print('正常职位')
if __name__ == '__main__':
    # Manual smoke run: extract e-mail addresses from a sample string, dump
    # them as JSON, then run the keyword check on a sample job title.
    # print(re_html(test_html))
    # NOTE(review): the addresses in this sample read "[email protected]",
    # which looks like placeholder text left by e-mail obfuscation; replace
    # them with real sample addresses to exercise get_email() meaningfully.
    html_test = '''邮箱:[email protected] [email protected] [email protected] [email protected] [email protected] @[email protected] @[email protected] [email protected] 地址:上海市'''
    email_result = get_email(html_test)
    # print(...) with a single argument is valid on both Python 2 and 3.
    print(json.dumps(email_result, ensure_ascii=False, indent=4))
    test_has_key(u'''某大型公司职位''')
"""
测试结果
[
]
"""
"""
(?:pattern)
匹配 pattern 但不获取匹配结果
例如, 'industr(?:y|ies)' 就是一个比 'industry|industries' 更简略的表达式。
"""
"""
修饰符 描述
re.I 使匹配对大小写不敏感
re.L 做本地化识别(locale-aware)匹配
re.M 多行匹配,影响 ^ 和 $
re.S 使 . 匹配包括换行在内的所有字符
re.U 根据Unicode字符集解析字符。这个标志影响 \w, \W, \b, \B.
re.X 该标志通过给予你更灵活的格式以便你将正则表达式写得更易于理解。
"""