python/test/test_re.py at master · lixiangflyin/python

History

125 lines (108 loc) · 3.17 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

# coding=utf-8

__author__ = 'zhanghe'

import re

import json

# Sample HTML fragment intended as input for re_html(); its only use in this
# file is the commented-out call in the __main__ block.
# NOTE(review): the markup is unbalanced and contains no <dt> elements, so
# re_html() finds no labels in it -- it looks truncated; confirm against the
# intended fixture.
test_html = '''<dl class="lineDl">

<dd>

<div class="inptext_fl ">应届毕业生</div>

</dd>

</dl>

<dd>

</dd>

</dl>

<dd>

<div class="inptext_fl ">目前正在找工作</div>

</dd>

</dl>

</dd>

<dd>

</dd>

</dl>'''

def re_html(html):
    """
    Extract label/value pairs from an HTML fragment with regular expressions.

    Labels are the text of ``<dt>`` elements that end with a full-width
    colon; values are the text of ``<div class="inptext_fl ">`` elements.
    The two lists are paired positionally, in document order.

    :param html: HTML text to scan
    :return: JSON string (4-space indent, non-ASCII characters kept as-is)
        holding the label -> value mapping
    """
    dt_pattern = re.compile(r'<dt>(.+?)：</dt>')
    dd_pattern = re.compile(r'<div class="inptext_fl ">(.+?)</div>')

    labels = dt_pattern.findall(html)
    values = dd_pattern.findall(html)

    # zip() stops at the shorter list, so surplus labels or values are
    # dropped; when a label repeats, the last paired value wins.
    html_dict = dict(zip(labels, values))
    return json.dumps(html_dict, ensure_ascii=False, indent=4)

def get_email(html=None):
    """
    Extract e-mail addresses from text.

    :param html: text to scan; ``None`` (the default) or empty text yields
        an empty list
    :return: list of matched addresses, lower-cased, in order of appearance
    """
    if not html:
        return []

    email_rule = r'[^\_\@\s\W][\w\_\-\.]{1,}\@(?:[^\s\.]{1,}\.){1,}(?:[a-z]{2,4}\.?){1,2}'
    email_list = re.compile(email_rule, re.S).findall(html)

    # The optional '\.?' after the last domain label also captures a
    # sentence-ending full stop ("write to foo@bar.com."), which is not part
    # of the address -- strip it before normalising the case.
    return [item.rstrip('.').lower() for item in email_list]

def test_has_key(html=u''):
    """
    Check whether the text contains one of the listed keywords.

    Prints the matched keywords joined by spaces, then prints '代招职位'
    when at least one keyword matched and '正常职位' otherwise.

    :param html: unicode text to scan
    :return: None
    """
    # Plain u'' rather than ur'': the pattern contains no backslashes, so the
    # value is identical, while the ur prefix is a syntax error on Python 3.
    rule = u'代招|某知名|猎头职位|某互联网|某.*公司'
    # re.S lets the '.' in the last alternative match across line breaks.
    # NOTE(review): '.*' is greedy, so one match can run from the first '某'
    # to the last '公司' in the text -- confirm that is intended.
    key_list = re.compile(rule, re.S).findall(html)

    print(' '.join(key_list))  # matched keywords
    print('代招职位' if key_list else '正常职位')

if __name__ == '__main__':
    # Manual smoke run: extract e-mail addresses from a sample line, dump
    # them as JSON, then run the keyword check on a sample job title.
    # print(re_html(test_html))
    # NOTE(review): the sample addresses show up as '[email protected]'
    # placeholders in this copy of the file; restore real addresses before
    # relying on the printed result.
    html_test = '''邮箱：[email protected] [email protected] [email protected] [email protected] [email protected] @[email protected] @[email protected] [email protected] 地址：上海市'''
    email_result = get_email(html_test)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(json.dumps(email_result, ensure_ascii=False, indent=4))
    test_has_key(u'''某大型公司职位''')

"""

测试结果

[

"[email protected]",

"[email protected]"

]

"""

"""

(?:pattern)

匹配 pattern 但不获取匹配结果

例如， 'industr(?:y|ies)' 就是一个比 'industry|industries' 更简略的表达式。

"""

"""

修饰符描述

re.I 使匹配对大小写不敏感

re.L 做本地化识别（locale-aware）匹配

re.M 多行匹配，影响 ^ 和 $

re.S 使 . 匹配包括换行在内的所有字符

re.U 根据Unicode字符集解析字符。这个标志影响 \w, \W, \b, \B.

re.X 该标志通过给予你更灵活的格式以便你将正则表达式写得更易于理解。

"""

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

test_re.py

Latest commit

History

test_re.py

File metadata and controls