Skip to content

Commit cbfd810

Browse files
author
xuming06
committed
add text cluster demo.
1 parent d6501bc commit cbfd810

8 files changed

Lines changed: 1630 additions & 1315 deletions

File tree

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
@author:XuMing([email protected])
4+
@description:
5+
"""
6+
7+
from flask import Flask, request
8+
from flask_restful import Api, Resource
9+
10+
# Build the Flask application, then create the Api object with the app as its argument.
app = Flask(__name__)
api = Api(app)
13+
14+
# 创建 Welcome 类,描述欢迎信息(框架可以序列化任意类型的对象)
15+
class Welcome:
    """Welcome information: the visitor's name plus the greeting built from it."""

    def __init__(self, name):
        greeting = "Hello %s, Welcome to flask-restaction!" % name
        self.name = name
        self.message = greeting
20+
21+
# 创建一个 Hello 类,定义 get 方法
22+
class Hello:
    """Hello world"""

    # The docstring of ``get`` describes the input parameters and the output format.
    def get(self, name):
        """
        Get welcome message

        $input:
            name?str&default="world": Your name
        $output:
            message?str: Welcome message
        """
        welcome = Welcome(name)
        return welcome
36+
37+
class HelloWorld(Resource):
    """Resource whose GET handler returns a fixed hello/world mapping."""

    def get(self):
        payload = {'hello': 'world'}
        return payload


# Register HelloWorld at the root URL.
api.add_resource(HelloWorld, '/')
42+
43+
44+
45+
# In-memory todo store keyed by todo id (module-level dict, not persisted).
todos = {}


class TodoSimple(Resource):
    """Resource that reads and writes one entry of ``todos`` per URL id."""

    def get(self, todo_id):
        """Return ``{todo_id: value}``, or a 404 payload for an unknown id."""
        if todo_id not in todos:
            # Indexing todos directly would raise KeyError and surface as an
            # HTTP 500; an unknown id is a client-side "not found".
            return {'message': 'Todo {0} does not exist'.format(todo_id)}, 404
        return {todo_id: todos[todo_id]}

    def put(self, todo_id):
        """Store the ``data`` form field under ``todo_id`` and echo the entry."""
        todos[todo_id] = request.form['data']
        return {todo_id: todos[todo_id]}


# Route every single-segment path to TodoSimple, passing it as ``todo_id``.
api.add_resource(TodoSimple, '/<string:todo_id>')
56+
57+
58+
if __name__ == '__main__':
    # Run the application directly with Flask's debug mode switched on.
    app.run(debug=True)
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
@author:XuMing([email protected])
4+
@description:
5+
"""
6+
7+
8+
import json
9+
from datetime import datetime
10+
11+
import pymysql
12+
import requests
13+
from flask import Flask, redirect, request
14+
15+
# Flask application object; the route handlers in this file register on it via @app.route.
app = Flask(__name__)
16+
17+
18+
19+
class ApiError(Exception):
    """Error carrying an API error code and message.

    Attributes:
        code: error status code.
        msg: error message.
    """

    def __init__(self, code, msg):
        # Forward both values to Exception so that ``args`` (and with it
        # repr() and pickling) carries them instead of being empty.
        super(ApiError, self).__init__(code, msg)
        # Error status code
        self.code = code
        # Error message
        self.msg = msg

    def __str__(self):
        return '{0}:{1}'.format(self.code, self.msg)
31+
32+
33+
class ServerError(Exception):
    """Exception type with no behaviour of its own beyond ``Exception``."""
35+
36+
class WeiboClient(object):
    """Client for the Weibo OAuth2 and REST endpoints used by this demo.

    The access-token response is cached on the instance in ``token`` after
    the first successful ``get_token`` call.
    """

    API_URL = 'https://api.weibo.com/'  # Weibo API base URL
    # Callback URL used for both the authorize and the access_token request.
    REDIRECT_URI = 'http://test.baidu.com'
    # Seconds to wait for the remote API; requests has no timeout by default
    # and would otherwise block indefinitely on a stalled connection.
    TIMEOUT = 10

    def __init__(self, client_id, client_secret):
        self.client_id = client_id  # application id
        self.client_secret = client_secret  # application secret
        self.token = {}  # cached access_token response (empty until fetched)

    @property
    def access_token(self):
        """Return the cached access token, or None when no token is cached."""
        return self.token['access_token'] if self.token else None

    def _build_url(self, path, query):
        """Return ``API_URL + path`` with ``query`` appended URL-encoded."""
        from urllib.parse import urlencode

        return self.API_URL + path + '?' + urlencode(query)

    def fetch(self, method, url, params=None):
        """Send one API request and return the decoded JSON body.

        :param method: ``'POST'`` sends a POST; any other value sends a GET.
        :param url: absolute request URL.
        :param params: optional dict passed to requests as the second
            positional argument (form data for POST, query string for GET).
        :return: the parsed JSON on a 2xx response without ``error_code``;
            ``None`` on any failure (the failure is printed, not raised) and
            for status codes outside 2xx that are below 400.
        """
        # A fresh dict per call; a ``{}`` default would be shared by all calls.
        params = {} if params is None else params
        try:
            if method == 'POST':
                resp = requests.post(url, params, timeout=self.TIMEOUT)
            else:
                resp = requests.get(url, params, timeout=self.TIMEOUT)

            if resp.status_code >= 400:
                raise ServerError(resp.status_code)
            if 200 <= resp.status_code < 300:
                # The request itself succeeded; the body may still report an
                # application-level error.
                rest = resp.json()
                if 'error_code' in rest:
                    raise ApiError(rest['error_code'], rest['error'])
                return rest
        except ApiError as e:
            print('ApiError', e)
        except ServerError as e:
            print('ServerError', e)
        except Exception as e:
            print('Exception', e)
        return None

    def get_ticket_url(self, redirect_uri=None):
        """Return the authorize URL the browser is redirected to.

        :param redirect_uri: callback URL; defaults to ``REDIRECT_URI``.
        """
        if redirect_uri is None:
            redirect_uri = self.REDIRECT_URI
        # The callback URL contains ':' and '/', so it must be percent-encoded
        # to be a well-formed query value.
        return self._build_url('oauth2/authorize', {
            'client_id': self.client_id,
            'response_type': 'code',
            'redirect_uri': redirect_uri,
        })

    def get_token(self, code):
        """Exchange an authorization code for a token and cache it.

        :param code: authorization code received on the callback.
        :return: the cached token dict; an empty dict when the exchange
            failed (nothing is cached in that case, so a later call retries).
        """
        # Already have one: return it directly.
        if self.token:
            return self.token
        url = self._build_url('oauth2/access_token', {
            'client_id': self.client_id,
            'client_secret': self.client_secret,
            'grant_type': 'authorization_code',
            'redirect_uri': self.REDIRECT_URI,
            'code': code,
        })
        # fetch() already returns the decoded JSON (or None), so there is no
        # response object left to call .json() on.
        token = self.fetch('POST', url)
        if token:
            self.token = token
        return self.token

    def get_user_info(self, access_token, uid):
        """Fetch a user's profile.

        :return: the decoded JSON dict from ``2/users/show.json``, or
            ``None`` when the request failed.
        """
        url = self.API_URL + '2/users/show.json'
        return self.fetch('GET', url, {
            'access_token': access_token,
            'uid': uid
        })

    def get_conn(self):
        """Open and return a new MySQL connection.

        Connection errors from pymysql propagate to the caller; swallowing
        them here would only turn them into an UnboundLocalError on return.
        """
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to configuration.
        return pymysql.connect(
            db='test',
            host='localhost',
            user='root',
            password='xxxxMMMM3333#',
            charset='utf8'
        )

    def weibo_share(self):
        """Post a status containing the current time to Weibo.

        :return: the decoded JSON response, or ``None`` on failure.
        """
        url = self.API_URL + '2/statuses/share.json'
        return self.fetch('POST', url, {
            # NOTE(review): the cached token is forwarded on the assumption
            # that this endpoint needs it, like users/show - confirm against
            # the Weibo API docs.
            'access_token': self.access_token,
            'status': '现在是北京时间: {0} http://test.baidu.com'.format(datetime.now())
        })
143+
144+
# Application credentials (left blank in this file) and the shared client built from them.
client_id, client_secret = '', ''
client = WeiboClient(client_id, client_secret)
147+
148+
149+
# 登录
150+
@app.route('/')
def index():
    """Exchange the ``code`` query argument for a token, then store and return the user's profile.

    Inserts the user's id, screen name and avatar URL into the ``user``
    table and responds with the profile as JSON. Responds with a 502 JSON
    error when the token or the profile could not be obtained.
    """
    # NOTE(review): 200 is the original fallback when no ``code`` argument is
    # present; it does not look like a usable authorization code - confirm.
    code = request.args.get('code', 200)
    # Use the code to obtain the token.
    token = client.get_token(code)
    if not token or 'access_token' not in token:
        return json.dumps({'error': 'failed to obtain access token'}), 502

    # Fetch the user's profile.
    user_info = client.get_user_info(token['access_token'], token.get('uid'))
    if not user_info:
        return json.dumps({'error': 'failed to fetch user info'}), 502
    third_id = user_info['id']
    nickname = user_info['screen_name']
    headimg = user_info['profile_image_url']

    # Placeholders instead of string formatting: the values come from a
    # remote response and must not be spliced into the SQL text.
    sql = "INSERT INTO `user`(`third_id`, `nickname`, `headimg`) VALUES(%s, %s, %s)"
    conn = client.get_conn()
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, (third_id, nickname, headimg))
        conn.commit()
    finally:
        # Release the connection on every request, even if the insert fails.
        conn.close()
    return json.dumps(user_info)
170+
171+
172+
# 登录回调获取token
173+
@app.route('/weibo')
def weibo():
    """Redirect the browser to the URL returned by ``client.get_ticket_url()``."""
    return redirect(client.get_ticket_url())
177+
178+
# 分享
179+
@app.route('/share')
def share():
    """Call ``client.weibo_share()`` and return its result serialized as JSON."""
    return json.dumps(client.weibo_share())
183+
184+
if __name__ == '__main__':
    # Run the application directly on port 8010 with debug mode on.
    app.run(port=8010, debug=True)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
@author: XuMing <[email protected]>
5+
@summary:
6+
"""
7+
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
@author:XuMing([email protected])
4+
@description:
5+
"""
6+
7+
import math
8+
9+
import jieba
10+
import jieba.analyse
11+
import numpy as np
12+
13+
14+
def read_words(file_path):
    """Return the set of whitespace-stripped lines of a UTF-8 text file.

    :param file_path: path of the file to read.
    :return: set of stripped lines (a blank line contributes ``''``).
    """
    with open(file_path, "r", encoding='utf-8') as handle:
        return {raw.strip() for raw in handle}
21+
22+
23+
def trim_stopwords(words, stop_words_set):
    """
    Remove stop words from a segmented text.

    :param words: iterable of tokens.
    :param stop_words_set: container of tokens to drop.
    :return: list of the remaining tokens, in their original order.
    """
    return [token for token in words if token not in stop_words_set]
36+
37+
38+
def tfidf(term, doc, word_dict, doc_set):
    """Return the product of a term-frequency and an inverse-document-frequency value.

    :param term: the term to score.
    :param doc: sequence supporting ``count`` and ``len``.
    :param word_dict: mapping whose ``term`` entry is the divisor in the idf ratio.
    :param doc_set: collection whose length is the numerator in the idf ratio.
    """
    occurrences = float(doc.count(term))
    # The 0.001 keeps the denominator non-zero for an empty doc.
    term_freq = occurrences / (len(doc) + 0.001)
    inverse_doc_freq = math.log(float(len(doc_set)) / word_dict[term])
    return term_freq * inverse_doc_freq
42+
43+
44+
def idf(term, word_dict, docset):
    """Return ``log(len(docset) / word_dict[term])``."""
    ratio = float(len(docset)) / word_dict[term]
    return math.log(ratio)
47+
48+
49+
def get_all_vector(file_path, stop_words):
    """Build a TF-IDF matrix from a tab-separated text file.

    Each line is split on tabs: the first column is kept as the row's name,
    the remaining columns are joined with spaces, lower-cased, stripped of
    two template placeholders, segmented with jieba and filtered through
    ``trim_stopwords``.

    :param file_path: path of the UTF-8 input file.
    :param stop_words: stop-word container passed to ``trim_stopwords``.
    :return: ``(names, tfidf)``: the list of first-column values and a
        ``(len(names), vocabulary_size)`` array of raw term counts scaled by
        ``log(n_docs / document_frequency)``. Columns follow the sorted
        vocabulary, so the layout is reproducible between runs.
    """
    from collections import Counter

    names = []
    term_counts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            cols = line.strip().split("\t")
            names.append(cols[0])
            content = " ".join(cols[1:])
            content = content.lower().replace("{地域}{投放地域}", "").replace("{关键词}", "")
            doc = trim_stopwords(jieba.lcut(content), stop_words)
            # Count each document once instead of calling list.count for
            # every vocabulary word.
            term_counts.append(Counter(doc))

    vocabulary = sorted(set().union(*term_counts))
    docs_matrix = np.zeros((len(names), len(vocabulary)))
    if not vocabulary:
        # Empty file, or nothing left after stop-word removal.
        return names, docs_matrix

    column_of = {word: col for col, word in enumerate(vocabulary)}
    for row, counts in enumerate(term_counts):
        for word, count in counts.items():
            docs_matrix[row, column_of[word]] = count

    # Every vocabulary word occurs in at least one document, so the document
    # frequency is never zero.
    doc_freq = np.count_nonzero(docs_matrix, axis=0)
    idf = np.log(docs_matrix.shape[0] / doc_freq)
    # Broadcasting scales each column by its idf; it avoids materialising a
    # dense vocabulary x vocabulary diagonal matrix.
    tfidf = docs_matrix * idf

    return names, tfidf
89+
90+
91+
def gen_sim(A, B):
    """Return the cosine similarity of two vectors rescaled to ``[0, 1]``.

    :param A: vector given as a 1-D array or a single-row/column matrix.
    :param B: vector of the same length, in any of the same forms.
    :return: ``0.5 + 0.5 * cos(A, B)``; ``0.5`` when either vector has zero
        norm (the denominator is then replaced by 1).
    """
    # Flatten first so the dot product is a true scalar; calling float() on
    # a one-element 1-D/2-D result is deprecated in recent NumPy releases.
    a = np.asarray(A, dtype=float).ravel()
    b = np.asarray(B, dtype=float).ravel()
    num = float(np.dot(a, b))
    denum = np.linalg.norm(a) * np.linalg.norm(b)
    if denum == 0:
        denum = 1
    cosn = num / denum
    return 0.5 + 0.5 * cosn
99+
100+
101+
def rand_center(data_set, k):
    """Create ``k`` random centroids inside the per-column bounds of the data.

    :param data_set: 2-D array or matrix, one row per sample.
    :param k: number of centroids.
    :return: ``k x n_columns`` ``np.matrix``; column ``j`` is drawn uniformly
        between the minimum and maximum of data column ``j``.
    """
    data = np.asarray(data_set, dtype=float)
    n = data.shape[1]
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0; the
    # return type stays np.matrix.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):  # one random draw of k values per dimension
        column = data[:, j]
        low = column.min()
        span = float(column.max() - low)
        centroids[:, j] = low + span * np.random.rand(k, 1)
    return centroids
109+
110+
111+
def kmeans(data_set, k, max_iter=51):
    """Cluster the rows of ``data_set`` into ``k`` groups by cosine distance.

    :param data_set: 2-D array, one row per sample.
    :param k: number of clusters.
    :param max_iter: upper bound on assignment/update rounds (51 matches the
        previous fixed loop count).
    :return: ``(centroids, clusterAssment)``: a ``k x n`` matrix of centroids
        and an ``m x 2`` matrix holding, per sample, the assigned cluster
        index and the squared distance to that centroid.
    """
    m = np.shape(data_set)[0]
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    centroids = rand_center(data_set, k)
    labels = np.full(m, -1)  # -1 = not assigned yet, forces a first update

    for _ in range(max_iter):
        new_labels = np.empty(m, dtype=int)
        for i in range(m):  # assign each sample to its closest centroid
            min_dist = np.inf
            min_index = -1
            for j in range(k):
                # gen_sim is a similarity (1 = same direction); turn it into
                # a distance so that "smallest" really means "closest".
                dist = 1.0 - gen_sim(centroids[j, :], data_set[i, :])
                if dist < min_dist:
                    min_dist = dist
                    min_index = j
            new_labels[i] = min_index
            clusterAssment[i, :] = min_index, min_dist ** 2

        if np.array_equal(new_labels, labels):
            # Assignments are stable, so the centroids would not move either.
            break
        labels = new_labels

        for cent in range(k):  # recalculate centroids
            members = data_set[np.nonzero(labels == cent)[0]]
            # An empty cluster keeps its previous centroid; the mean of zero
            # rows would be NaN.
            if len(members):
                centroids[cent, :] = np.mean(members, axis=0)
    return centroids, clusterAssment
133+
134+
135+
if __name__ == "__main__":
    # Load stop words, vectorise the sample file, cluster into 8 groups and
    # print each row's cluster label next to its name.
    stop_words = read_words("../../data/stopword.txt")
    names, tfidf_mat = get_all_vector("./yl_10.txt", stop_words)
    myCentroids, clustAssing = kmeans(tfidf_mat, 8)
    labels = clustAssing[:, 0]
    for label, name in zip(labels, names):
        print(label, name)

0 commit comments

Comments
 (0)