-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathapp.js
More file actions
120 lines (93 loc) · 3.15 KB
/
app.js
File metadata and controls
120 lines (93 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
var eventproxy=require('eventproxy');
var superagent=require('superagent');
var cheerio=require('cheerio');
var url=require('url');
var express=require('express');
var async = require('async');
// Express application instance; the route and the listener below are registered on it.
var app=express();
// Entry pages: listing pages 1 and 2 of the cnodejs.org "good" tab.
var cnodeUrls=[];
var pageNo=1;
while(pageNo<3){
  cnodeUrls[cnodeUrls.length]='https://cnodejs.org/?tab=good&page='+pageNo;
  pageNo+=1;
}
// Topic links collected so far. Declared at module level, so the list
// persists from one incoming request to the next.
var result=[];
var flag=0;
/**
 * Escapes the characters that are significant in HTML so scraped text can be
 * written into the response without being interpreted as markup.
 *
 * @param {*} text value to escape; it is converted with String() first.
 * @returns {string} the escaped text.
 */
function escapeHtml(text){
  return String(text)
    .replace(/&/g,'&amp;')
    .replace(/</g,'&lt;')
    .replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;')
    .replace(/'/g,'&#39;');
}

/**
 * Requests every listing page in `cnodeUrls` and appends each topic link that
 * is not already present to the module-level `result` array.
 *
 * The requests are asynchronous, so the array returned here is still being
 * filled when this function returns. Pass `done` to be told when every
 * listing page has been handled.
 *
 * @param {function(Array)} [done] optional; called once with `result` after
 *   every listing request has finished, whether it succeeded or failed.
 * @returns {Array} the shared `result` array (same reference on every call).
 */
function getUrls(done){
  var pending=cnodeUrls.length;

  function notifyDone(){
    if(typeof done==='function'){
      done(result);
    }
  }

  // Counts one listing page as handled and reports once none are left.
  function pageHandled(){
    pending--;
    if(pending===0){
      notifyDone();
    }
  }

  if(pending===0){
    notifyDone();
    return result;
  }

  cnodeUrls.forEach(function(cnodeUrl){
    superagent.get(cnodeUrl)
      .end(function(err,res){
        if(err){
          console.error(err);
          // A failed page still counts, otherwise `done` would never fire.
          pageHandled();
          return;
        }
        var $=cheerio.load(res.text);
        $('#topic_list .topic_title').each(function(idx,element){
          var rawHref=$(element).attr('href');
          // url.resolve throws on a non-string target, so skip anchors without href.
          if(!rawHref){
            return;
          }
          var href=url.resolve(cnodeUrl,rawHref);
          // De-duplicate: `result` outlives a single request, so a page
          // refresh would otherwise append the same links again.
          if(result.indexOf(href)===-1){
            result.push(href);
          }
        });
        pageHandled();
      });
  });
  return result;
}

/**
 * GET / : collects the topic links, fetches every topic page with at most
 * five requests in flight, then writes title, first reply and URL of each
 * topic as HTML and ends the response.
 */
app.get('/',function(req,sres,next){
  getUrls(function(allUrls){
    // Work on a snapshot: `allUrls` is the shared array and another request
    // may append to it while this crawl is running.
    var topicUrls=allUrls.slice();
    var curCount=0;

    // Fetches one topic page. `callback` receives [topicUrl, html], with html
    // set to null when the request failed. The slot is released only after
    // BOTH the request has finished and a random delay below one second has
    // elapsed, which keeps the request rate down.
    var reptileMove=function(topicUrl,callback){
      var delay=Math.floor(Math.random()*1000);
      var waitingFor=2;
      var html=null;

      function settle(){
        waitingFor--;
        if(waitingFor===0){
          curCount--;
          callback(null,[topicUrl,html]);
        }
      }

      curCount++;
      console.log('现在的并发数是', curCount, ',正在抓取的是', topicUrl, ',耗时' + delay + '毫秒');
      superagent.get(topicUrl)
        .end(function(err,topicRes){
          if(err){
            // Log and continue; one failing topic must not stall the whole page.
            console.log(err);
          }else{
            // topicRes.text holds the HTML returned for the topic page.
            html=topicRes.text;
          }
          settle();
        });
      setTimeout(settle,delay);
    };

    // async.mapLimit(collection, limit, iteratee, callback): the final
    // callback receives the results in the same order as `topicUrls`.
    async.mapLimit(topicUrls,5,function(topicUrl,callback){
      reptileMove(topicUrl,callback);
    },function(err,topics){
      if(err){
        return next(err);
      }
      var num=1;
      sres.writeHead(200, {'Content-Type': 'text/html;charset=utf-8'});
      topics.forEach(function(topicPair){
        var topicUrl=topicPair[0];
        var topichtml=topicPair[1];
        if(topichtml===null){
          // The fetch failed and was already logged; nothing to render.
          return;
        }
        var $=cheerio.load(topichtml);
        sres.write('<b>'+num+"</b> title: "+escapeHtml($('.topic_full_title').text().trim())+'<br>');
        sres.write("评论:"+escapeHtml($('.reply_content').eq(0).text().trim())+'<br>');
        sres.write("url: "+escapeHtml(topicUrl)+'<br><br>');
        num++;
      });
      // Without end() the client keeps waiting for more data.
      sres.end();
      console.log("finish");
    });
  });
});
// Start the HTTP server on port 3000; the callback logs a confirmation line.
function onListening(){
  console.log('app is run');
}
app.listen(3000,onListening);