-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtmlStats.js
More file actions
162 lines (151 loc) · 5.03 KB
/
htmlStats.js
File metadata and controls
162 lines (151 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
const http = require('http');
const https = require('https');
const util = require('util');
const readline = require('readline');
const htmlparser = require("htmlparser");
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout
});
/**
* Prompts user for html page address in the format http://www.mywebsite.com,
* tries to get page source (with http or https protocols according to the address specified-
* no other protocols are supported) and parses the retrieved source.
* @function
* @augments
*
*/
function readLine(){
rl.question('Insert address: ', (answer) => {
console.log(`Address inserted: ${answer}`);
try{
var substrs = answer.split("://");
var protocolStr = substrs[0].toLowerCase();
var protocol;
if(protocolStr=='http')
protocol=http;
else if(protocolStr=='https')
protocol=https;
else
throw "unable to make request for address "+answer;
requestAndParse(answer, protocol, (info)=>{
if(info){
console.log("jsobject="+util.inspect(info, false, null));
console.log("json="+JSON.stringify(info, false, null));
}
readLine();
});
} catch(err){
console.log("error: "+err);
readLine();
}
});
};
/**
* Requests page source of address, using protocol (http or https),
* and parses the retrieved source.
* Searches html dom tree depth-first and collects meta info on the page html source, namely:
* - list of all children nodes by each tag type (property tagChildrenInfo)
* - total count of of children nodes by each tag type (property tagChildren)
* - list of resources loaded by the page by resource type, if specified (property resources)
* - total count of attributes by tag type (property tagCount)
* - maximum depth of html dom tree (property treeDepth)
* Meta info is passed to finishedCB as a javascript object, with the mentioned properties.
*
* @function
* @arguments
*
*/
function requestAndParse(address, protocol, finishedCB){
var handler = new htmlparser.DefaultHandler(function (error, dom) {
if(error)
console.log("error "+error);
//finishedCB();
});
var parser = new htmlparser.Parser(handler);
protocol.get(address, (res) => {
//console.log(`STATUS: ${res.statusCode}`);
//console.log(`HEADERS: ${JSON.stringify(res.headers)}`);
res.setEncoding('utf8');
res.on('data', (chunk) => {
//console.log(`BODY: ${chunk}`);
parser.parseChunk(chunk);
//console.log(chunk);
});
res.on('end', () => {
parser.done();
//console.log("dom="+util.inspect(handler.dom, false, null));
var tagMap = {};
var tagAttributes = {};
var resources = {};
var tagChildren = {};
var tagChildrenInfo = {};
var maxDepth=0;
var stIdx;
for(stIdx = 0; stIdx < handler.dom.length && !(handler.dom[stIdx].type=='tag' && (handler.dom[stIdx].name=='html' || handler.dom[stIdx].name=='HTML')); stIdx++);
if(stIdx>=handler.dom.length){
console.log("error: root element not found");
finishedCB();
}else {
handler.dom[stIdx]["depth"]=0;
var nodesToVisit=[handler.dom[stIdx]];
while(nodesToVisit.length>0){
var currNode = nodesToVisit.pop();
//visita
if(currNode.type=='tag' || currNode.type=='script' || currChild.type=='style'){
if(tagMap[currNode.name]==undefined){
tagMap[currNode.name]=0;
tagAttributes[currNode.name]=0;
tagChildren[currNode.name]=0;
tagChildrenInfo[currNode.name]=[];
}
tagMap[currNode.name]++;
if(currNode.attribs)
for(attr in currNode.attribs)
tagAttributes[currNode.name]++;
for(c=0;currNode.children && c<currNode.children.length;c++){
var currChild=currNode.children[c];
if(currChild.type=='tag' || currChild.type=='script' || currChild.type=='style'){
tagChildren[currNode.name]++;
tagChildrenInfo[currNode.name].push(currChild.data);
}
}
}
if(currNode.attribs && currNode.attribs.src){
var resName=currNode.name;
if((currNode.name=='source' || currNode.name=='SOURCE') && currNode.attribs.type){
resName=currNode.attribs.type;
}
if(!resources[resName])
resources[resName]=[];
resources[resName].push(currNode.attribs.src);
}
//console.log("visited node "+currNode.name+ " ("+currNode.type+")");
//marca proximas visitas
if(currNode.children != undefined && currNode.children.length>0){
for(i = 0; i < currNode.children.length; i++){
var child = currNode.children[i];
child["depth"]=currNode.depth+1;
nodesToVisit.push(child);
}
//console.log("added "+currNode.children.length+" children to visit");
if(maxDepth<currNode.depth+1){
maxDepth=currNode.depth+1;
//console.log("updated maxDepth="+maxDepth);
}
}
}
var ret={};
ret["tagChildrenInfo"]=tagChildrenInfo;
ret["tagChildren"]=tagChildren;
ret["resources"]=resources;
ret["tagAttributesCount"]=tagAttributes;
ret["tagCount"]=tagMap;
ret["treeDepth"]=maxDepth;
finishedCB(ret);
}
});
});
};
module.exports.requestAndParse = requestAndParse;
module.exports.readLine = readLine;