forked from akkana/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweborphans
More file actions
executable file
·340 lines (291 loc) · 12.5 KB
/
weborphans
File metadata and controls
executable file
·340 lines (291 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#!/usr/bin/env python
# Check a website (perhaps localhost) against a local mirror.
# Find broken links and orphaned files.
# You must specify both the directory, and a web URL to a server
# (e.g. localhost) that is serving that directory.
import sys, os
import posixpath
import re
import urllib2, urlparse, urllib
# from bs4 import BeautifulSoup
from BeautifulSoup import BeautifulSoup
class Spider:
    '''Crawl a website that is served from a local directory.

    Starting from a URL, follow every same-host link and image, record
    which URLs work and which are broken, and remember which local files
    were reached so that unreferenced ("orphaned") files can be reported.

    Result attributes (all lists, in discovery order):
      urls_succeeded   quoted URL paths that were fetched successfully
      urls_failed      quoted URL paths that were missing or failed
      outside_urls     links that point off-host or outside the root
      files_succeeded  local files corresponding to urls_succeeded
      orphans          local files never reached (see check_orphans)
    '''

    def __init__(self, rootdir, starturl):
        '''rootdir is the local directory (or a file inside it) that the
        web server is serving; starturl is the URL where crawling starts,
        e.g. http://localhost/site/ or http://localhost/site/index.html
        '''
        self.debug = False
        self.starturl = starturl

        self.rootdir = os.path.normpath(rootdir)
        if not os.path.isdir(self.rootdir):
            # It's not a directory, so crawl the directory containing it.
            # A bare filename has an empty dirname; fall back to the
            # current directory rather than ending up with "/".
            self.rootdir = os.path.dirname(self.rootdir) or os.curdir

        # XXX This next bit isn't platform-agnostic:
        if not self.rootdir.endswith('/'):
            self.rootdir += '/'

        # Now we need to get the true root url. The starturl may have
        # something like /index.html appended to it; we need something
        # we can prepend to paths.

        # Extract any path information from the root url:
        parsed = urlparse.urlparse(starturl)
        self.scheme = parsed.scheme
        self.host = parsed.netloc
        self.rooturlpath = posixpath.normpath(parsed.path)
        dirpart, basepart = posixpath.split(self.rooturlpath)
        # normpath strips trailing slashes, so basepart is only empty
        # when the path is the server root; split again to be safe.
        if not basepart:
            dirpart, basepart = posixpath.split(dirpart)

        # Now basepart is the last part of the path, which might
        # be a directory name on the server or it might be index.*
        # Compare it to the last part of self.rootdir, which is
        # guaranteed to be a directory.
        # But we have to split it twice, because self.rootdir ends in /
        # so the first split will return '' as the basename.
        # NOTE(review): this assumes the URL directory has the same name
        # as the local directory; an aliased URL such as /~user/ would
        # presumably be mistaken for a file name here -- confirm.
        lastdir = posixpath.basename(posixpath.dirname(self.rootdir))
        if basepart != lastdir:
            self.rooturlpath = posixpath.dirname(self.rooturlpath)

        if not self.rooturlpath.endswith('/'):
            self.rooturlpath += '/'

        # Now we're confident self.rooturlpath is the base directory.
        # Add the scheme and host back on.
        self.rooturl = urlparse.urlunsplit((self.scheme, self.host,
                                            self.rooturlpath, '', ''))
        if not self.rooturl.endswith('/'):
            self.rooturl += '/'

        print("rootdir: %s" % self.rootdir)
        print("rooturl: %s" % self.rooturl)
        print("rooturlpath: %s" % self.rooturlpath)
        print("scheme: %s" % self.scheme)
        print("host: %s" % self.host)
        print("")

        self.urls_to_check = [ self.rooturl ]
        self.urls_succeeded = []
        self.urls_failed = []
        self.outside_urls = []
        self.files_succeeded = []
        # Filled in by check_orphans(); defined here so print_summary()
        # works even if check_orphans() was never called.
        self.orphans = []

        # Eventually, the list of excludes should be a commandline argument.
        # For now, let's just make sure all the .git objects aren't orphaned,
        # nor web stats or archived files.
        self.excludes = [ ".git", "stats", "0-pre2011", "0-calendars" ]

        # Files that aren't explicitly referenced by the website,
        # but might be needed for other purposes.
        self.nonorphans = [ "favicon.ico", "robots.txt", ".htaccess" ]

    def spide(self):
        '''Check all urls in urls_to_check, which has new urls
           being added to it during the spidering process.
        '''
        self.check_url(self.starturl)
        while self.urls_to_check:
            self.check_url(self.urls_to_check.pop())
        print("Done spiding")

    def check_orphans(self):
        '''Assuming we already have self.files_succeeded,
           find all files in self.rootdir that weren't in succeeded
           and store them in self.orphans.
           Directories named in self.excludes are not descended into,
           and file names in self.nonorphans are never reported.
        '''
        # Normalize both sides and use a set: one O(1) lookup per file
        # instead of scanning the whole list for every file on disk.
        reached = set(os.path.normpath(f) for f in self.files_succeeded)

        self.orphans = []
        for root, dirs, files in os.walk(self.rootdir, topdown=True):
            # Pruning dirs in place stops os.walk from descending.
            dirs[:] = [d for d in dirs if d not in self.excludes]
            for filename in files:
                if filename in self.nonorphans:
                    continue
                path = os.path.normpath(os.path.join(root, filename))
                if path not in reached:
                    self.orphans.append(path)

    def print_summary(self):
        '''Print every result list followed by a one-line count summary.'''
        sections = (("URLs succeeded:", self.urls_succeeded),
                    ("Outside URLs:", self.outside_urls),
                    ("URLs failed:", self.urls_failed),
                    ("Orphans:", self.orphans))
        for title, entries in sections:
            print("")
            print(title)
            print('\n'.join(entries))

        print("")
        print("%d good links, %d external urls not checked, "
              "%d bad links, %d orphaned files."
              % (len(self.urls_succeeded), len(self.outside_urls),
                 len(self.urls_failed), len(self.orphans)))

    def get_local_for_url(self, urlpath):
        '''Get a local file path for a path parsed from an absolute URL.
           Returns None if urlpath isn't underneath self.rooturlpath.
        '''
        root = self.rooturlpath
        if urlpath.startswith(root):
            # Must be a real prefix: a substring match would map
            # /other/site/x into the tree when the root is /site/.
            relative = urlpath[len(root):]
        elif urlpath + '/' == root:
            # The root directory itself, written without its trailing slash.
            relative = ''
        else:
            return None
        return os.path.normpath(self.rootdir + relative)

    def make_absolute(self, url, relative_to):
        '''Make a URL absolute. If it's a relative path,
           then make it relative to relative_to
           which must be an absolute path on the webhost.
           Returns None for links to any other host (or non-web schemes
           like mailto:), which have no host matching ours.
        '''
        parsed = urlparse.urlparse(url)

        # A link names its own host if it has a scheme (http://host/...)
        # or is scheme-relative (//host/...).
        if parsed.scheme or parsed.netloc:
            # XXX If we ever extend this to check validity of
            # external URLs, this next condition is the one to change.
            if parsed.netloc != self.host:
                if self.debug:
                    print("Ignoring external link %s" % url)
                return None
            if parsed.scheme:
                return url

        # So there's no scheme. Add one.
        # The results of urlparse() aren't modifiable, but
        # if we turn them into a list we can modify them
        # then turn them back into a URL.
        parts = list(parsed)
        parts[0] = self.scheme
        parts[1] = self.host
        if not parsed.netloc and not parsed.path.startswith('/'):
            # It's relative to relative_to. Make it absolute, normalized.
            parts[2] = posixpath.normpath(posixpath.join(relative_to,
                                                         parsed.path))
        return urlparse.urlunparse(parts)

    def _queue_link(self, link, urldir):
        '''Schedule a link (href or src) found on a page in urldir,
           unless it's a fragment, external, or already known.
        '''
        if not link or link.startswith('#'):
            return

        absolute = self.make_absolute(link, urldir)
        if not absolute:
            # It's probably an external URL. Remember it, but skip it.
            if link not in self.outside_urls:
                self.outside_urls.append(link)
            return

        # urls_succeeded and urls_failed hold quoted paths, so compare
        # against the path in the same form check_url() stores it.
        linkpath = urllib.quote(
            urllib.unquote(urlparse.urlparse(absolute).path))
        if absolute in self.urls_to_check \
           or linkpath in self.urls_succeeded \
           or linkpath in self.urls_failed:
            return
        self.urls_to_check.append(absolute)

    def check_url(self, url):
        '''Check a URL. This should be an absolute URL on the server.
           Records the result in urls_succeeded / urls_failed /
           outside_urls, and queues any links found in an HTML page.
        '''
        # If we got this far, we'll be comparing links.
        # So we'll need to know the parsed parts of this url.
        urlparsed = urlparse.urlparse(url)
        if not urlparsed.scheme or not urlparsed.path.startswith('/'):
            print("EEK! Non-absolute URL passed to check_url, bailing")
            return

        # Links may arrive raw ("my file.html") or already encoded
        # ("my%20file.html"). Decode first so the local filename is right
        # and so that re-encoding never turns %20 into %2520.
        rawpath = urllib.unquote(urlparsed.path)
        # URL encode special characters like spaces:
        urlpath = urllib.quote(rawpath)

        # This check must come after the special char substitution.
        if urlpath in self.urls_succeeded or urlpath in self.urls_failed:
            return

        if self.debug:
            print("=============================== Checking %s" % url)

        # Whether this URL is a file or a directory decides what its
        # relative links are relative to, and the only way to know
        # is to check on the local filesystem.
        localpath = self.get_local_for_url(rawpath)
        if self.debug:
            print("=== local for %s is %s" % (urlpath, localpath))

        if not localpath:
            if self.debug:
                print("%s is outside original directory; skipping"
                      % urlparsed.path)
            if url not in self.outside_urls:
                self.outside_urls.append(url)
            return

        if not os.path.exists(localpath):
            if self.debug:
                print("Local path '%s' doesn't exist! %s" % (localpath, url))
            self.urls_failed.append(urlpath)
            return

        # If we substituted any special characters, rebuild the URL:
        if urlpath != urlparsed.path:
            parts = list(urlparsed)
            parts[2] = urlpath
            url = urlparse.urlunparse(parts)
            if self.debug:
                print("Substituted characters, recombined to %s" % url)

        if os.path.isdir(localpath):
            # The web server will substitute index.something,
            # so we'd better do that too or else the index file
            # will show up as an orphan.
            localdir = localpath
            localpath = None
            for ext in ( "php", "cgi", "html" ):
                indexfile = os.path.join(localdir, "index." + ext)
                if os.path.exists(indexfile):
                    localpath = indexfile
                    break
            if not localpath:
                print("Can't find an index file inside %s" % localdir)
                return
            urldir = urlpath
        else:
            localdir = os.path.dirname(localpath)
            urldir = posixpath.dirname(urlpath)

        if self.debug:
            print("localpath %s localdir %s" % (localpath, localdir))
            print("urldir: %s" % urldir)

        request = urllib2.Request(url)
        opener = urllib2.build_opener()
        # request.add_header("User-Agent", AGENT)

        # HTTPError is a subclass of URLError, so it must be caught first.
        try:
            response = opener.open(request)
        except urllib2.HTTPError as error:
            if error.code == 404:
                print("ERROR: %s -> %s" % (error, error.url))
            else:
                print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return
        except urllib2.URLError as error:
            print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return

        # Only HTML is read and parsed; anything else just counts as
        # reachable. Always close the connection when we're done with it.
        try:
            ctype = response.info().get('content-type') or ''
            content = None
            if ctype.startswith('text/html'):
                content = response.read().decode("utf-8", "replace")
        finally:
            response.close()

        self.urls_succeeded.append(urlpath)
        self.files_succeeded.append(localpath)

        if content is None:
            if self.debug:
                print("%s isn't HTML; skipping" % url)
            return

        soup = BeautifulSoup(content)

        for tag in soup.findAll('a', href=True):
            self._queue_link(tag.get('href'), urldir)

        for tag in soup.findAll('img', src=True):
            self._queue_link(tag.get('src'), urldir)
# Script entry point: weborphans local_dir url
if __name__ == '__main__':
    # Both the local directory and the URL serving it are required.
    if len(sys.argv) < 3:
        print("Usage: %s local_dir url" % os.path.basename(sys.argv[0]))
        sys.exit(1)

    spider = Spider(sys.argv[1], sys.argv[2])
    try:
        spider.spide()
        spider.check_orphans()
        spider.print_summary()
    except KeyboardInterrupt:
        # Let the user stop a long crawl with Ctrl-C without a traceback.
        print("Interrupt")