Skip to content

Commit 3465933

Browse files
ShaneQful authored and Thibaut committed
Added dojo to devdocs & ability to define headers in scraper requests
1 parent 59778f7 commit 3465933

7 files changed

Lines changed: 40 additions & 137 deletions

File tree

lib/docs/core/scrapers/url_scraper.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,17 @@ module Docs
22
class UrlScraper < Scraper
33
class << self
44
attr_accessor :params
5+
attr_accessor :headers
56

67
def inherited(subclass)
78
super
89
subclass.params = params.deep_dup
10+
subclass.headers = headers.deep_dup
911
end
1012
end
1113

1214
self.params = {}
15+
self.headers = { 'User-Agent' => 'devdocs.io' }
1316

1417
private
1518

@@ -22,7 +25,7 @@ def request_all(urls, &block)
2225
end
2326

2427
def request_options
25-
{ params: self.class.params }
28+
{ params: self.class.params, headers: self.class.headers }
2629
end
2730

2831
def process_response?(response)

lib/docs/filters/dojo/clean_html.rb

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,15 @@ module Docs
22
class Dojo
33
class CleanHtmlFilter < Filter
44
def call
5-
# TODO: Probably needs a little more cleanup but should do for the moment
65
css('script').remove
6+
7+
css('.version').remove
8+
9+
# Remove the links on the methods, which are broken (their text is kept)
10+
doc.css(".functionIcon a").each do |a|
11+
a.replace a.content
12+
end
13+
714
doc
815
end
916
end

lib/docs/filters/dojo/entries.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ def get_name
88
def get_type
99
name
1010
end
11-
# TODO:Figure out how to solve the internal links issue later
1211
end
1312
end
1413
end

lib/docs/scrapers/dojo.rb

Lines changed: 16 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,13 @@ class Dojo < UrlScraper
44
self.slug = 'dojo'
55
self.type = 'dojo'
66
self.version = '1.10'
7-
self.base_url = 'http://dojotoolkit.org/api/1.10/' #tree.json
7+
self.base_url = 'http://dojotoolkit.org/api/1.10/'
8+
9+
# This is a cut-down list of the actual paths taken from the tree.json API on the Dojo site.
10+
# Dojo uses JavaScript and XHR requests to let users browse its documentation, so it can't
11+
# be scraped by just following links from the base page. This list was generated with a little
12+
# bash and then cut down to remove much of the less-used documentation, e.g. kernel,
13+
# main, dnd and some others
814
self.initial_paths = %w(
915
dojo/AdapterRegistry
1016
dojo/aspect
@@ -17,65 +23,18 @@ class Dojo < UrlScraper
1723
dojo/_base/config.modulePaths
1824
dojo/_base/connect
1925
dojo/_base/declare
20-
dojo/_base/declare.__DeclareCreatedObject
2126
dojo/_base/Deferred
2227
dojo/_base/event
2328
dojo/_base/fx
2429
dojo/_base/html
2530
dojo/_base/json
2631
dojo/_base/kernel
27-
dojo/_base/kernel.back
28-
dojo/_base/kernel.cldr
29-
dojo/_base/kernel.colors
30-
dojo/_base/kernel.config
31-
dojo/_base/kernel.contentHandlers
32-
dojo/_base/kernel._contentHandlers
33-
dojo/_base/kernel.currency
34-
dojo/_base/kernel.data
35-
dojo/_base/kernel.date
36-
dojo/_base/kernel.dijit
37-
dojo/_base/kernel.dnd
38-
dojo/_base/kernel.doc
39-
dojo/_base/kernel.dojox
40-
dojo/_base/kernel.fx
41-
dojo/_base/kernel.gears
42-
dojo/_base/kernel.global
43-
dojo/_base/kernel._hasResource
44-
dojo/_base/kernel.html
45-
dojo/_base/kernel.i18n
46-
dojo/_base/kernel.io
47-
dojo/_base/kernel.__IoArgs
48-
dojo/_base/kernel.__IoCallbackArgs
49-
dojo/_base/kernel.__IoPublish
50-
dojo/_base/kernel.keys
51-
dojo/_base/kernel.mouseButtons
52-
dojo/_base/kernel._nodeDataCache
53-
dojo/_base/kernel.number
54-
dojo/_base/kernel.regexp
55-
dojo/_base/kernel.rpc
56-
dojo/_base/kernel.scopeMap
57-
dojo/_base/kernel.Stateful
58-
dojo/_base/kernel.store
59-
dojo/_base/kernel.string
60-
dojo/_base/kernel.tests
61-
dojo/_base/kernel.touch
62-
dojo/_base/kernel.version
63-
dojo/_base/kernel.window
64-
dojo/_base/kernel.__XhrArgs
6532
dojo/_base/lang
6633
dojo/_base/loader
6734
dojo/_base/NodeList
6835
dojo/_base/query
6936
dojo/_base/sniff
7037
dojo/_base/unload
71-
dojo/_base/url
72-
dojo/_base/url.authority
73-
dojo/_base/url.fragment
74-
dojo/_base/url.password
75-
dojo/_base/url.port
76-
dojo/_base/url.query
77-
dojo/_base/url.scheme
78-
dojo/_base/url.user
7938
dojo/_base/window
8039
dojo/_base/window.doc
8140
dojo/_base/window.global
@@ -88,8 +47,6 @@ class Dojo < UrlScraper
8847
dojo/colors
8948
dojo/cookie
9049
dojo/currency
91-
dojo/currency.__FormatOptions
92-
dojo/currency.__ParseOptions
9350
dojo/data/api/Identity
9451
dojo/data/api/Item
9552
dojo/data/api/Notification
@@ -104,33 +61,10 @@ class Dojo < UrlScraper
10461
dojo/data/util/sorter
10562
dojo/date
10663
dojo/date/locale
107-
dojo/date/locale.__FormatOptions
10864
dojo/date/stamp
10965
dojo/debounce
11066
dojo/Deferred
11167
dojo/DeferredList
112-
dojo/dnd/autoscroll
113-
dojo/dnd/autoscroll._validNodes
114-
dojo/dnd/autoscroll._validOverflow
115-
dojo/dnd/AutoSource
116-
dojo/dnd/Avatar
117-
dojo/dnd/common
118-
dojo/dnd/common._defaultCreatorNodes
119-
dojo/dnd/common._empty
120-
dojo/dnd/Container
121-
dojo/dnd/Container.__ContainerArgs
122-
dojo/dnd/Manager
123-
dojo/dnd/move
124-
dojo/dnd/Moveable
125-
dojo/dnd/Moveable.__MoveableArgs
126-
dojo/dnd/move.boxConstrainedMoveable
127-
dojo/dnd/move.constrainedMoveable
128-
dojo/dnd/move.parentConstrainedMoveable
129-
dojo/dnd/Mover
130-
dojo/dnd/Selector
131-
dojo/dnd/Source
132-
dojo/dnd/Target
133-
dojo/dnd/TimedMoveable
13468
dojo/dom
13569
dojo/dom-attr
13670
dojo/dom-class
@@ -146,7 +80,6 @@ class Dojo < UrlScraper
14680
dojo/errors/RequestError
14781
dojo/errors/RequestTimeoutError
14882
dojo/Evented
149-
dojo/_firebug/firebug
15083
dojo/fx
15184
dojo/fx/easing
15285
dojo/fx.easing
@@ -168,44 +101,6 @@ class Dojo < UrlScraper
168101
dojo/keys
169102
dojo/loadInit
170103
dojo/main
171-
dojo/main.back
172-
dojo/main.cldr
173-
dojo/main.colors
174-
dojo/main.config
175-
dojo/main.contentHandlers
176-
dojo/main._contentHandlers
177-
dojo/main.currency
178-
dojo/main.data
179-
dojo/main.date
180-
dojo/main.dijit
181-
dojo/main.dnd
182-
dojo/main.doc
183-
dojo/main.dojox
184-
dojo/main.fx
185-
dojo/main.gears
186-
dojo/main.global
187-
dojo/main._hasResource
188-
dojo/main.html
189-
dojo/main.i18n
190-
dojo/main.io
191-
dojo/main.__IoArgs
192-
dojo/main.__IoCallbackArgs
193-
dojo/main.__IoPublish
194-
dojo/main.keys
195-
dojo/main.mouseButtons
196-
dojo/main._nodeDataCache
197-
dojo/main.number
198-
dojo/main.regexp
199-
dojo/main.rpc
200-
dojo/main.scopeMap
201-
dojo/main.Stateful
202-
dojo/main.store
203-
dojo/main.string
204-
dojo/main.tests
205-
dojo/main.touch
206-
dojo/main.version
207-
dojo/main.window
208-
dojo/main.__XhrArgs
209104
dojo/mouse
210105
dojo/node
211106
dojo/NodeList
@@ -217,12 +112,6 @@ class Dojo < UrlScraper
217112
dojo/NodeList._nodeDataCache
218113
dojo/NodeList-traverse
219114
dojo/number
220-
dojo/number.__FormatAbsoluteOptions
221-
dojo/number.__FormatOptions
222-
dojo/number.__IntegerRegexpFlags
223-
dojo/number.__ParseOptions
224-
dojo/number.__RealNumberRegexpFlags
225-
dojo/number.__RegexpOptions
226115
dojo/on
227116
dojo/on/asyncEventListener
228117
dojo/on/debounce
@@ -237,32 +126,16 @@ class Dojo < UrlScraper
237126
dojo/ready
238127
dojo/regexp
239128
dojo/request
240-
dojo/request.__BaseOptions
241129
dojo/request/default
242130
dojo/request/handlers
243131
dojo/request/iframe
244-
dojo/request/iframe.__BaseOptions
245-
dojo/request/iframe.__MethodOptions
246-
dojo/request/iframe.__Options
247-
dojo/request.__MethodOptions
248132
dojo/request/node
249-
dojo/request/node.__BaseOptions
250-
dojo/request/node.__MethodOptions
251-
dojo/request/node.__Options
252133
dojo/request/notify
253-
dojo/request.__Options
254-
dojo/request.__Promise
255134
dojo/request/registry
256135
dojo/request/script
257-
dojo/request/script.__BaseOptions
258-
dojo/request/script.__MethodOptions
259-
dojo/request/script.__Options
260136
dojo/request/util
261137
dojo/request/watch
262138
dojo/request/xhr
263-
dojo/request/xhr.__BaseOptions
264-
dojo/request/xhr.__MethodOptions
265-
dojo/request/xhr.__Options
266139
dojo/require
267140
dojo/robot
268141
dojo/robot._runsemaphore
@@ -299,14 +172,23 @@ class Dojo < UrlScraper
299172
dojo/uacss
300173
dojo/when
301174
dojo/window)
175+
# Append the rest of the URL to each path
176+
self.initial_paths = self.initial_paths.map { |l| l + ".html?xhr=true" }
177+
# Dojo expects all requests to be XHRs; otherwise it redirects you back to the docs home page,
178+
# where it uses JS to call the backend based on the URL, so you get the appropriate documentation
179+
self.headers = { 'User-Agent' => 'devdocs.io' , 'X-Requested-With' => 'XMLHttpRequest' }
302180
self.links = {
303181
home: 'http://dojotoolkit.org',
304182
code: 'https://github.com/dojo/dojo'
305183
}
306184

307185
html_filters.push 'dojo/clean_html', 'dojo/entries'
308186

187+
# Don't use the default selector on XHR responses, as no body or HTML document exists
188+
options[:container] = ->(filter) { filter.root_page? ? '#content' : false }
189+
options[:follow_links] = false
309190
options[:skip_links] = true
191+
options[:only] = self.initial_paths
310192

311193
options[:attribution] = <<-HTML
312194
The Dojo Toolkit is Copyright &copy; 2005&ndash;2013 <br>

public/icons/docs/dojo/16.png

672 Bytes
Loading
1.66 KB
Loading

test/lib/docs/core/scrapers/url_scraper_test.rb

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,24 @@ class Scraper < Docs::UrlScraper
5858
result
5959
end
6060

61+
it "runs a Requester with .headers as :request_options" do
62+
stub(Scraper).headers { { testheader: true } }
63+
mock(Docs::Requester).run anything, satisfy { |options| options[:request_options][:headers][:testheader] }
64+
result
65+
end
66+
67+
it "runs a Requester with default .headers as :request_options" do
68+
mock(Docs::Requester).run anything, satisfy { |options| options[:request_options][:headers]["User-Agent"] }
69+
result
70+
end
71+
6172
it "runs a Requester with .params as :request_options" do
6273
stub(Scraper).params { { test: true } }
6374
mock(Docs::Requester).run anything, satisfy { |options| options[:request_options][:params][:test] }
6475
result
6576
end
6677

78+
6779
it "runs a Requester with the given block" do
6880
stub(Docs::Requester).run { |*args| @block = args.last }
6981
result

0 commit comments

Comments
 (0)