Skip to content

Commit 018628e

Browse files
committed
Add two-pass redirection rewriter
... to avoid having to maintain huge lists of redirects. This works by doing a first pass to detect which internal URL is redirected where, before doing a second (normal) pass that rewrites all these URLs (links) with their final destination. There's a bit of monkey-patching I'm not proud of, but this works(tm).
1 parent 87763ac commit 018628e

5 files changed

Lines changed: 128 additions & 12 deletions

File tree

lib/docs/core/scraper.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def options
100100
(options[:only] ||= []).concat initial_paths + (root_path? ? [root_path] : ['', '/'])
101101
end
102102

103+
options.merge!(additional_options) if respond_to?(:additional_options, true)
103104
options.freeze
104105
end
105106
end

lib/docs/core/scrapers/url_scraper.rb

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,66 @@ def request_options
2828
# Decides whether a fetched response should be processed: it must be
# successful, be HTML, and its effective URL must be contained in base_url.
# Evaluation stops at the first check that fails, and that check's value is
# what gets returned.
def process_response?(response)
  acceptable = response.success?
  acceptable &&= response.html?
  acceptable &&= base_url.contains?(response.effective_url)
  acceptable
end
31+
32+
# Optional two-pass behavior for scrapers. When the including class sets
# fix_redirections, store_pages first crawls with a reduced filter list to
# learn which paths end up at a different effective path, then runs the
# regular build while that map is available through the class-level
# redirections reader.
module FixRedirectionsBehavior
  # Adds the class-level API to whichever class includes this module.
  def self.included(base)
    base.extend(ClassMethods)
  end

  module ClassMethods
    # Truthy to enable the extra redirection-detection pass.
    attr_accessor :fix_redirections
    # Map filled in by the first pass; reset to nil once store_pages is done.
    attr_reader :redirections

    # Runs the inherited store_pages, wrapped in the redirection pass when
    # fix_redirections is set. Returns whatever the inherited method returns.
    def store_pages(store)
      if fix_redirections
        instrument 'info.doc', msg: 'Fetching redirections...'
        with_redirections do
          instrument 'info.doc', msg: 'Building pages...'
          super
        end
      else
        super
      end
    end

    private

    # Collects redirections using a fresh instance, yields, and always
    # clears the map afterwards (even if the block raises).
    def with_redirections
      @redirections = new.fetch_redirections
      yield
    ensure
      @redirections = nil
    end
  end

  # First pass: builds every page with only the 'container',
  # 'normalize_urls' and 'internal_urls' filters and records each page whose
  # effective path differs from the requested one.
  #
  # Returns a Hash of downcased requested path => effective path.
  def fetch_redirections
    {}.tap do |redirections|
      with_filters 'container', 'normalize_urls', 'internal_urls' do
        build_pages do |page|
          requested = page[:response_path]
          effective = page[:response_effective_path]
          redirections[requested.downcase] = effective unless effective == requested
        end
      end
    end
  end

  private

  # Extends the inherited result with the requested and effective paths so
  # fetch_redirections can compare them.
  def process_response(response)
    data = super
    data.merge!(response_effective_path: response.effective_path, response_path: response.path)
  end

  # Extra options exposing the class-level redirection map (nil outside of
  # a store_pages run with fix_redirections enabled).
  def additional_options
    { redirections: self.class.redirections }
  end

  # Temporarily swaps the pipeline's filter list for the given filters by
  # overwriting its @filters instance variable, yields, then drops the
  # memoized pipeline so that a later use does not see the reduced list.
  def with_filters(*filters)
    stack = FilterStack.new.tap { |s| s.push(*filters) }
    pipeline.instance_variable_set(:@filters, stack.to_a.freeze)
    yield
  ensure
    @pipeline = nil
  end
end
90+
91+
include FixRedirectionsBehavior
3192
end
3293
end

lib/docs/filters/core/normalize_urls.rb

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,12 @@ def update_attribute(tag, attribute)
1919

2020
# Converts a link target into its final, absolute form.
#
# The string is made absolute, then fix_url is applied repeatedly so that
# chained rules are all honored. The loop ends when fix_url returns a falsy
# value, or when it hands back a URL that was already seen during this
# call; without that second condition a rule set containing a fixed point
# or a cycle (e.g. '/a' => '/b' and '/b' => '/a') would never terminate.
#
# Returns the resulting URL as a String, or '#' when
# URI::InvalidURIError is raised along the way.
def normalize_url(str)
  url = to_absolute_url(str)
  # String forms of every URL produced so far, used for cycle detection.
  seen = { url.to_s => true }

  while (new_url = fix_url(url))
    url = new_url
    key = url.to_s
    break if seen.key?(key)
    seen[key] = true
  end

  url.to_s
rescue URI::InvalidURIError
  '#'
end
@@ -31,18 +35,40 @@ def to_absolute_url(str)
3135
end
3236

3337
# Applies the first rewriting rule that actually changes the given URL.
#
# Rules are tried in this order, each only when its context entry is set:
#   1. context[:redirections]  - Hash of downcased path => new path
#   2. context[:replace_paths] - Hash of subpath => replacement subpath
#   3. context[:replace_urls]  - Hash of full URL string => replacement
#   4. context[:fix_urls]      - callable receiving the URL string
#
# url may be a URL object or a String.
#
# Returns the rewritten URL (a URL object for rules 1-2, the rule's own
# value for rules 3-4), or nil when nothing changed. normalize_url calls
# this in a loop until it gets nil, so a rule that leaves the URL untouched
# must not be reported as a rewrite: doing so would loop forever (e.g. a
# redirection map holding '/foo' => '/foo' because of a case-only redirect).
def fix_url(url)
  if context[:redirections]
    url = URL.parse(url)
    target = context[:redirections][url.path.downcase]

    if target && target != url.path
      url.path = target
      return url
    end
  end

  if context[:replace_paths]
    url = URL.parse(url)
    path = subpath_to(url)

    if context[:replace_paths].key?(path)
      # The key is a literal path, not a pattern, so it is escaped; the block
      # form keeps backslashes in the replacement from being interpreted.
      new_path = url.path.sub(/#{Regexp.escape(path)}\z/) { context[:replace_paths][path] }

      if new_path != url.path
        url.path = new_path
        return url
      end
    end
  end

  if context[:replace_urls]
    url = url.to_s

    if context[:replace_urls].key?(url)
      replacement = context[:replace_urls][url]
      return replacement unless replacement == url
    end
  end

  if context[:fix_urls]
    url = url.to_s
    # The callable may edit the string in place, so compare against a copy.
    orig_url = url.dup
    new_url = context[:fix_urls].call(url)
    return new_url if new_url != orig_url
  end

  nil
end
4773
end
4874
end

lib/docs/subscribers/doc_subscriber.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ def db(event)
1616
log_diff before.keys, after.keys
1717
end
1818

19+
# Logs the :msg entry of the event's payload.
def info(event)
  message = event.payload[:msg]
  log(message)
end
22+
1923
private
2024

2125
def parse_payload(event)

test/lib/docs/filters/core/normalize_urls_test.rb

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ class NormalizeUrlsFilterTest < MiniTest::Spec
116116
end
117117

118118
it "calls the block with each absolute url" do
119-
context[:fix_urls] = ->(arg) { (@args ||= []).push(arg) }
119+
context[:fix_urls] = ->(arg) { (@args ||= []).push(arg); nil }
120120
@body += link_to '/path?#'
121121
filter.call
122122
assert_equal ['http://example.com/path?#'] * 2, @args
@@ -139,4 +139,28 @@ class NormalizeUrlsFilterTest < MiniTest::Spec
139139
refute @called
140140
end
141141
end
142+
143+
# Specs for rewriting links through the context[:redirections] map.
context "when context[:redirections] is a hash" do
  before do
    @body = link_to 'http://example.com/path?query#frag'
  end

  it "replaces the path of matching urls, case-insensitive" do
    # Only the first link's path equals a key once downcased; the second
    # merely starts with it and must come out unchanged.
    context[:redirections] = { '/path' => '/fixed' }
    @body = link_to('http://example.com/PATH?query#frag') + link_to('http://example.com/path/two')
    expected = link_to('http://example.com/fixed?query#frag') + link_to('http://example.com/path/two')
    assert_equal expected, filter_output_string
  end

  it "does a multi pass with context[:fix_urls]" do
    # The redirection yields '/Fixed', which the fix_urls callback then has
    # to pick up on a later pass for the expected URL to be produced.
    context[:redirections] = { '/path' => '/Fixed' }
    context[:fix_urls] = lambda do |url|
      url.sub! 'example.com', 'example.org'
      url.sub! '/Fixed', '/fixed'
      url
    end
    @body = link_to('http://example.com/path')
    assert_equal link_to('http://example.org/fixed'), filter_output_string
  end
end
142166
end

0 commit comments

Comments
 (0)