Skip to content

Commit 9397c11

Browse files
committed
Merge branch 'master' of github.com:stewartmckee/cobweb
2 parents 0de00db + f9bbce2 commit 9397c11

File tree

3 files changed

+24
-8
lines changed

3 files changed

+24
-8
lines changed

README.textile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ Creates a new crawler object based on a base_url
9595

9696
** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
9797
** :redirect_limit - sets the maximum number of consecutive redirects that will be followed (Default: 10)
98-
** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
98+
** :processing_queue - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
99+
** :crawl_finished_queue - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
99100
** :debug - enables debug output (Default: false)
100101
** :quiet - hides default output (Default: false)
101102
** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
@@ -110,9 +111,12 @@ Creates a new crawler object based on a base_url
110111
** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
111112
** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
112113
** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
113-
** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
114+
** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
114115
** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
115116
** :raise_exceptions - defaults to handling exceptions with debug output; setting this to true will raise exceptions in your app
117+
** :use_encoding_safe_process_job - Base64-encodes the body when storing the job in the queue; set to true when you expect non-ASCII content (Default: false)
118+
** :proxy_addr - hostname of a proxy to use for crawling, e.g. 'myproxy.example.net' (default: nil)
119+
** :proxy_port - port number of the proxy (default: nil)
116120

117121

118122
bc. crawler = Cobweb.new(:follow_redirects => false)

lib/cobweb.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ def initialize(options = {})
6060
default_valid_mime_types_to ["*/*"]
6161
default_raise_exceptions_to false
6262
default_store_inbound_links_to false
63+
default_proxy_addr_to nil
64+
default_proxy_port_to nil
6365

6466
end
6567

@@ -154,7 +156,7 @@ def get(url, options = @options)
154156
# retrieve data
155157
#unless @http && @http.address == uri.host && @http.port == uri.inferred_port
156158
puts "Creating connection to #{uri.host}..." if @options[:debug]
157-
@http = Net::HTTP.new(uri.host, uri.inferred_port)
159+
@http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
158160
#end
159161
if uri.scheme == "https"
160162
@http.use_ssl = true
@@ -327,7 +329,7 @@ def head(url, options = @options)
327329
# retrieve data
328330
unless @http && @http.address == uri.host && @http.port == uri.inferred_port
329331
puts "Creating connection to #{uri.host}..." unless @options[:quiet]
330-
@http = Net::HTTP.new(uri.host, uri.inferred_port)
332+
@http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
331333
end
332334
if uri.scheme == "https"
333335
@http.use_ssl = true

spec/cobweb/cobweb_spec.rb

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
options[:timeout].should == 10
3232
options[:redis_options].should == {}
3333
options[:internal_urls].should == []
34+
options[:proxy_addr].should be_nil
35+
options[:proxy_port].should be_nil
3436

3537
end
3638

@@ -177,26 +179,34 @@
177179
end
178180
describe "location setting" do
179181
it "Get should strip fragments" do
180-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
182+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
181183
Net::HTTP::Get.should_receive(:new).with("/", @default_options)
182184
@cobweb.get("http://www.google.com/#ignore")
183185
end
184186
it "head should strip fragments" do
185-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
187+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
186188
Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
187189
@cobweb.head("http://www.google.com/#ignore")
188190
end
189191
it "get should not strip path" do
190-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
192+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
191193
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", @default_options)
192194
@cobweb.get("http://www.google.com/path/to/stuff#ignore")
193195
end
194196
it "get should not strip query string" do
195-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
197+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
196198
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", @default_options)
197199
@cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
198200
end
199201
end
202+
describe "with proxy" do
203+
it "provides proxy parameters to Net::HTTP" do
204+
cobweb = Cobweb.new proxy_addr: 'proxy.example.com', proxy_port: 1234
205+
Net::HTTP.should_receive(:new).with("www.google.com", 80, "proxy.example.com", 1234)
206+
207+
cobweb.get("http://www.google.com/")
208+
end
209+
end
200210

201211
end
202212
end

0 commit comments

Comments
 (0)