7 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9 # Lesser General Public License for more details. |
9 # Lesser General Public License for more details. |
10 # |
10 # |
11 # You should have received a copy of the GNU Lesser General Public |
11 # You should have received a copy of the GNU Lesser General Public |
12 # License along with this library; if not, write to the |
12 # License along with this library; if not, write to the |
13 # Free Software Foundation, Inc., |
13 # Free Software Foundation, Inc., |
14 # 59 Temple Place, Suite 330, |
14 # 59 Temple Place, Suite 330, |
15 # Boston, MA 02111-1307 USA |
15 # Boston, MA 02111-1307 USA |
16 |
16 |
17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber |
17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber |
18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko |
18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko |
19 |
19 |
22 >>> import urllib2 |
22 >>> import urllib2 |
23 >>> from keepalive import HTTPHandler |
23 >>> from keepalive import HTTPHandler |
24 >>> keepalive_handler = HTTPHandler() |
24 >>> keepalive_handler = HTTPHandler() |
25 >>> opener = urllib2.build_opener(keepalive_handler) |
25 >>> opener = urllib2.build_opener(keepalive_handler) |
26 >>> urllib2.install_opener(opener) |
26 >>> urllib2.install_opener(opener) |
27 >>> |
27 >>> |
28 >>> fo = urllib2.urlopen('http://www.python.org') |
28 >>> fo = urllib2.urlopen('http://www.python.org') |
29 |
29 |
30 If a connection to a given host is requested, and all of the existing |
30 If a connection to a given host is requested, and all of the existing |
31 connections are still in use, another connection will be opened. If |
31 connections are still in use, another connection will be opened. If |
32 the handler tries to use an existing connection but it fails in some |
32 the handler tries to use an existing connection but it fails in some |
# Optional logger-style object.  Code in this module calls DEBUG.info()
# and DEBUG.error() only when DEBUG is set to something true.
DEBUG = None

import sys

# 1 on interpreters older than 2.4, 0 otherwise.  When it is set, do_open
# passes non-200 responses to the opener's error machinery instead of
# returning them directly.
# NOTE(review): presumably urllib2 handles HTTP error codes by itself from
# 2.4 on -- confirm against the urllib2 sources.
HANDLE_ERRORS = int(sys.version_info < (2, 4))
114 |
114 |
115 class ConnectionManager: |
115 class ConnectionManager: |
116 """ |
116 """ |
117 The connection manager must be able to: |
117 The connection manager must be able to: |
118 * keep track of all existing connections |
118 * keep track of all existing connections |
119 """ |
119 """ |
149 self._lock.release() |
149 self._lock.release() |
150 |
150 |
def set_ready(self, connection, ready):
    """Update the ready flag of a connection that is being tracked.

    connection -- the connection object whose state changes
    ready      -- true if the connection may be reused, false if it is busy

    Connections that are not present in the ready map are ignored, so a
    late notification for an already-removed connection does not bring
    it back.
    """
    # Item assignment on a mapping never raises KeyError, so the previous
    # "try: ... except KeyError: pass" could not skip unknown connections;
    # it re-registered them instead.  Check membership explicitly.
    # NOTE(review): assumes _readymap is a plain dict -- it is created
    # outside this view, confirm.
    if connection in self._readymap:
        self._readymap[connection] = ready
154 |
154 |
155 def get_ready_conn(self, host): |
155 def get_ready_conn(self, host): |
156 conn = None |
156 conn = None |
157 self._lock.acquire() |
157 self._lock.acquire() |
158 try: |
158 try: |
159 if self._hostmap.has_key(host): |
159 if self._hostmap.has_key(host): |
173 return dict(self._hostmap) |
173 return dict(self._hostmap) |
174 |
174 |
175 class HTTPHandler(urllib2.HTTPHandler): |
175 class HTTPHandler(urllib2.HTTPHandler): |
def __init__(self):
    """Set up the handler with a fresh ConnectionManager of its own."""
    # NOTE(review): the urllib2.HTTPHandler initializer is not invoked here.
    manager = ConnectionManager()
    self._cm = manager
178 |
178 |
179 #### Connection Management |
179 #### Connection Management |
def open_connections(self):
    """Report the connected hosts together with the number of
    connections held to each one, as a list of (host, count) pairs:
    [('foo.com:80', 2), ('bar.org', 1)]"""
    summary = []
    for hostname, connections in self._cm.get_all().items():
        summary.append((hostname, len(connections)))
    return summary
187 host is the host:port spec, as in 'www.cnn.com:8080' as passed in. |
187 host is the host:port spec, as in 'www.cnn.com:8080' as passed in. |
188 no error occurs if there is no connection to that host.""" |
188 no error occurs if there is no connection to that host.""" |
189 for h in self._cm.get_all(host): |
189 for h in self._cm.get_all(host): |
190 self._cm.remove(h) |
190 self._cm.remove(h) |
191 h.close() |
191 h.close() |
192 |
192 |
def close_all(self):
    """Close every open connection and drop it from the manager.

    Each connection is first removed from the connection manager and
    then closed.
    """
    for conns in self._cm.get_all().values():
        # get_all() appears to return only a shallow copy of the host map
        # (dict(self._hostmap)), so each per-host list may be the very
        # list that remove() edits.  Walk a snapshot so that no
        # connection is skipped while the list shrinks under us.
        for h in list(conns):
            self._cm.remove(h)
            h.close()
199 |
199 |
200 def _request_closed(self, request, host, connection): |
200 def _request_closed(self, request, host, connection): |
201 """tells us that this request is now closed and the the |
201 """tells us that this request is now closed and the the |
202 connection is ready for another request""" |
202 connection is ready for another request""" |
203 self._cm.set_ready(connection, 1) |
203 self._cm.set_ready(connection, 1) |
204 |
204 |
205 def _remove_connection(self, host, connection, close=0): |
205 def _remove_connection(self, host, connection, close=0): |
206 if close: connection.close() |
206 if close: connection.close() |
207 self._cm.remove(connection) |
207 self._cm.remove(connection) |
208 |
208 |
209 #### Transaction Execution |
209 #### Transaction Execution |
def http_open(self, req):
    """Open req by delegating to do_open with this module's
    HTTPConnection class; returns whatever do_open returns."""
    connection_class = HTTPConnection
    return self.do_open(connection_class, req)
212 |
212 |
213 def do_open(self, http_class, req): |
213 def do_open(self, http_class, req): |
237 self._cm.add(host, h, 0) |
237 self._cm.add(host, h, 0) |
238 self._start_transaction(h, req) |
238 self._start_transaction(h, req) |
239 r = h.getresponse() |
239 r = h.getresponse() |
240 except (socket.error, httplib.HTTPException), err: |
240 except (socket.error, httplib.HTTPException), err: |
241 raise urllib2.URLError(err) |
241 raise urllib2.URLError(err) |
242 |
242 |
243 # if not a persistent connection, don't try to reuse it |
243 # if not a persistent connection, don't try to reuse it |
244 if r.will_close: self._cm.remove(h) |
244 if r.will_close: self._cm.remove(h) |
245 |
245 |
246 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason) |
246 if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason) |
247 r._handler = self |
247 r._handler = self |
249 r._url = req.get_full_url() |
249 r._url = req.get_full_url() |
250 r._connection = h |
250 r._connection = h |
251 r.code = r.status |
251 r.code = r.status |
252 r.headers = r.msg |
252 r.headers = r.msg |
253 r.msg = r.reason |
253 r.msg = r.reason |
254 |
254 |
255 if r.status == 200 or not HANDLE_ERRORS: |
255 if r.status == 200 or not HANDLE_ERRORS: |
256 return r |
256 return r |
257 else: |
257 else: |
258 return self.parent.error('http', req, r, |
258 return self.parent.error('http', req, r, |
259 r.status, r.msg, r.headers) |
259 r.status, r.msg, r.headers) |
285 if DEBUG: DEBUG.error("unexpected exception - closing " + \ |
285 if DEBUG: DEBUG.error("unexpected exception - closing " + \ |
286 "connection to %s (%d)", host, id(h)) |
286 "connection to %s (%d)", host, id(h)) |
287 self._cm.remove(h) |
287 self._cm.remove(h) |
288 h.close() |
288 h.close() |
289 raise |
289 raise |
290 |
290 |
291 if r is None or r.version == 9: |
291 if r is None or r.version == 9: |
292 # httplib falls back to assuming HTTP 0.9 if it gets a |
292 # httplib falls back to assuming HTTP 0.9 if it gets a |
293 # bad header back. This is most likely to happen if |
293 # bad header back. This is most likely to happen if |
294 # the socket has been closed by the server since we |
294 # the socket has been closed by the server since we |
295 # last used the connection. |
295 # last used the connection. |
340 |
340 |
341 # the read method wraps the original to accommodate buffering, |
341 # the read method wraps the original to accommodate buffering, |
342 # although read() never adds to the buffer. |
342 # although read() never adds to the buffer. |
343 # Both readline and readlines have been stolen with almost no |
343 # Both readline and readlines have been stolen with almost no |
344 # modification from socket.py |
344 # modification from socket.py |
345 |
345 |
346 |
346 |
347 def __init__(self, sock, debuglevel=0, strict=0, method=None): |
347 def __init__(self, sock, debuglevel=0, strict=0, method=None): |
348 if method: # the httplib in python 2.3 uses the method arg |
348 if method: # the httplib in python 2.3 uses the method arg |
349 httplib.HTTPResponse.__init__(self, sock, debuglevel, method) |
349 httplib.HTTPResponse.__init__(self, sock, debuglevel, method) |
350 else: # 2.2 doesn't |
350 else: # 2.2 doesn't |
423 |
423 |
424 |
424 |
class HTTPConnection(httplib.HTTPConnection):
    """httplib.HTTPConnection that builds its responses with this
    module's modified HTTPResponse class."""

    response_class = HTTPResponse
428 |
428 |
429 ######################################################################### |
429 ######################################################################### |
430 ##### TEST FUNCTIONS |
430 ##### TEST FUNCTIONS |
431 ######################################################################### |
431 ######################################################################### |
432 |
432 |
433 def error_handler(url): |
433 def error_handler(url): |
457 keepalive_handler.close_all() |
457 keepalive_handler.close_all() |
458 |
458 |
459 def continuity(url): |
459 def continuity(url): |
460 import md5 |
460 import md5 |
461 format = '%25s: %s' |
461 format = '%25s: %s' |
462 |
462 |
463 # first fetch the file with the normal http handler |
463 # first fetch the file with the normal http handler |
464 opener = urllib2.build_opener() |
464 opener = urllib2.build_opener() |
465 urllib2.install_opener(opener) |
465 urllib2.install_opener(opener) |
466 fo = urllib2.urlopen(url) |
466 fo = urllib2.urlopen(url) |
467 foo = fo.read() |
467 foo = fo.read() |
504 opener = urllib2.build_opener(HTTPHandler()) |
504 opener = urllib2.build_opener(HTTPHandler()) |
505 urllib2.install_opener(opener) |
505 urllib2.install_opener(opener) |
506 t2 = fetch(N, url) |
506 t2 = fetch(N, url) |
507 print ' TIME: %.3f s' % t2 |
507 print ' TIME: %.3f s' % t2 |
508 print ' improvement factor: %.2f' % (t1/t2, ) |
508 print ' improvement factor: %.2f' % (t1/t2, ) |
509 |
509 |
510 def fetch(N, url, delay=0): |
510 def fetch(N, url, delay=0): |
511 import time |
511 import time |
512 lens = [] |
512 lens = [] |
513 starttime = time.time() |
513 starttime = time.time() |
514 for i in range(N): |
514 for i in range(N): |