Diffstat (limited to 'scripts/lib/mic/3rdparty/pykickstart/urlgrabber/grabber.py')
 scripts/lib/mic/3rdparty/pykickstart/urlgrabber/grabber.py | 1477 +++++++++++++++
 1 file changed, 1477 insertions(+), 0 deletions(-)
diff --git a/scripts/lib/mic/3rdparty/pykickstart/urlgrabber/grabber.py b/scripts/lib/mic/3rdparty/pykickstart/urlgrabber/grabber.py
new file mode 100644
index 0000000000..fefdab36f6
--- /dev/null
+++ b/scripts/lib/mic/3rdparty/pykickstart/urlgrabber/grabber.py
@@ -0,0 +1,1477 @@
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
#      Free Software Foundation, Inc.,
#      59 Temple Place, Suite 330,
#      Boston, MA 02111-1307 USA

# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
20"""A high-level cross-protocol url-grabber.
21
22GENERAL ARGUMENTS (kwargs)
23
24 Where possible, the module-level default is indicated, and legal
25 values are provided.
26
27 copy_local = 0 [0|1]
28
29 ignored except for file:// urls, in which case it specifies
30 whether urlgrab should still make a copy of the file, or simply
31 point to the existing copy. The module level default for this
32 option is 0.
33
34 close_connection = 0 [0|1]
35
36 tells URLGrabber to close the connection after a file has been
37 transfered. This is ignored unless the download happens with the
38 http keepalive handler (keepalive=1). Otherwise, the connection
39 is left open for further use. The module level default for this
40 option is 0 (keepalive connections will not be closed).
41
42 keepalive = 1 [0|1]
43
44 specifies whether keepalive should be used for HTTP/1.1 servers
45 that support it. The module level default for this option is 1
46 (keepalive is enabled).
47
48 progress_obj = None
49
50 a class instance that supports the following methods:
51 po.start(filename, url, basename, length, text)
52 # length will be None if unknown
53 po.update(read) # read == bytes read so far
54 po.end()
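
    For example, a minimal progress object might look like this (a
    sketch; the class name is illustrative only):

      class SimpleProgress:
          def start(self, filename, url, basename, length, text):
              self.basename = basename
          def update(self, read):
              print '%s: %s bytes read' % (self.basename, read)
          def end(self, read):
              print '%s: done (%s bytes)' % (self.basename, read)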

  text = None

    specifies alternative text to display at the beginning of the
    progress bar line. If not given, the basename of the file is used.

  throttle = 1.0

    a number - if it's an int, it's the bytes/second throttle limit.
    If it's a float, it is first multiplied by bandwidth. If throttle
    == 0, throttling is disabled. If None, the module-level default
    (which can be set on default_grabber.throttle) is used. See
    BANDWIDTH THROTTLING for more information.

  timeout = None

    a positive float expressing the number of seconds to wait for socket
    operations. If the value is None or 0.0, socket operations will block
    forever. Setting this option causes urlgrabber to call the settimeout
    method on the Socket object used for the request. See the Python
    documentation on settimeout for more information.
    http://www.python.org/doc/current/lib/socket-objects.html

  bandwidth = 0

    the nominal max bandwidth in bytes/second. If throttle is a float
    and bandwidth == 0, throttling is disabled. If None, the
    module-level default (which can be set on
    default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
    more information.

  range = None

    a tuple of the form (first_byte, last_byte) describing a byte
    range to retrieve. Either or both of the values may be set to
    None. If first_byte is None, byte offset 0 is assumed. If
    last_byte is None, the last byte available is assumed. Note that
    the range specification is python-like in that (0,10) will yield
    the first 10 bytes of the file.

    If set to None, no range will be used.
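
    For example, fetching only the first kilobyte of a file might
    look like this (a sketch; the url is hypothetical):

      urlread('http://example.com/file.txt', range=(0, 1024))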

  reget = None [None|'simple'|'check_timestamp']

    whether to attempt to reget a partially-downloaded file. Reget
    only applies to .urlgrab and (obviously) only if there is a
    partially downloaded file. Reget has two modes:

      'simple' -- the local file will always be trusted. If there
        are 100 bytes in the local file, then the download will always
        begin 100 bytes into the requested file.

      'check_timestamp' -- the timestamp of the server file will be
        compared to the timestamp of the local file. ONLY if the
        local file is newer than or the same age as the server file
        will reget be used. If the server file is newer, or the
        timestamp is not returned, the entire file will be fetched.

    NOTE: urlgrabber can do very little to verify that the partial
    file on disk is identical to the beginning of the remote file.
    You may want to either employ a custom "checkfunc" or simply avoid
    using reget in situations where corruption is a concern.
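
    For example, resuming an interrupted download while trusting
    whatever is already on disk might look like this (a sketch; the
    url and filename are hypothetical):

      urlgrab('http://example.com/big.iso', 'big.iso', reget='simple')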

  user_agent = 'urlgrabber/VERSION'

    a string, usually of the form 'AGENT/VERSION' that is provided to
    HTTP servers in the User-agent header. The module level default
    for this option is "urlgrabber/VERSION".

  http_headers = None

    a tuple of 2-tuples, each containing a header and value. These
    will be used for http and https requests only. For example, you
    can do
      http_headers = (('Pragma', 'no-cache'),)

  ftp_headers = None

    this is just like http_headers, but will be used for ftp requests.

  proxies = None

    a dictionary that maps protocol schemes to proxy hosts. For
    example, to use a proxy server on host "foo" port 3128 for http
    and https URLs:
      proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
    note that proxy authentication information may be provided using
    normal URL constructs:
      proxies={ 'http' : 'http://user:password@foo:3128' }
    Lastly, if proxies is None, the default environment settings will
    be used.

  prefix = None

    a url prefix that will be prepended to all requested urls. For
    example:
      g = URLGrabber(prefix='http://foo.com/mirror/')
      g.urlgrab('some/file.txt')
      ## this will fetch 'http://foo.com/mirror/some/file.txt'
    This option exists primarily to allow identical behavior to
    MirrorGroup (and derived) instances. Note: a '/' will be inserted
    if necessary, so you cannot specify a prefix that ends with a
    partial file or directory name.

  opener = None

    Overrides the default urllib2.OpenerDirector provided to urllib2
    when making requests. This option exists so that the urllib2
    handler chain may be customized. Note that the range, reget,
    proxy, and keepalive features require that custom handlers be
    provided to urllib2 in order to function properly. If an opener
    option is provided, no attempt is made by urlgrabber to ensure
    chain integrity. You are responsible for ensuring that any
    extension handlers are present if said features are required.
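
    For example, supplying a hand-built handler chain might look like
    this (a sketch; the proxy address is hypothetical, and range,
    reget and keepalive support is lost unless you re-add the
    extension handlers yourself):

      opener = urllib2.build_opener(
          urllib2.ProxyHandler({'http' : 'http://proxy.example.com:3128'}))
      urlgrab(url, opener=opener)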

  data = None

    Only relevant for the HTTP family (and ignored for other
    protocols), this allows HTTP POSTs. When the data kwarg is
    present (and not None), an HTTP request will automatically become
    a POST rather than GET. This is done by direct passthrough to
    urllib2. If you use this, you may also want to set the
    'Content-length' and 'Content-type' headers with the http_headers
    option. Note that python 2.2 handles the case of these headers
    badly, and if you do not use the proper case (shown here), your
    values will be overridden with the defaults.


RETRY RELATED ARGUMENTS

  retry = None

    the number of times to retry the grab before bailing. If this is
    zero, it will retry forever. This was intentional... really, it
    was :). If this value is not supplied, or is supplied but is None,
    retrying does not occur.

  retrycodes = [-1,2,4,5,6,7]

    a sequence of errorcodes (values of e.errno) for which it should
    retry. See the doc on URLGrabError for more details on this. You
    might consider modifying a copy of the default codes rather than
    building yours from scratch so that if the list is extended in the
    future (or one code is split into two) you can still enjoy the
    benefits of the default list. You can do that with something like
    this:

      retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
      if 12 not in retrycodes:
          retrycodes.append(12)

  checkfunc = None

    a function to do additional checks. This defaults to None, which
    means no additional checking. The function should simply return
    on a successful check. It should raise URLGrabError on an
    unsuccessful check. Raising of any other exception will be
    considered immediate failure and no retries will occur.

    If it raises URLGrabError, the error code will determine the retry
    behavior. Negative error numbers are reserved for use by these
    passed in functions, so you can use many negative numbers for
    different types of failure. By default, -1 results in a retry,
    but this can be customized with retrycodes.

    If you simply pass in a function, it will be given exactly one
    argument: a CallbackObject instance with the .url attribute
    defined and either .filename (for urlgrab) or .data (for urlread).
    For urlgrab, .filename is the name of the local file. For
    urlread, .data is the actual string data. If you need other
    arguments passed to the callback (program state of some sort), you
    can do so like this:

      checkfunc=(function, ('arg1', 2), {'kwarg': 3})

    if the downloaded file has filename /tmp/stuff, then this will
    result in this call (for urlgrab):

      function(obj, 'arg1', 2, kwarg=3)
      # obj.filename = '/tmp/stuff'
      # obj.url = 'http://foo.com/stuff'

    NOTE: both the "args" tuple and "kwargs" dict must be present if
    you use this syntax, but either (or both) can be empty.
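
    For example, a checkfunc that rejects suspiciously small files and
    triggers a retry might look like this (a sketch; the 1000-byte
    threshold is arbitrary):

      def check_size(obj):
          if os.path.getsize(obj.filename) < 1000:
              raise URLGrabError(-1, 'file too small, retrying')

      urlgrab(url, checkfunc=check_size, retry=3)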

  failure_callback = None

    The callback that gets called during retries when an attempt to
    fetch a file fails. The syntax for specifying the callback is
    identical to checkfunc, except for the attributes defined in the
    CallbackObject instance. The attributes for failure_callback are:

      exception = the raised exception
      url       = the url we're trying to fetch
      tries     = the number of tries so far (including this one)
      retry     = the value of the retry option

    The callback is present primarily to inform the calling program of
    the failure, but if it raises an exception (including the one it's
    passed) that exception will NOT be caught and will therefore cause
    future retries to be aborted.

    The callback is called for EVERY failure, including the last one.
    On the last try, the callback can raise an alternate exception,
    but it cannot (without severe trickiness) prevent the exception
    from being raised.
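
    For example, a failure_callback that just reports each failed
    attempt might look like this (a sketch):

      def report_failure(obj):
          print 'attempt %s/%s failed: %s' % \
                (obj.tries, obj.retry, obj.exception)

      urlgrab(url, retry=3, failure_callback=report_failure)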

  interrupt_callback = None

    This callback is called if KeyboardInterrupt is received at any
    point in the transfer. Basically, this callback can have three
    impacts on the fetch process based on the way it exits:

      1) raise no exception: the current fetch will be aborted, but
         any further retries will still take place

      2) raise a URLGrabError: if you're using a MirrorGroup, then
         this will prompt a failover to the next mirror according to
         the behavior of the MirrorGroup subclass. It is recommended
         that you raise URLGrabError with code 15, 'user abort'. If
         you are NOT using a MirrorGroup subclass, then this is the
         same as (3).

      3) raise some other exception (such as KeyboardInterrupt), which
         will not be caught at either the grabber or mirror levels.
         That is, it will be raised up all the way to the caller.

    This callback is very similar to failure_callback. They are
    passed the same arguments, so you could use the same function for
    both.
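
    For example, to turn Ctrl-C into a clean 'user abort' that a
    MirrorGroup will treat as a mirror failover (a sketch):

      def user_abort(obj):
          raise URLGrabError(15, 'user abort')

      urlgrab(url, interrupt_callback=user_abort)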

  urlparser = URLParser()

    The URLParser class handles pre-processing of URLs, including
    auth-handling for user/pass encoded in http urls, file handling
    (that is, filenames not sent as a URL), and URL quoting. If you
    want to override any of this behavior, you can pass in a
    replacement instance. See also the 'quote' option.

  quote = None

    Whether or not to quote the path portion of a url.
      quote = 1    -> quote the URLs (they're not quoted yet)
      quote = 0    -> do not quote them (they're already quoted)
      quote = None -> guess what to do

    This option only affects proper urls like 'file:///etc/passwd'; it
    does not affect 'raw' filenames like '/etc/passwd'. The latter
    will always be quoted as they are converted to URLs. Also, only
    the path part of a url is quoted. If you need more fine-grained
    control, you should probably subclass URLParser and pass it in via
    the 'urlparser' option.

BANDWIDTH THROTTLING

  urlgrabber supports throttling via two values: throttle and
  bandwidth. Between the two, you can either specify an absolute
  throttle threshold or specify a threshold as a fraction of maximum
  available bandwidth.

  throttle is a number - if it's an int, it's the bytes/second
  throttle limit. If it's a float, it is first multiplied by
  bandwidth. If throttle == 0, throttling is disabled. If None, the
  module-level default (which can be set with set_throttle) is used.

  bandwidth is the nominal max bandwidth in bytes/second. If throttle
  is a float and bandwidth == 0, throttling is disabled. If None, the
  module-level default (which can be set with set_bandwidth) is used.

  THROTTLING EXAMPLES:

  Let's say you have a 100 Mbps connection. This is (about) 10^8 bits
  per second, or 12,500,000 Bytes per second. You have a number of
  throttling options:

  *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float

     This will limit urlgrab to use half of your available bandwidth.

  *) set_throttle(6250000) # throttle is an int

     This will also limit urlgrab to use half of your available
     bandwidth, regardless of what bandwidth is set to.

  *) set_bandwidth(6250000); set_throttle(1.0) # float

     Use half your bandwidth

  *) set_bandwidth(6250000); set_throttle(2.0) # float

     Use up to 12,500,000 Bytes per second (your nominal max bandwidth)

  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0

     Disable throttling - this is more efficient than a very large
     throttle setting.

  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0

     Disable throttling - this is the default when the module is loaded.

  SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)

  While this is flexible, it's not extremely obvious to the user. I
  suggest you implement a float throttle as a percent to make the
  distinction between absolute and relative throttling very explicit.

  Also, you may want to convert the units to something more convenient
  than bytes/second, such as kbps or kB/s, etc.

"""

# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $

import os
import os.path
import sys
import urlparse
import rfc822
import time
import string
import urllib
import urllib2
from stat import * # S_* and ST_*

########################################################################
# MODULE INITIALIZATION
########################################################################
try:
    exec('from ' + (__name__.split('.'))[0] + ' import __version__')
except:
    __version__ = '???'

import sslfactory

auth_handler = urllib2.HTTPBasicAuthHandler( \
     urllib2.HTTPPasswordMgrWithDefaultRealm())

try:
    from i18n import _
except ImportError, msg:
    def _(st): return st

try:
    from httplib import HTTPException
except ImportError, msg:
    HTTPException = None

try:
    # This is a convenient way to make keepalive optional.
    # Just rename the module so it can't be imported.
    import keepalive
    from keepalive import HTTPHandler, HTTPSHandler
    have_keepalive = True
except ImportError, msg:
    have_keepalive = False

try:
    # add in range support conditionally too
    import byterange
    from byterange import HTTPRangeHandler, HTTPSRangeHandler, \
         FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \
         range_tuple_to_header, RangeError
except ImportError, msg:
    range_handlers = ()
    RangeError = None
    have_range = 0
else:
    range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(),
                      FileRangeHandler(), FTPRangeHandler())
    have_range = 1


# check whether socket timeout support is available (Python >= 2.3)
import socket
try:
    TimeoutError = socket.timeout
    have_socket_timeout = True
except AttributeError:
    TimeoutError = None
    have_socket_timeout = False
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
DEBUG = None
def set_logger(DBOBJ):
    """Set the DEBUG object. This is called by _init_default_logger when
    the environment variable URLGRABBER_DEBUG is set, but can also be
    called by a calling program. Basically, if the calling program uses
    the logging module and would like to incorporate urlgrabber logging,
    then it can do so this way. It's probably not necessary as most
    internal logging is only for debugging purposes.

    The passed-in object should be a logging.Logger instance. It will
    be pushed into the keepalive and byterange modules if they're
    being used. The mirror module pulls this object in on import, so
    you will need to manually push into it. In fact, you may find it
    tidier to simply push your logging object (or objects) into each
    of these modules independently.
    """

    global DEBUG
    DEBUG = DBOBJ
    if have_keepalive and keepalive.DEBUG is None:
        keepalive.DEBUG = DBOBJ
    if have_range and byterange.DEBUG is None:
        byterange.DEBUG = DBOBJ
    if sslfactory.DEBUG is None:
        sslfactory.DEBUG = DBOBJ

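# A sketch of how a calling program might hook in its own logger (the
# logger name 'mylogger' is illustrative only):
#
#   import logging
#   logging.basicConfig()
#   set_logger(logging.getLogger('mylogger'))
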
def _init_default_logger():
    '''Examines the environment variable URLGRABBER_DEBUG and creates
    a logging object (logging.logger) based on the contents. It takes
    the form

      URLGRABBER_DEBUG=level,filename

    where "level" can be either an integer or a log level from the
    logging module (DEBUG, INFO, etc). If the integer is zero or
    less, logging will be disabled. Filename is the filename where
    logs will be sent. If it is "-", then stdout will be used. If
    the filename is empty or missing, stderr will be used. If the
    variable cannot be processed or the logging module cannot be
    imported (python < 2.3) then logging will be disabled. Here are
    some examples:

      URLGRABBER_DEBUG=1,debug.txt  # log everything to debug.txt
      URLGRABBER_DEBUG=WARNING,-    # log warning and higher to stdout
      URLGRABBER_DEBUG=INFO         # log info and higher to stderr

    This function is called during module initialization. It is not
    intended to be called from outside. The only reason it is a
    function at all is to keep the module-level namespace tidy and to
    collect the code into a nice block.'''

    try:
        dbinfo = os.environ['URLGRABBER_DEBUG'].split(',')
        import logging
        level = logging._levelNames.get(dbinfo[0], int(dbinfo[0]))
        if level < 1: raise ValueError()

        formatter = logging.Formatter('%(asctime)s %(message)s')
        if len(dbinfo) > 1: filename = dbinfo[1]
        else: filename = ''
        if filename == '': handler = logging.StreamHandler(sys.stderr)
        elif filename == '-': handler = logging.StreamHandler(sys.stdout)
        else: handler = logging.FileHandler(filename)
        handler.setFormatter(formatter)
        DBOBJ = logging.getLogger('urlgrabber')
        DBOBJ.addHandler(handler)
        DBOBJ.setLevel(level)
    except (KeyError, ImportError, ValueError):
        DBOBJ = None
    set_logger(DBOBJ)

_init_default_logger()
########################################################################
# END MODULE INITIALIZATION
########################################################################



class URLGrabError(IOError):
    """
    URLGrabError error codes:

      URLGrabber error codes (0 -- 255)
        0    - everything looks good (you should never see this)
        1    - malformed url
        2    - local file doesn't exist
        3    - request for non-file local file (dir, etc)
        4    - IOError on fetch
        5    - OSError on fetch
        6    - no content length header when we expected one
        7    - HTTPException
        8    - Exceeded read limit (for urlread)
        9    - Requested byte range not satisfiable.
        10   - Byte range requested, but range support unavailable
        11   - Illegal reget mode
        12   - Socket timeout
        13   - malformed proxy url
        14   - HTTPError (includes .code and .exception attributes)
        15   - user abort

      MirrorGroup error codes (256 -- 511)
        256  - No more mirrors left to try

      Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
        [ this range reserved for application-specific error codes ]

      Retry codes (< 0)
        -1   - retry the download, unknown reason

    Note: to test which group a code is in, you can simply do integer
    division by 256: e.errno / 256

    Negative codes are reserved for use by functions passed in to
    retrygrab with checkfunc. The value -1 is built in as a generic
    retry code and is already included in the retrycodes list.
    Therefore, you can create a custom check function that simply
    returns -1 and the fetch will be re-tried. For more customized
    retries, you can use other negative numbers and include them in
    retrycodes. This is nice for outputting useful messages about
    what failed.

    You can use these error codes like so:
      try: urlgrab(url)
      except URLGrabError, e:
          if e.errno == 3: ...
          # or
          print e.strerror
          # or simply
          print e  #### print '[Errno %i] %s' % (e.errno, e.strerror)
    """
    pass

class CallbackObject:
    """Container for returned callback data.

    This is currently a dummy class into which urlgrabber can stuff
    information for passing to callbacks. This way, the prototype for
    all callbacks is the same, regardless of the data that will be
    passed back. Any function that accepts a callback function as an
    argument SHOULD document what it will define in this object.

    It is possible that this class will have some greater
    functionality in the future.
    """
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
def urlgrab(url, filename=None, **kwargs):
    """grab the file at <url> and make a local copy at <filename>
    If filename is none, the basename of the url is used.
    urlgrab returns the filename of the local file, which may be different
    from the passed-in filename if the copy_local kwarg == 0.

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlgrab(url, filename, **kwargs)

def urlopen(url, **kwargs):
    """open the url and return a file object
    If a progress object or throttle specifications exist, then
    a special file object will be returned that supports them.
    The file object can be treated like any other file object.

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlopen(url, **kwargs)

def urlread(url, limit=None, **kwargs):
    """read the url into a string, up to 'limit' bytes
    If the limit is exceeded, an exception will be thrown. Note that urlread
    is NOT intended to be used as a way of saying "I want the first N bytes"
    but rather 'read the whole file into memory, but don't use too much'

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlread(url, limit, **kwargs)

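# A sketch of the module-level convenience API (the url and paths are
# hypothetical):
#
#   from urlgrabber.grabber import urlgrab, urlopen, urlread
#   local = urlgrab('http://example.com/file.txt', '/tmp/file.txt')
#   fo = urlopen('http://example.com/file.txt')
#   data = fo.read()
#   fo.close()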

class URLParser:
    """Process the URLs before passing them to urllib2.

    This class does several things:

      * add any prefix
      * translate a "raw" file to a proper file: url
      * handle any http or https auth that's encoded within the url
      * quote the url

    Only the "parse" method is called directly, and it calls sub-methods.

    An instance of this class is held in the options object, which
    means that it's easy to change the behavior by sub-classing and
    passing the replacement in. It need only have a method like:

      url, parts = urlparser.parse(url, opts)
    """

    def parse(self, url, opts):
        """parse the url and return the (modified) url and its parts

        Note: a raw file WILL be quoted when it's converted to a URL.
        However, other urls (ones which come with a proper scheme) may
        or may not be quoted according to opts.quote

          opts.quote = 1    --> quote it
          opts.quote = 0    --> do not quote it
          opts.quote = None --> guess
        """
        quote = opts.quote

        if opts.prefix:
            url = self.add_prefix(url, opts.prefix)

        parts = urlparse.urlparse(url)
        (scheme, host, path, parm, query, frag) = parts

        if not scheme or (len(scheme) == 1 and scheme in string.letters):
            # if a scheme isn't specified, we guess that it's "file:"
            if url[0] not in '/\\': url = os.path.abspath(url)
            url = 'file:' + urllib.pathname2url(url)
            parts = urlparse.urlparse(url)
            quote = 0 # pathname2url quotes, so we won't do it again

        if scheme in ['http', 'https']:
            parts = self.process_http(parts)

        if quote is None:
            quote = self.guess_should_quote(parts)
        if quote:
            parts = self.quote(parts)

        url = urlparse.urlunparse(parts)
        return url, parts

    def add_prefix(self, url, prefix):
        if prefix[-1] == '/' or url[0] == '/':
            url = prefix + url
        else:
            url = prefix + '/' + url
        return url

    def process_http(self, parts):
        (scheme, host, path, parm, query, frag) = parts

        if '@' in host and auth_handler:
            try:
                user_pass, host = host.split('@', 1)
                if ':' in user_pass:
                    user, password = user_pass.split(':', 1)
                else:
                    user, password = user_pass, ''
            except ValueError, e:
                raise URLGrabError(1, _('Bad URL: %s') % \
                                   (urlparse.urlunparse(parts), ))
            if DEBUG: DEBUG.info('adding HTTP auth: %s, XXXXXXXX', user)
            auth_handler.add_password(None, host, user, password)

        return (scheme, host, path, parm, query, frag)

    def quote(self, parts):
        """quote the URL

        This method quotes ONLY the path part. If you need to quote
        other parts, you should override this and pass in your derived
        class. The other alternative is to quote other parts before
        passing into urlgrabber.
        """
        (scheme, host, path, parm, query, frag) = parts
        path = urllib.quote(path)
        return (scheme, host, path, parm, query, frag)

    hexvals = '0123456789ABCDEF'
    def guess_should_quote(self, parts):
        """
        Guess whether we should quote a path. This amounts to
        guessing whether it's already quoted.

          find ' '   -> 1
          find '%'   -> 1
          find '%XX' -> 0
          else       -> 1
        """
        (scheme, host, path, parm, query, frag) = parts
        if ' ' in path:
            return 1
        ind = string.find(path, '%')
        if ind > -1:
            while ind > -1:
                if len(path) < ind+3:
                    return 1
                code = path[ind+1:ind+3].upper()
                if code[0] not in self.hexvals or \
                   code[1] not in self.hexvals:
                    return 1
                ind = string.find(path, '%', ind+1)
            return 0
        return 1

class URLGrabberOptions:
    """Class to ease kwargs handling."""

    def __init__(self, delegate=None, **kwargs):
        """Initialize URLGrabberOptions object.
        Set default values for all options and then update options specified
        in kwargs.
        """
        self.delegate = delegate
        if delegate is None:
            self._set_defaults()
        self._set_attributes(**kwargs)

    def __getattr__(self, name):
        if self.delegate and hasattr(self.delegate, name):
            return getattr(self.delegate, name)
        raise AttributeError, name

    def raw_throttle(self):
        """Calculate raw throttle value from throttle and bandwidth
        values.
        """
        if self.throttle <= 0:
            return 0
        elif type(self.throttle) == type(0):
            return float(self.throttle)
        else: # throttle is a float
            return self.bandwidth * self.throttle

    def derive(self, **kwargs):
        """Create a derived URLGrabberOptions instance.
        This method creates a new instance and overrides the
        options specified in kwargs.
        """
        return URLGrabberOptions(delegate=self, **kwargs)
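
    # A sketch of how option delegation layers (values illustrative):
    #   base = URLGrabberOptions(retry=3)
    #   per_call = base.derive(timeout=30.0)
    #   per_call.timeout  -> 30.0 (set on the derived instance)
    #   per_call.retry    -> 3    (delegated to base)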

    def _set_attributes(self, **kwargs):
        """Update object attributes with those provided in kwargs."""
        self.__dict__.update(kwargs)
        if have_range and kwargs.has_key('range'):
            # normalize the supplied range value
            self.range = range_tuple_normalize(self.range)
        if not self.reget in [None, 'simple', 'check_timestamp']:
            raise URLGrabError(11, _('Illegal reget mode: %s') \
                               % (self.reget, ))

    def _set_defaults(self):
        """Set all options to their default values.
        When adding new options, make sure a default is
        provided here.
        """
        self.progress_obj = None
        self.throttle = 1.0
        self.bandwidth = 0
        self.retry = None
        self.retrycodes = [-1,2,4,5,6,7]
        self.checkfunc = None
        self.copy_local = 0
        self.close_connection = 0
        self.range = None
        self.user_agent = 'urlgrabber/%s' % __version__
        self.keepalive = 1
        self.proxies = None
        self.reget = None
        self.failure_callback = None
        self.interrupt_callback = None
        self.prefix = None
        self.opener = None
        self.cache_openers = True
        self.timeout = None
        self.text = None
        self.http_headers = None
        self.ftp_headers = None
        self.data = None
        self.urlparser = URLParser()
        self.quote = None
        self.ssl_ca_cert = None
        self.ssl_context = None

class URLGrabber:
    """Provides easy opening of URLs with a variety of options.

    All options are specified as kwargs. Options may be specified when
    the class is created and may be overridden on a per request basis.

    New objects inherit default values from default_grabber.
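
    For example (a sketch; the url and proxy values are illustrative):

      g = URLGrabber(retry=3, proxies={ 'http' : 'http://proxy:3128' })
      g.urlgrab('http://example.com/file.txt', timeout=30.0)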
821 """
822
823 def __init__(self, **kwargs):
824 self.opts = URLGrabberOptions(**kwargs)
825
826 def _retry(self, opts, func, *args):
827 tries = 0
828 while 1:
829 # there are only two ways out of this loop. The second has
830 # several "sub-ways"
831 # 1) via the return in the "try" block
832 # 2) by some exception being raised
833 # a) an excepton is raised that we don't "except"
834 # b) a callback raises ANY exception
835 # c) we're not retry-ing or have run out of retries
836 # d) the URLGrabError code is not in retrycodes
837 # beware of infinite loops :)
838 tries = tries + 1
839 exception = None
840 retrycode = None
841 callback = None
842 if DEBUG: DEBUG.info('attempt %i/%s: %s',
843 tries, opts.retry, args[0])
844 try:
845 r = apply(func, (opts,) + args, {})
846 if DEBUG: DEBUG.info('success')
847 return r
848 except URLGrabError, e:
849 exception = e
850 callback = opts.failure_callback
851 retrycode = e.errno
852 except KeyboardInterrupt, e:
853 exception = e
854 callback = opts.interrupt_callback
855
856 if DEBUG: DEBUG.info('exception: %s', exception)
857 if callback:
858 if DEBUG: DEBUG.info('calling callback: %s', callback)
859 cb_func, cb_args, cb_kwargs = self._make_callback(callback)
860 obj = CallbackObject(exception=exception, url=args[0],
861 tries=tries, retry=opts.retry)
862 cb_func(obj, *cb_args, **cb_kwargs)
863
864 if (opts.retry is None) or (tries == opts.retry):
865 if DEBUG: DEBUG.info('retries exceeded, re-raising')
866 raise
867
868 if (retrycode is not None) and (retrycode not in opts.retrycodes):
869 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
870 retrycode, opts.retrycodes)
871 raise
872
873 def urlopen(self, url, **kwargs):
874 """open the url and return a file object
875 If a progress object or throttle value specified when this
876 object was created, then a special file object will be
877 returned that supports them. The file object can be treated
878 like any other file object.
879 """
880 opts = self.opts.derive(**kwargs)
881 (url,parts) = opts.urlparser.parse(url, opts)
882 def retryfunc(opts, url):
883 return URLGrabberFileObject(url, filename=None, opts=opts)
884 return self._retry(opts, retryfunc, url)

    def urlgrab(self, url, filename=None, **kwargs):
        """grab the file at <url> and make a local copy at <filename>
        If filename is none, the basename of the url is used.
        urlgrab returns the filename of the local file, which may be
        different from the passed-in filename if copy_local == 0.
        """
        opts = self.opts.derive(**kwargs)
        (url,parts) = opts.urlparser.parse(url, opts)
        (scheme, host, path, parm, query, frag) = parts
        if filename is None:
            filename = os.path.basename( urllib.unquote(path) )
        if scheme == 'file' and not opts.copy_local:
            # just return the name of the local file - don't make a
            # copy currently
            path = urllib.url2pathname(path)
            if host:
                path = os.path.normpath('//' + host + path)
            if not os.path.exists(path):
                raise URLGrabError(2,
                      _('Local file does not exist: %s') % (path, ))
            elif not os.path.isfile(path):
                raise URLGrabError(3,
                      _('Not a normal file: %s') % (path, ))
            elif not opts.range:
                return path

        def retryfunc(opts, url, filename):
            fo = URLGrabberFileObject(url, filename, opts)
            try:
                fo._do_grab()
                if not opts.checkfunc is None:
                    cb_func, cb_args, cb_kwargs = \
                             self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.filename = filename
                    obj.url = url
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
            finally:
                fo.close()
            return filename

        return self._retry(opts, retryfunc, url, filename)

    def urlread(self, url, limit=None, **kwargs):
        """read the url into a string, up to 'limit' bytes
        If the limit is exceeded, an exception will be thrown. Note
        that urlread is NOT intended to be used as a way of saying
        "I want the first N bytes" but rather 'read the whole file
        into memory, but don't use too much'
        """
        opts = self.opts.derive(**kwargs)
        (url,parts) = opts.urlparser.parse(url, opts)
        if limit is not None:
            limit = limit + 1

        def retryfunc(opts, url, limit):
            fo = URLGrabberFileObject(url, filename=None, opts=opts)
            s = ''
            try:
                # this is an unfortunate thing. Some file-like objects
                # have a default "limit" of None, while the built-in (real)
                # file objects have -1. They each break the other, so for
                # now, we just force the default if necessary.
                if limit is None: s = fo.read()
                else: s = fo.read(limit)

                if not opts.checkfunc is None:
                    cb_func, cb_args, cb_kwargs = \
                             self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.data = s
                    obj.url = url
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
            finally:
                fo.close()
            return s

        s = self._retry(opts, retryfunc, url, limit)
        # limit was incremented by one above so that reading limit+1
        # bytes can be detected as "exceeded the limit"
        if limit and len(s) > limit - 1:
            raise URLGrabError(8,
                  _('Exceeded limit (%i): %s') % (limit - 1, url))
        return s

    def _make_callback(self, callback_obj):
        if callable(callback_obj):
            return callback_obj, (), {}
        else:
            return callback_obj

# create the default URLGrabber used by urlXXX functions.
# NOTE: actual defaults are set in URLGrabberOptions
default_grabber = URLGrabber()

class URLGrabberFileObject:
    """This is a file-object wrapper that supports progress objects
    and throttling.

    This exists to solve the following problem: let's say you want to
    drop-in replace a normal open with urlopen. You want to use a
    progress meter and/or throttling, but how do you do that without
    rewriting your code? Answer: urlopen will return a wrapped file
    object that does the progress meter and/or throttling internally.
989
990 def __init__(self, url, filename, opts):
991 self.url = url
992 self.filename = filename
993 self.opts = opts
994 self.fo = None
995 self._rbuf = ''
996 self._rbufsize = 1024*8
997 self._ttime = time.time()
998 self._tsize = 0
999 self._amount_read = 0
1000 self._opener = None
1001 self._do_open()
1002
1003 def __getattr__(self, name):
1004 """This effectively allows us to wrap at the instance level.
1005 Any attribute not found in _this_ object will be searched for
1006 in self.fo. This includes methods."""
1007 if hasattr(self.fo, name):
1008 return getattr(self.fo, name)
1009 raise AttributeError, name
1010
1011 def _get_opener(self):
1012 """Build a urllib2 OpenerDirector based on request options."""
1013 if self.opts.opener:
1014 return self.opts.opener
1015 elif self._opener is None:
1016 handlers = []
1017 need_keepalive_handler = (have_keepalive and self.opts.keepalive)
1018 need_range_handler = (range_handlers and \
1019 (self.opts.range or self.opts.reget))
1020 # if you specify a ProxyHandler when creating the opener
1021 # it _must_ come before all other handlers in the list or urllib2
1022 # chokes.
1023 if self.opts.proxies:
1024 handlers.append( CachedProxyHandler(self.opts.proxies) )
1025
1026 # -------------------------------------------------------
1027 # OK, these next few lines are a serious kludge to get
1028 # around what I think is a bug in python 2.2's
1029 # urllib2. The basic idea is that default handlers
1030 # get applied first. If you override one (like a
1031 # proxy handler), then the default gets pulled, but
1032 # the replacement goes on the end. In the case of
1033 # proxies, this means the normal handler picks it up
1034 # first and the proxy isn't used. Now, this probably
1035 # only happened with ftp or non-keepalive http, so not
1036 # many folks saw it. The simple approach to fixing it
1037 # is just to make sure you override the other
1038 # conflicting defaults as well. I would LOVE to see
1039 # these go way or be dealt with more elegantly. The
1040 # problem isn't there after 2.2. -MDS 2005/02/24
1041 if not need_keepalive_handler:
1042 handlers.append( urllib2.HTTPHandler() )
1043 if not need_range_handler:
1044 handlers.append( urllib2.FTPHandler() )
1045 # -------------------------------------------------------
1046
1047 ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert,
1048 self.opts.ssl_context)
1049
1050 if need_keepalive_handler:
1051 handlers.append(HTTPHandler())
1052 handlers.append(HTTPSHandler(ssl_factory))
1053 if need_range_handler:
1054 handlers.extend( range_handlers )
1055 handlers.append( auth_handler )
1056 if self.opts.cache_openers:
1057 self._opener = CachedOpenerDirector(ssl_factory, *handlers)
1058 else:
1059 self._opener = ssl_factory.create_opener(*handlers)
1060 # OK, I don't like to do this, but otherwise, we end up with
1061 # TWO user-agent headers.
1062 self._opener.addheaders = []
1063 return self._opener

    def _do_open(self):
        opener = self._get_opener()

        req = urllib2.Request(self.url, self.opts.data) # build request object
        self._add_headers(req) # add misc headers that we need
        self._build_range(req) # take care of reget and byterange stuff

        fo, hdr = self._make_request(req, opener)
        if self.reget_time and self.opts.reget == 'check_timestamp':
            # do this if we have a local file with known timestamp AND
            # we're in check_timestamp reget mode.
            fetch_again = 0
            try:
                modified_tuple = hdr.getdate_tz('last-modified')
                modified_stamp = rfc822.mktime_tz(modified_tuple)
                if modified_stamp > self.reget_time: fetch_again = 1
            except (TypeError,):
                fetch_again = 1

            if fetch_again:
                # the server version is newer than the (incomplete) local
                # version, so we should abandon the version we're getting
                # and fetch the whole thing again.
                fo.close()
                self.opts.reget = None
                del req.headers['Range']
                self._build_range(req)
                fo, hdr = self._make_request(req, opener)

        (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
        path = urllib.unquote(path)
        if not (self.opts.progress_obj or self.opts.raw_throttle() \
                or self.opts.timeout):
            # if we're not using the progress_obj, throttling, or timeout
            # we can get a performance boost by going directly to
            # the underlying fileobject for reads.
            self.read = fo.read
            if hasattr(fo, 'readline'):
                self.readline = fo.readline
        elif self.opts.progress_obj:
            try:
                length = int(hdr['Content-Length'])
                length = length + self._amount_read # Account for regets
            except (KeyError, ValueError, TypeError):
                length = None

            self.opts.progress_obj.start(str(self.filename),
                                         urllib.unquote(self.url),
                                         os.path.basename(path),
                                         length, text=self.opts.text)
            self.opts.progress_obj.update(0)
        (self.fo, self.hdr) = (fo, hdr)

    def _add_headers(self, req):
        if self.opts.user_agent:
            req.add_header('User-agent', self.opts.user_agent)
        try: req_type = req.get_type()
        except ValueError: req_type = None
        if self.opts.http_headers and req_type in ('http', 'https'):
            for h, v in self.opts.http_headers:
                req.add_header(h, v)
        if self.opts.ftp_headers and req_type == 'ftp':
            for h, v in self.opts.ftp_headers:
                req.add_header(h, v)

    def _build_range(self, req):
        self.reget_time = None
        self.append = 0
        reget_length = 0
        rt = None
        if have_range and self.opts.reget and type(self.filename) == type(''):
            # we have reget turned on and we're dumping to a file
            try:
                s = os.stat(self.filename)
            except OSError:
                pass
            else:
                self.reget_time = s[ST_MTIME]
                reget_length = s[ST_SIZE]

                # Set initial length when regetting
                self._amount_read = reget_length

                rt = reget_length, ''
                self.append = 1

        if self.opts.range:
            if not have_range:
                raise URLGrabError(10, _('Byte range requested but range '\
                                         'support unavailable'))
            rt = self.opts.range
            if rt[0]: rt = (rt[0] + reget_length, rt[1])

        if rt:
            header = range_tuple_to_header(rt)
            if header: req.add_header('Range', header)

    def _make_request(self, req, opener):
        try:
            if have_socket_timeout and self.opts.timeout:
                old_to = socket.getdefaulttimeout()
                socket.setdefaulttimeout(self.opts.timeout)
                try:
                    fo = opener.open(req)
                finally:
                    socket.setdefaulttimeout(old_to)
            else:
                fo = opener.open(req)
            hdr = fo.info()
        except ValueError, e:
            raise URLGrabError(1, _('Bad URL: %s') % (e, ))
        except RangeError, e:
            raise URLGrabError(9, str(e))
        except urllib2.HTTPError, e:
            new_e = URLGrabError(14, str(e))
            new_e.code = e.code
            new_e.exception = e
            raise new_e
        except IOError, e:
            if hasattr(e, 'reason') and have_socket_timeout and \
               isinstance(e.reason, TimeoutError):
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            else:
                raise URLGrabError(4, _('IOError: %s') % (e, ))
        except OSError, e:
            raise URLGrabError(5, _('OSError: %s') % (e, ))
        except HTTPException, e:
            raise URLGrabError(7, _('HTTP Exception (%s): %s') % \
                               (e.__class__.__name__, e))
        else:
            return (fo, hdr)

    def _do_grab(self):
        """dump the file to self.filename."""
        if self.append: new_fo = open(self.filename, 'ab')
        else: new_fo = open(self.filename, 'wb')
        bs = 1024*8
        size = 0

        block = self.read(bs)
        size = size + len(block)
        while block:
            new_fo.write(block)
            block = self.read(bs)
            size = size + len(block)

        new_fo.close()
        try:
            modified_tuple = self.hdr.getdate_tz('last-modified')
            modified_stamp = rfc822.mktime_tz(modified_tuple)
            os.utime(self.filename, (modified_stamp, modified_stamp))
        except (TypeError,), e: pass

        return size

    def _fill_buffer(self, amt=None):
        """fill the buffer to contain at least 'amt' bytes by reading
        from the underlying file object. If amt is None, then it will
        read until it gets nothing more. It updates the progress meter
        and throttles after every self._rbufsize bytes."""
        # the _rbuf test is only in this first 'if' for speed. It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt = amt - L
            else:
                return

        # if we've made it here, then we don't have enough in the buffer
        # and we need to read more.

        buf = [self._rbuf]
        bufsize = len(self._rbuf)
        while amt is None or amt:
            # first, delay if necessary for throttling reasons
            if self.opts.raw_throttle():
                diff = self._tsize/self.opts.raw_throttle() - \
                       (time.time() - self._ttime)
                if diff > 0: time.sleep(diff)
                self._ttime = time.time()

            # now read some data, up to self._rbufsize
            if amt is None: readamount = self._rbufsize
            else: readamount = min(amt, self._rbufsize)
            try:
                new = self.fo.read(readamount)
            except socket.error, e:
                raise URLGrabError(4, _('Socket Error: %s') % (e, ))
            except TimeoutError, e:
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            except IOError, e:
                raise URLGrabError(4, _('IOError: %s') % (e, ))
            newsize = len(new)
            if not newsize: break # no more to read

            if amt: amt = amt - newsize
            buf.append(new)
            bufsize = bufsize + newsize
            self._tsize = newsize
            self._amount_read = self._amount_read + newsize
            if self.opts.progress_obj:
                self.opts.progress_obj.update(self._amount_read)

        self._rbuf = string.join(buf, '')
        return

    def read(self, amt=None):
        self._fill_buffer(amt)
        if amt is None:
            s, self._rbuf = self._rbuf, ''
        else:
            s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
        return s

    def readline(self, limit=-1):
        i = string.find(self._rbuf, '\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            L = len(self._rbuf)
            self._fill_buffer(L + self._rbufsize)
            if not len(self._rbuf) > L: break
            i = string.find(self._rbuf, '\n', L)

        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit

        s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return s

    def close(self):
        if self.opts.progress_obj:
            self.opts.progress_obj.end(self._amount_read)
        self.fo.close()
        if self.opts.close_connection:
            try: self.fo.close_connection()
            except: pass

_handler_cache = []
def CachedOpenerDirector(ssl_factory = None, *handlers):
    for (cached_handlers, opener) in _handler_cache:
        if cached_handlers == handlers:
            for handler in opener.handlers:
                handler.add_parent(opener)
            return opener
    if not ssl_factory:
        ssl_factory = sslfactory.get_factory()
    opener = ssl_factory.create_opener(*handlers)
    _handler_cache.append( (handlers, opener) )
    return opener

_proxy_cache = []
def CachedProxyHandler(proxies):
    for (pdict, handler) in _proxy_cache:
        if pdict == proxies:
            if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies)
            break
    else:
        for k, v in proxies.items():
            utype, url = urllib.splittype(v)
            host, other = urllib.splithost(url)
            if (utype is None) or (host is None):
                raise URLGrabError(13, _('Bad proxy URL: %s') % v)

        if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies)
        handler = urllib2.ProxyHandler(proxies)
        _proxy_cache.append( (proxies, handler) )
    return handler

#####################################################################
# DEPRECATED FUNCTIONS
def set_throttle(new_throttle):
    """Deprecated. Use: default_grabber.throttle = new_throttle"""
    default_grabber.throttle = new_throttle

def set_bandwidth(new_bandwidth):
    """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
    default_grabber.bandwidth = new_bandwidth

def set_progress_obj(new_progress_obj):
    """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
    default_grabber.progress_obj = new_progress_obj

def set_user_agent(new_user_agent):
    """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
    default_grabber.user_agent = new_user_agent

def retrygrab(url, filename=None, copy_local=0, close_connection=0,
              progress_obj=None, throttle=None, bandwidth=None,
              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
    """Deprecated. Use: urlgrab() with the retry arg instead"""
    kwargs = {'copy_local' : copy_local,
              'close_connection' : close_connection,
              'progress_obj' : progress_obj,
              'throttle' : throttle,
              'bandwidth' : bandwidth,
              'retry' : numtries,
              'retrycodes' : retrycodes,
              'checkfunc' : checkfunc
              }
    return urlgrab(url, filename, **kwargs)


#####################################################################
# TESTING
def _main_test():
    import sys
    try: url, filename = sys.argv[1:3]
    except ValueError:
        print 'usage:', sys.argv[0], \
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
        sys.exit()

    kwargs = {}
    for a in sys.argv[3:]:
        k, v = string.split(a, '=', 1)
        kwargs[k] = int(v)

    set_throttle(1.0)
    set_bandwidth(32 * 1024)
    print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
                                                        default_grabber.bandwidth)

    try: from progress import text_progress_meter
    except ImportError, e: pass
    else: kwargs['progress_obj'] = text_progress_meter()

    try: name = apply(urlgrab, (url, filename), kwargs)
    except URLGrabError, e: print e
    else: print 'LOCAL FILE:', name


def _retry_test():
    import sys
    try: url, filename = sys.argv[1:3]
    except ValueError:
        print 'usage:', sys.argv[0], \
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
        sys.exit()

    kwargs = {}
    for a in sys.argv[3:]:
        k, v = string.split(a, '=', 1)
        kwargs[k] = int(v)

    try: from progress import text_progress_meter
    except ImportError, e: pass
    else: kwargs['progress_obj'] = text_progress_meter()

    def cfunc(filename, hello, there='foo'):
        print hello, there
        import random
        rnum = random.random()
        if rnum < .5:
            print 'forcing retry'
            raise URLGrabError(-1, 'forcing retry')
        if rnum < .75:
            print 'forcing failure'
            raise URLGrabError(-2, 'forcing immediate failure')
        print 'success'
        return

    kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
    try: name = apply(retrygrab, (url, filename), kwargs)
    except URLGrabError, e: print e
    else: print 'LOCAL FILE:', name

def _file_object_test(filename=None):
    import random, cStringIO, sys
    if filename is None:
        filename = __file__
    print 'using file "%s" for comparisons' % filename
    fo = open(filename)
    s_input = fo.read()
    fo.close()

    for testfunc in [_test_file_object_smallread,
                     _test_file_object_readall,
                     _test_file_object_readline,
                     _test_file_object_readlines]:
        fo_input = cStringIO.StringIO(s_input)
        fo_output = cStringIO.StringIO()
        wrapper = URLGrabberFileObject(fo_input, None, 0)
        print 'testing %-30s ' % testfunc.__name__,
        testfunc(wrapper, fo_output)
        s_output = fo_output.getvalue()
        if s_output == s_input: print 'passed'
        else: print 'FAILED'

def _test_file_object_smallread(wrapper, fo_output):
    while 1:
        s = wrapper.read(23)
        fo_output.write(s)
        if not s: return

def _test_file_object_readall(wrapper, fo_output):
    s = wrapper.read()
    fo_output.write(s)

def _test_file_object_readline(wrapper, fo_output):
    while 1:
        s = wrapper.readline()
        fo_output.write(s)
        if not s: return

def _test_file_object_readlines(wrapper, fo_output):
    li = wrapper.readlines()
    fo_output.write(string.join(li, ''))

if __name__ == '__main__':
    _main_test()
    _retry_test()
    _file_object_test('test')