Django

Code

root/django/branches/newforms-admin/django/http/multipartparser.py

Revision 7922, 22.6 kB (checked in by brosner, 5 months ago)

newforms-admin: Merged from trunk up to [7917].

Line 
1 """
2 Multi-part parsing for file uploads.
3
4 Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
5 file upload handlers for processing.
6 """
7 import cgi
8 from django.conf import settings
9 from django.core.exceptions import SuspiciousOperation
10 from django.utils.datastructures import MultiValueDict
11 from django.utils.encoding import force_unicode
12 from django.utils.text import unescape_entities
13 from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers
14
# Names exported by ``from django.http.multipartparser import *``.
__all__ = ('MultiPartParser','MultiPartParserError','InputStreamExhausted')
16
class MultiPartParserError(Exception):
    """
    Raised when a multipart request cannot be parsed.
    """
19
class InputStreamExhausted(Exception):
    """
    Raised when a read is attempted on an input stream that has nothing
    left to give.
    """
25
# Item-type tags. MultiPartParser.parse() receives one of these as the first
# element of every parsed part and dispatches on FIELD and FILE;
# parse_boundary_stream() returns RAW for a part without a header block.
RAW = "raw"
FILE = "file"
FIELD = "field"
29
class MultiPartParser(object):
    """
    A rfc2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(POST, FILES)``: a ``QueryDict`` holding the plain
    form fields and a ``MultiValueDict`` holding whatever objects the upload
    handlers returned for the uploaded files. The parser does not store file
    contents itself; every chunk is handed to the upload handlers.
    """
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object; it is consumed through
            its ``read()`` method.
        :upload_handlers:
            A sequence of upload handler instances that perform operations
            on the uploaded data. Handlers are consulted in order.
        :encoding:
            The encoding with which to treat the incoming data. Defaults to
            ``settings.DEFAULT_CHARSET`` when not given.

        Raises ``MultiPartParserError`` if the Content-Type is not multipart,
        the boundary is missing or invalid, or the Content-Length is missing,
        unparseable or not positive.
        """

        #
        # Content-Type should contain multipart and the boundary information.
        #

        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)


        #
        # Content-Length should contain the length of the body we are about
        # to receive.
        #
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH',0)))
        except (ValueError, TypeError):
            # Treat an unparseable length as 0 so that it is rejected below.
            content_length = 0

        if content_length <= 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary
        self._input_data = input_data
        self._chunk_size = self._negotiate_chunk_size(upload_handlers)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def _negotiate_chunk_size(self, upload_handlers):
        """
        Return the read size to use: the smallest truthy ``chunk_size``
        declared by any handler, capped at ``2**31 - 4``.
        """
        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        # min() gets one list rather than unpacked arguments: when no handler
        # declares a size, min(cap) with a lone int would raise TypeError.
        return min([2**31-4] + possible_sizes)

    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Every handler is first offered the raw input through
        ``handle_raw_input()``; the first non-None result is returned as-is
        (its first two items). Otherwise the body is split into parts: plain
        fields are appended to POST, file parts are streamed chunk by chunk
        through the handlers.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict
        # Exception instances are fetched with sys.exc_info() so this method
        # does not depend on any particular ``except ... e`` binding syntax.
        import sys

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    # Parts without a usable content-disposition are skipped.
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field, we can just set it in the post
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except Exception:
                            # Best effort: keep the undecoded value.
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0,{}))[1].get('charset', None)
                    except Exception:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    # Per-handler count of bytes passed on for this file.
                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    decode_error = sys.exc_info()[1]
                                    raise MultiPartParserError("Could not decode base64 data: %r" % decode_error)

                            for i, handler in enumerate(handlers):
                                # Measured before the handler runs, because
                                # the handler may return a different chunk.
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD or a FILE, just exhaust the stream.
                    # NOTE(review): this drains the whole request stream, not
                    # just ``field_stream`` -- confirm that is intended.
                    exhaust(stream)
        except StopUpload:
            stop_request = sys.exc_info()[1]
            if not stop_request.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.

        Each handler's ``file_complete()`` is called in order with its own
        byte counter. The first truthy return value is appended to the FILES
        dictionary under ``old_field_name`` and later handlers are skipped.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break

    def IE_sanitize(self, filename):
        """
        Cleanup filename from Internet Explorer full paths.

        Returns what follows the last backslash, stripped of surrounding
        whitespace. A falsy ``filename`` is returned unchanged.
        """
        return filename and filename[filename.rfind("\\")+1:].strip()
253
class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an object whose ``next()`` method returns a string
        each time it is called. ``length``, when given, is the default
        number of bytes returned by ``read()`` when called without a size.
        """
        self._producer = producer
        # Set by close(); once true only pushed-back bytes can still be read.
        self._empty = False
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        """Return the current position: bytes handed out minus bytes ungot."""
        return self.position

    def read(self, size=None):
        """
        Return up to ``size`` bytes as one string.

        When ``size`` is None the ``length`` given at construction is used;
        if that is None as well, everything that is left is returned. Fewer
        bytes than requested are returned when the stream runs dry. Surplus
        bytes obtained from the producer are pushed back with ``unget()``.
        """
        if size is not None:
            remaining = size
        else:
            remaining = self._remaining

        # do the whole thing in one shot if no limit was provided.
        if remaining is None:
            return ''.join(self)

        # otherwise do some bookkeeping to return exactly enough
        # of the stream and stashing any extra content we get from
        # the producer
        pieces = []
        while remaining != 0:
            assert remaining > 0, 'remaining bytes to read should never go negative'

            try:
                chunk = self.next()
            except StopIteration:
                # Out of data: hand back what was collected so far.
                break

            emitting = chunk[:remaining]
            self.unget(chunk[remaining:])
            remaining -= len(emitting)
            pieces.append(emitting)

        return ''.join(pieces)

    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk is conveniently available:
        pushed-back bytes first, otherwise the producer's next chunk. Useful
        to avoid unnecessary bookkeeping if performance is an issue.

        Raises StopIteration when nothing is left, including after close().
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        elif self._empty:
            # close() was called; the producer must not be consulted again.
            raise StopIteration()
        else:
            output = self._producer.next()
            # Fresh data arrived, so earlier ungets are no sign of a loop.
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Drops the producer. Any leftover bytes that have already been read
        and pushed back will still be reported upon read() and/or next();
        after that the stream behaves as exhausted.
        """
        self._producer = []
        # A list has no next(), so flag the stream instead of relying on the
        # replacement producer to signal exhaustion.
        self._empty = True

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound. An empty value
        is ignored.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])

    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. The sizes of the last 50
        ungets are kept; if more than 40 of them equal the current size, we're
        most likely in an infinite loop of some sort. This is usually caused
        by a maliciously-malformed MIME request.

        Raises SuspiciousOperation when that threshold is exceeded.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )
364
class ChunkIter(object):
    """
    Iterator adapter over a file-like object: each step performs one
    ``read(chunk_size)`` on the wrapped object and hands back the result.
    Iteration ends on the first empty read or when the wrapped object
    raises InputStreamExhausted.
    """
    def __init__(self, flo, chunk_size=64 * 1024):
        self.chunk_size = chunk_size
        self.flo = flo

    def __iter__(self):
        return self

    def next(self):
        try:
            piece = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            # Handled exactly like an empty read below.
            piece = None
        if not piece:
            raise StopIteration()
        return piece
387
class LimitBytes(object):
    """
    Limit bytes for a file object: never request more than ``length`` bytes
    in total from the wrapped object.
    """
    def __init__(self, fileobject, length):
        self._file = fileobject
        # Bytes that may still be requested from the wrapped file.
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.

        A request larger than the remaining allowance is clamped to it.
        ``None`` or a negative ``num_bytes`` means "everything that is left".
        The allowance is reduced by the amount requested, whatever the
        underlying ``read()`` actually returns.

        Raises InputStreamExhausted if the allowance is already used up.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        # A negative size must not reach min(): it would be passed through to
        # the underlying read() as "read everything" and bypass the limit.
        if num_bytes is None or num_bytes < 0:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
408
class InterBoundaryIter(object):
    """
    A Producer that yields one LazyStream per boundary-delimited section of
    the underlying stream.
    """
    def __init__(self, stream, boundary):
        self._boundary = boundary
        self._stream = stream

    def __iter__(self):
        return self

    def next(self):
        # BoundaryIter signals an empty underlying stream from its
        # constructor; that ends this iteration.
        try:
            section = BoundaryIter(self._stream, self._boundary)
        except InputStreamExhausted:
            raise StopIteration()
        return LazyStream(section)
425
class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Yields the bytes of the underlying stream up to the next boundary. The
    bytes before the boundary are returned, the boundary itself is dropped,
    and whatever follows it is pushed back onto the stream.

    Once the boundary has been located, further calls to .next() raise
    StopIteration.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Peek one byte to find out whether the stream has anything left;
        # the byte is pushed straight back.
        probe = self._stream.read(1)
        if not probe:
            raise InputStreamExhausted()
        self._stream.unget(probe)

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)

    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        source = self._stream
        keep_back = self._rollback

        # Buffer pieces until more than ``keep_back`` bytes are held, an empty
        # piece shows up, or the stream runs out (which marks us as done).
        buffered_len = 0
        pieces = []
        for piece in source:
            buffered_len += len(piece)
            pieces.append(piece)
            if buffered_len > keep_back or not piece:
                break
        else:
            self._done = True

        if not pieces:
            raise StopIteration()

        buffered = ''.join(pieces)
        found = self._find_boundary(buffered, len(buffered) < self._rollback)

        if found:
            end, resume_at = found
            source.unget(buffered[resume_at:])
            self._done = True
            return buffered[:end]

        # make sure we dont treat a partial boundary (and
        # its separators) as data
        head = buffered[:-keep_back]
        if not head:
            # There's nothing left, we should just return and mark as done.
            self._done = True
            return buffered
        source.unget(buffered[-keep_back:])
        return head

    def _find_boundary(self, data, eof = False):
        """
        Finds a multipart boundary in data.

        Returns None when the data holds no boundary. Otherwise returns a
        tuple of two indices:

         * the end of current encapsulation
         * the start of the next encapsulation

        ``eof`` is accepted but not consulted.
        """
        index = self._fs(data)
        if index < 0:
            return None

        end = index
        resume_at = index + len(self._boundary)
        # backup over CRLF: first a trailing LF, then a trailing CR
        for terminator in ('\n', '\r'):
            if data[max(0, end - 1)] == terminator:
                end -= 1
        return end, resume_at
534
def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    An iterable is consumed directly. Anything else is expected to be a
    file-like object and is read to the end in 16384-byte chunks.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        # Not iterable: it has to offer read() to be usable as a stream.
        # Checking here makes the documented error actually reachable.
        if not hasattr(stream_or_iterable, 'read'):
            raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')
        iterator = ChunkIter(stream_or_iterable, 16384)

    for __ in iterator:
        pass
552
553 def parse_boundary_stream(stream, max_header_size):
554     """
555     Parses one and exactly one stream that encapsulates a boundary.
556     """
557     # Stream at beginning of header, look for end of header
558     # and parse it if found. The header must fit within one
559     # chunk.
560     chunk = stream.read(max_header_size)
561
562     # 'find' returns the top of these four bytes, so we'll
563     # need to munch them later to prevent them from polluting
564     # the payload.
565     header_end = chunk.find('\r\n\r\n')
566
567     def _parse_header(line):
568         main_value_pair, params = parse_header(line)
569         try:
570             name, value = main_value_pair.split(':', 1)
571         except:
572             raise ValueError("Invalid header: %r" % line)
573         return name, (value, params)
574
575     if header_end == -1:
576         # we find no header, so we just mark this fact and pass on
577         # the stream verbatim
578         stream.unget(chunk)
579         return (RAW, {}, stream)
580
581     header = chunk[:header_end]
582
583     # here we place any excess chunk back onto the stream, as
584     # well as throwing away the CRLFCRLF bytes from above.
585     stream.unget(chunk[header_end + 4:])
586
587     TYPE = RAW
588     outdict = {}
589
590     # Eliminate blank lines
591     for line</