Django

Code

root/django/trunk/django/http/multipartparser.py

Revision 8047, 22.1 kB (checked in by adrian, 1 month ago)

Fixed #7848 -- Removed a bunch of code that wasn't contributing to society. Thanks, julien

Line 
1 """
2 Multi-part parsing for file uploads.
3
4 Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
5 file upload handlers for processing.
6 """
7
8 import cgi
9 from django.conf import settings
10 from django.core.exceptions import SuspiciousOperation
11 from django.utils.datastructures import MultiValueDict
12 from django.utils.encoding import force_unicode
13 from django.utils.text import unescape_entities
14 from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers
15
16 __all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
17
class MultiPartParserError(Exception):
    """Error raised by the multipart parser for requests it cannot process."""
20
class InputStreamExhausted(Exception):
    """
    Signals that no further reads are allowed from this device.
    """
26
# Tags used to classify each parsed section of the multipart body.
RAW, FILE, FIELD = "raw", "file", "field"
30
31 class MultiPartParser(object):
32     """
33     A rfc2388 multipart/form-data parser.
34
35     ``MultiValueDict.parse()`` reads the input stream in ``chunk_size`` chunks
36     and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``. If
37     ``file_upload_dir`` is defined files will be streamed to temporary files in
38     that directory.
39     """
40     def __init__(self, META, input_data, upload_handlers, encoding=None):
41         """
42         Initialize the MultiPartParser object.
43
44         :META:
45             The standard ``META`` dictionary in Django request objects.
46         :input_data:
47             The raw post data, as a bytestring.
48         :upload_handler:
49             An UploadHandler instance that performs operations on the uploaded
50             data.
51         :encoding:
52             The encoding with which to treat the incoming data.
53         """
54
55         #
56         # Content-Type should containt multipart and the boundary information.
57         #
58
59         content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
60         if not content_type.startswith('multipart/'):
61             raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
62
63         # Parse the header to get the boundary to split the parts.
64         ctypes, opts = parse_header(content_type)
65         boundary = opts.get('boundary')
66         if not boundary or not cgi.valid_boundary(boundary):
67             raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)
68
69
70         #
71         # Content-Length should contain the length of the body we are about
72         # to receive.
73         #
74         try:
75             content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH',0)))
76         except (ValueError, TypeError):
77             # For now set it to 0; we'll try again later on down.
78             content_length = 0
79
80         if content_length <= 0:
81             # This means we shouldn't continue...raise an error.
82             raise MultiPartParserError("Invalid content length: %r" % content_length)
83
84         self._boundary = boundary
85         self._input_data = input_data
86
87         # For compatibility with low-level network APIs (with 32-bit integers),
88         # the chunk size should be < 2^31, but still divisible by 4.
89         self._chunk_size = min(2**31-4, *[x.chunk_size for x in upload_handlers if x.chunk_size])
90
91         self._meta = META
92         self._encoding = encoding or settings.DEFAULT_CHARSET
93         self._content_length = content_length
94         self._upload_handlers = upload_handlers
95
96     def parse(self):
97         """
98         Parse the POST data and break it into a FILES MultiValueDict and a POST
99         MultiValueDict.
100
101         Returns a tuple containing the POST and FILES dictionary, respectively.
102         """
103         # We have to import QueryDict down here to avoid a circular import.
104         from django.http import QueryDict
105
106         encoding = self._encoding
107         handlers = self._upload_handlers
108
109         limited_input_data = LimitBytes(self._input_data, self._content_length)
110
111         # See if the handler will want to take care of the parsing.
112         # This allows overriding everything if somebody wants it.
113         for handler in handlers:
114             result = handler.handle_raw_input(limited_input_data,
115                                               self._meta,
116                                               self._content_length,
117                                               self._boundary,
118                                               encoding)
119             if result is not None:
120                 return result[0], result[1]
121
122         # Create the data structures to be used later.
123         self._post = QueryDict('', mutable=True)
124         self._files = MultiValueDict()
125
126         # Instantiate the parser and stream:
127         stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))
128
129         # Whether or not to signal a file-completion at the beginning of the loop.
130         old_field_name = None
131         counters = [0] * len(handlers)
132
133         try:
134             for item_type, meta_data, field_stream in Parser(stream, self._boundary):
135                 if old_field_name:
136                     # We run this at the beginning of the next loop
137                     # since we cannot be sure a file is complete until
138                     # we hit the next boundary/part of the multipart content.
139                     self.handle_file_complete(old_field_name, counters)
140                     old_field_name = None
141
142                 try:
143                     disposition = meta_data['content-disposition'][1]
144                     field_name = disposition['name'].strip()
145                 except (KeyError, IndexError, AttributeError):
146                     continue
147
148                 transfer_encoding = meta_data.get('content-transfer-encoding')
149                 field_name = force_unicode(field_name, encoding, errors='replace')
150
151                 if item_type == FIELD:
152                     # This is a post field, we can just set it in the post
153                     if transfer_encoding == 'base64':
154                         raw_data = field_stream.read()
155                         try:
156                             data = str(raw_data).decode('base64')
157                         except:
158                             data = raw_data
159                     else:
160                         data = field_stream.read()
161
162                     self._post.appendlist(field_name,
163                                           force_unicode(data, encoding, errors='replace'))
164                 elif item_type == FILE:
165                     # This is a file, use the handler...
166                     file_name = disposition.get('filename')
167                     if not file_name:
168                         continue
169                     file_name = force_unicode(file_name, encoding, errors='replace')
170                     file_name = self.IE_sanitize(unescape_entities(file_name))
171
172                     content_type = meta_data.get('content-type', ('',))[0].strip()
173                     try:
174                         charset = meta_data.get('content-type', (0,{}))[1].get('charset', None)
175                     except:
176                         charset = None
177
178                     try:
179                         content_length = int(meta_data.get('content-length')[0])
180                     except (IndexError, TypeError, ValueError):
181                         content_length = None
182
183                     counters = [0] * len(handlers)
184                     try:
185                         for handler in handlers:
186                             try:
187                                 handler.new_file(field_name, file_name,
188                                                  content_type, content_length,
189                                                  charset)
190                             except StopFutureHandlers:
191                                 break
192
193                         for chunk in field_stream:
194                             if transfer_encoding == 'base64':
195                                 # We only special-case base64 transfer encoding
196                                 try:
197                                     chunk = str(chunk).decode('base64')
198                                 except Exception, e:
199                                     # Since this is only a chunk, any error is an unfixable error.
200                                     raise MultiPartParserError("Could not decode base64 data: %r" % e)
201
202                             for i, handler in enumerate(handlers):
203                                 chunk_length = len(chunk)
204                                 chunk = handler.receive_data_chunk(chunk,
205                                                                    counters[i])
206                                 counters[i] += chunk_length
207                                 if chunk is None:
208                                     # If the chunk received by the handler is None, then don't continue.
209                                     break
210
211                     except SkipFile, e:
212                         # Just use up the rest of this file...
213                         exhaust(field_stream)
214                     else:
215                         # Handle file upload completions on next iteration.
216                         old_field_name = field_name
217                 else:
218                     # If this is neither a FIELD or a FILE, just exhaust the stream.
219                     exhaust(stream)
220         except StopUpload, e:
221             if not e.connection_reset:
222                 exhaust(limited_input_data)
223         else:
224             # Make sure that the request data is all fed
225             exhaust(limited_input_data)
226
227         # Signal that the upload has completed.
228         for handler in handlers:
229             retval = handler.upload_complete()
230             if retval:
231                 break
232
233         return self._post, self._files
234
235     def handle_file_complete(self, old_field_name, counters):
236         """
237         Handle all the signalling that takes place when a file is complete.
238         """
239         for i, handler in enumerate(self._upload_handlers):
240             file_obj = handler.file_complete(counters[i])
241             if file_obj:
242                 # If it returns a file object, then set the files dict.
243                 self._files.appendlist(force_unicode(old_field_name,
244                                                      self._encoding,
245                                                      errors='replace'),
246                                        file_obj)
247                 break
248
249     def IE_sanitize(self, filename):
250         """Cleanup filename from Internet Explorer full paths."""
251         return filename and filename[filename.rfind("\\")+1:].strip()
252
253 class LazyStream(object):
254     """
255     The LazyStream wrapper allows one to get and "unget" bytes from a stream.
256
257     Given a producer object (an iterator that yields bytestrings), the
258     LazyStream object will support iteration, reading, and keeping a "look-back"
259     variable in case you need to "unget" some bytes.
260     """
261     def __init__(self, producer, length=None):
262         """
263         Every LazyStream must have a producer when instantiated.
264
265         A producer is an iterable that returns a string each time it
266         is called.
267         """
268         self._producer = producer
269         self._empty = False
270         self._leftover = ''
271         self.length = length
272         self.position = 0
273         self._remaining = length
274         self._unget_history = []
275
276     def tell(self):
277         return self.position
278
279     def read(self, size=None):
280         def parts():
281             remaining = (size is not None and [size] or [self._remaining])[0]
282             # do the whole thing in one shot if no limit was provided.
283             if remaining is None:
284                 yield ''.join(self)
285                 return
286
287             # otherwise do some bookkeeping to return exactly enough
288             # of the stream and stashing any extra content we get from
289             # the producer
290             while remaining != 0:
291                 assert remaining > 0, 'remaining bytes to read should never go negative'
292
293                 chunk = self.next()
294
295                 emitting = chunk[:remaining]
296                 self.unget(chunk[remaining:])
297                 remaining -= len(emitting)
298                 yield emitting
299
300         out = ''.join(parts())
301         return out
302
303     def next(self):
304         """
305         Used when the exact number of bytes to read is unimportant.
306
307         This procedure just returns whatever is chunk is conveniently returned
308         from the iterator instead. Useful to avoid unnecessary bookkeeping if
309         performance is an issue.
310         """
311         if self._leftover:
312             output = self._leftover
313             self._leftover = ''
314         else:
315             output = self._producer.next()
316             self._unget_history = []
317         self.position += len(output)
318         return output
319
320     def close(self):
321         """
322         Used to invalidate/disable this lazy stream.
323
324         Replaces the producer with an empty list. Any leftover bytes that have
325         already been read will still be reported upon read() and/or next().
326         """
327         self._producer = []
328
329     def __iter__(self):
330         return self
331
332     def unget(self, bytes):
333         """
334         Places bytes back onto the front of the lazy stream.
335
336         Future calls to read() will return those bytes first. The
337         stream position and thus tell() will be rewound.
338         """
339         if not bytes:
340             return
341         self._update_unget_history(len(bytes))
342         self.position -= len(bytes)
343         self._leftover = ''.join([bytes, self._leftover])
344
345     def _update_unget_history(self, num_bytes):
346         """
347         Updates the unget history as a sanity check to see if we've pushed
348         back the same number of bytes in one chunk. If we keep ungetting the
349         same number of bytes many times (here, 50), we're mostly likely in an
350         infinite loop of some sort. This is usually caused by a
351         maliciously-malformed MIME request.
352         """
353         self._unget_history = [num_bytes] + self._unget_history[:49]
354         number_equal = len([current_number for current_number in self._unget_history
355                             if current_number == num_bytes])
356
357         if number_equal > 40:
358             raise SuspiciousOperation(
359                 "The multipart parser got stuck, which shouldn't happen with"
360                 " normal uploaded files. Check for malicious upload activity;"
361                 " if there is none, report this to the Django developers."
362             )
363
364 class ChunkIter(object):
365     """
366     An iterable that will yield chunks of data. Given a file-like object as the
367     constructor, this object will yield chunks of read operations from that
368     object.
369     """
370     def __init__(self, flo, chunk_size=64 * 1024):
371         self.flo = flo
372         self.chunk_size = chunk_size
373
374     def next(self):
375         try:
376             data = self.flo.read(self.chunk_size)
377         except InputStreamExhausted:
378             raise StopIteration()
379         if data:
380             return data
381         else:
382             raise StopIteration()
383
384     def __iter__(self):
385         return self
386
class LimitBytes(object):
    """ Limit bytes for a file object. """
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.

        At most ``remaining`` bytes are requested from the file. Passing
        ``None`` or a negative ``num_bytes`` asks for everything that is
        left within the limit. If there isn't anything left, this will raise
        an InputStreamExhausted error.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None or num_bytes < 0:
            # A negative size conventionally means "read it all"; it must not
            # reach the underlying file, where it would bypass the limit.
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
407
class InterBoundaryIter(object):
    """
    A Producer that yields one LazyStream per boundary-delimited section of
    the underlying stream.
    """
    def __init__(self, stream, boundary):
        self._boundary = boundary
        self._stream = stream

    def __iter__(self):
        return self

    def next(self):
        # BoundaryIter signals an empty stream with InputStreamExhausted;
        # translate that into the end of iteration.
        try:
            section = BoundaryIter(self._stream, self._boundary)
            return LazyStream(section)
        except InputStreamExhausted:
            raise StopIteration()
424
425 class BoundaryIter(object):
426     """
427     A Producer that is sensitive to boundaries.
428
429     Will happily yield bytes until a boundary is found. Will yield the bytes
430     before the boundary, throw away the boundary bytes themselves, and push the
431     post-boundary bytes back on the stream.
432
433     The future calls to .next() after locating the boundary will raise a
434     StopIteration exception.
435     """
436
437     def __init__(self, stream, boundary):
438         self._stream = stream
439         self._boundary = boundary
440         self._done = False
441         # rollback an additional six bytes because the format is like
442         # this: CRLF<boundary>[--CRLF]
443         self._rollback = len(boundary) + 6
444
445         # Try to use mx fast string search if available. Otherwise
446         # use Python find. Wrap the latter for consistency.
447         unused_char = self._stream.read(1)
448         if not unused_char:
449             raise InputStreamExhausted()
450         self._stream.unget(unused_char)
451         try:
452             from mx.TextTools import FS
453             self._fs = FS(boundary).find
454         except ImportError:
455             self._fs = lambda data: data.find(boundary)
456
457     def __iter__(self):
458         return self
459
460     def next(self):
461         if self._done:
462             raise StopIteration()
463
464         stream = self._stream
465         rollback = self._rollback
466
467         bytes_read = 0
468         chunks = []
469         for bytes in stream:
470             bytes_read += len(bytes)
471             chunks.append(bytes)
472             if bytes_read > rollback:
473                 break
474             if not bytes:
475                 break
476         else:
477             self._done = True
478
479         if not chunks:
480             raise StopIteration()
481
482         chunk = ''.join(chunks)
483         boundary = self._find_boundary(chunk, len(chunk) < self._rollback)
484
485         if boundary:
486             end, next = boundary
487             stream.unget(chunk[next:])
488             self._done = True
489             return chunk[:end]
490         else:
491             # make sure we dont treat a partial boundary (and
492             # its separators) as data
493             if not chunk[:-rollback]:# and len(chunk) >= (len(self._boundary) + 6):
494                 # There's nothing left, we should just return and mark as done.
495                 self._done = True
496                 return chunk
497             else:
498                 stream.unget(chunk[-rollback:])
499                 return chunk[:-rollback]
500
501     def _find_boundary(self, data, eof = False):
502         """
503         Finds a multipart boundary in data.
504
505         Should no boundry exist in the data None is returned instead. Otherwise
506         a tuple containing the indices of the following are returned:
507
508          * the end of current encapsulation
509          * the start of the next encapsulation
510         """
511         index = self._fs(data)
512         if index < 0:
513             return None
514         else:
515             end = index
516             next = index + len(self._boundary)
517             # backup over CRLF
518             if data[max(0,end-1)] == '\n':
519                 end -= 1
520             if data[max(0,end-1)] == '\r':
521                 end -= 1
522             return end, next
523
def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Iterables are iterated to the end. An object that is not iterable but has
    a ``read`` attribute is treated as a stream and drained through
    ``ChunkIter`` in 16384-byte reads.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        if not hasattr(stream_or_iterable, 'read'):
            raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')
        iterator = ChunkIter(stream_or_iterable, 16384)

    for __ in iterator:
        pass
541
542 def parse_boundary_stream(stream, max_header_size):
543     """
544     Parses one and exactly one stream that encapsulates a boundary.
545     """
546     # Stream at beginning of header, look for end of header
547     # and parse it if found. The header must fit within one
548     # chunk.
549     chunk = stream.read(max_header_size)
550
551     # 'find' returns the top of these four bytes, so we'll
552     # need to munch them later to prevent them from polluting
553     # the payload.
554     header_end = chunk.find('\r\n\r\n')
555
556     def _parse_header(line):
557         main_value_pair, params = parse_header(line)
558         try:
559             name, value = main_value_pair.split(':', 1)
560         except:
561             raise ValueError("Invalid header: %r" % line)
562         return name, (value, params)
563
564     if header_end == -1:
565         # we find no header, so we just mark this fact and pass on
566         # the stream verbatim
567         stream.unget(chunk)
568         return (RAW, {}, stream)
569
570     header = chunk[:header_end]
571
572     # here we place any excess chunk back onto the stream, as
573     # well as throwing away the CRLFCRLF bytes from above.
574     stream.unget(chunk[header_end + 4:])
575
576     TYPE = RAW
577     outdict = {}
578
579     # Eliminate blank lines
580     for line in header.split('\r\n'):
581         # This terminology ("main value" and "dictionary of
582         # parameters") is from the Python docs.
583         try:
584             name, (value, params) = _parse_header(line)
585         except:
586             continue
587
588         if name == 'content-disposition':
589             TYPE = FIELD
590             if params.get('filename'):
591                 TYPE = FILE