Ticket #2070: 4459-streaming-file-upload.2.diff

File 4459-streaming-file-upload.2.diff, 26.7 KB (added by Joakim Sernbrant <serbaut@…>, 8 years ago)

Added FILE_UPLOAD_MIN_SIZE (default 100 kB) to define the minimum request size for streaming to disk. Propagate exceptions. I'm not too happy with the names of the settings anymore :/

  • django/http/__init__.py

     
    1 import os
     1import os, pickle
    22from Cookie import SimpleCookie
    33from pprint import pformat
    44from urllib import urlencode, quote
    55from django.utils.datastructures import MultiValueDict
    66
     7try:
     8    from cStringIO import StringIO
     9except ImportError:
     10    from StringIO import StringIO
     11
    712RESERVED_CHARS="!*'();:@&=+$,/?%#[]"
    813
    914try:
     
    4247    def is_secure(self):
    4348        return os.environ.get("HTTPS") == "on"
    4449
    45 def parse_file_upload(header_dict, post_data):
    46     "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"
    47     import email, email.Message
    48     from cgi import parse_header
    49     raw_message = '\r\n'.join(['%s:%s' % pair for pair in header_dict.items()])
    50     raw_message += '\r\n\r\n' + post_data
    51     msg = email.message_from_string(raw_message)
    52     POST = MultiValueDict()
    53     FILES = MultiValueDict()
    54     for submessage in msg.get_payload():
    55         if isinstance(submessage, email.Message.Message):
    56             name_dict = parse_header(submessage['Content-Disposition'])[1]
    57             # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
    58             # or {'name': 'blah'} for POST fields
    59             # We assume all uploaded files have a 'filename' set.
    60             if name_dict.has_key('filename'):
    61                 assert type([]) != type(submessage.get_payload()), "Nested MIME messages are not supported"
    62                 if not name_dict['filename'].strip():
    63                     continue
    64                 # IE submits the full path, so trim everything but the basename.
    65                 # (We can't use os.path.basename because it expects Linux paths.)
    66                 filename = name_dict['filename'][name_dict['filename'].rfind("\\")+1:]
    67                 FILES.appendlist(name_dict['name'], {
    68                     'filename': filename,
    69                     'content-type': (submessage.has_key('Content-Type') and submessage['Content-Type'] or None),
    70                     'content': submessage.get_payload(),
    71                 })
     50def parse_file_upload(headers, input):
     51    from django.conf import settings
     52
     53    # Only stream files to disk if FILE_STREAMING_DIR is set
     54    file_upload_dir = getattr(settings, 'FILE_UPLOAD_DIR', None)
     55    file_upload_min_size = getattr(settings, 'FILE_UPLOAD_MIN_SIZE', 100000)
     56
     57    try:
     58        parser = MultiPartParser(headers, input, file_upload_dir, file_upload_min_size)
     59        return parser.parse()
     60    except MultiPartParserError, e:
     61        return MultiValueDict({ '_file_upload_error': [e.message] }), {}
     62
     63class MultiPartParserError(Exception):
     64    def __init__(self, message):
     65        self.message = message
     66    def __str__(self):
     67        return repr(self.message)
     68       
     69class MultiPartParser(object):
     70    """
     71    A rfc2388 multipart/form-data parser.
     72   
     73    parse() reads the input stream in chunk_size chunks and returns a
     74    tuple of (POST MultiValueDict, FILES MultiValueDict). If
     75    file_upload_dir is defined files will be streamed to temporary
     76    files in the specified directory.
     77
     78    The FILES dictionary will have 'filename', 'content-type',
     79    'content' and 'content-length' entries. For streamed files it will
     80    also have 'tmpfilename' and 'tmpfile'. The 'content' entry will
     81    only be read from disk when referenced for streamed files.
     82
     83    If the header X-Progress-ID is sent with a 32 character hex string
     84    a temporary file with the same name will be created in
     85    ``file_upload_dir`` with a pickled { 'received', 'size' }
     86    dictionary with the number of bytes received and the size expected
     87    respectively. The file will be unlinked when the parser finishes.
     88
     89    """
     90
     91    def __init__(self, headers, input, file_upload_dir=None, file_upload_min_size=None, chunk_size=1024*64):
     92        try:
     93            content_length = int(headers['Content-Length'])
     94        except:
     95            raise MultiPartParserError('Invalid Content-Length: %s' % headers.get('Content-Length'))
     96
     97        content_type = headers.get('Content-Type')
     98
     99        if not content_type or not content_type.startswith('multipart/'):
     100            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
     101           
     102        ctype, opts = self.parse_header(content_type)
     103        boundary = opts.get('boundary')
     104        from cgi import valid_boundary
     105        if not boundary or not valid_boundary(boundary):
     106            raise MultiPartParserError('Invalid boundary in multipart form: %s' % boundary)
     107
     108        # check if we got a valid X-Progress-ID id
     109        progress_id = headers.get('X-Progress-ID')
     110        if file_upload_dir and progress_id:
     111            import re
     112            if re.match(r'^[0-9a-zA-Z]{32}$', progress_id):
     113                self._progress_filename = os.path.join(file_upload_dir, progress_id)
    72114            else:
    73                 POST.appendlist(name_dict['name'], submessage.get_payload())
    74     return POST, FILES
     115                raise MultiPartParserError('Invalid X-Progress-ID: %s' % progress_id)
     116        else:
     117            self._progress_filename = None
    75118
     119        self._boundary = '--' + boundary
     120        self._input = input
     121        self._size = content_length
     122        self._received = 0
     123        self._file_upload_dir = file_upload_dir
     124        self._chunk_size = chunk_size
     125        self._state = 'PREAMBLE'
     126        self._partial = ''
     127        self._post = MultiValueDict()
     128        self._files = MultiValueDict()
     129
     130        if file_upload_min_size is not None and content_length < file_upload_min_size:
     131            self._file_upload_dir = None # disable file streaming for small request
     132
     133        try:
     134            # use mx fast string search if available
     135            from mx.TextTools import FS
     136            self._fs = FS(self._boundary)
     137        except ImportError:
     138            self._fs = None
     139
    def parse(self):
        """
        Run the parser over the whole input.

        Returns a tuple of (POST MultiValueDict, FILES MultiValueDict). The
        progress file, if one is in use, is removed whether or not parsing
        succeeds.
        """
        try:
            self._parse()
        finally:
            progress_path = self._progress_filename
            if progress_path:
                try:
                    os.unlink(progress_path)
                except OSError:
                    # The progress file may never have been written.
                    pass

        return self._post, self._files
     151
     152    def _parse(self):
     153        size = self._size
     154
     155        try:
     156            while size > 0:
     157                n = self._read(self._input, min(self._chunk_size, size))
     158                if not n:
     159                    break
     160                size -= n
     161        except:
     162            # consume any remaining data so we dont generate a "Connection Reset" error
     163            size = self._size - self._received
     164            while size > 0:
     165                data = self._input.read(min(self._chunk_size, size))
     166                size -= len(data)
     167            raise
     168
     169    def _find_boundary(self, data, start, stop):
     170        """
     171        Find the next boundary and return the end of current part
     172        and start of next part.
     173        """
     174        if self._fs:
     175            boundary = self._fs.find(data, start, stop)
     176        else:
     177            boundary = data.find(self._boundary, start, stop)
     178        if boundary >= 0:
     179            end = boundary
     180            next = boundary + len(self._boundary)
     181
     182            # backup over CRLF
     183            if end > 0 and data[end-1] == '\n': end -= 1
     184            if end > 0 and data[end-1] == '\r': end -= 1
     185            # skip over --CRLF
     186            if next < stop and data[next] == '-': next += 1
     187            if next < stop and data[next] == '-': next += 1
     188            if next < stop and data[next] == '\r': next += 1
     189            if next < stop and data[next] == '\n': next += 1
     190
     191            return True, end, next
     192        else:
     193            return False, stop, stop
     194
    class TemporaryFile(object):
        """
        A temporary file that tries to delete itself when garbage collected.

        The file is created with a '.upload' suffix in `dir`; its path is
        available as `name`. All other attribute access is delegated to the
        underlying file object.
        """
        def __init__(self, dir):
            import tempfile
            (fd, name) = tempfile.mkstemp(suffix='.upload', dir=dir)
            try:
                self.file = os.fdopen(fd, 'w+b')
            except:
                # Don't leak the descriptor or leave the file behind if it
                # cannot be wrapped; re-raise the original error.
                os.close(fd)
                os.unlink(name)
                raise
            self.name = name

        def __getattr__(self, name):
            # Only invoked for attributes not found on the wrapper itself.
            try:
                wrapped = self.__dict__['file']
            except KeyError:
                # Instance was never fully initialised.
                raise AttributeError(name)
            a = getattr(wrapped, name)
            # Cache methods to speed up repeated calls, but never cache state
            # such as `closed` or `softspace`, which changes over time.
            if callable(a):
                setattr(self, name, a)
            return a

        def __del__(self):
            try:
                os.unlink(self.name)
            except (OSError, AttributeError):
                # Already removed/renamed, or creation failed before a name
                # was assigned.
                pass
     214           
    class LazyContent(dict):
        """
        A lazy FILES dictionary entry that reads the contents from
        tmpfile only when referenced.

        The 'content' key is filled in from the 'tmpfile' entry the first
        time it is looked up with item access (entry['content']); the result
        is then stored in the dictionary.
        """
        def __init__(self, data):
            dict.__init__(self, data)

        def __getitem__(self, key):
            if key == 'content' and key not in self:
                # Fetch the file object directly from the dict rather than
                # re-entering this method.
                tmpfile = dict.__getitem__(self, 'tmpfile')
                tmpfile.seek(0)
                self['content'] = tmpfile.read()
            return dict.__getitem__(self, key)
     228
     229    def _read(self, input, size):
     230        data = input.read(size)
     231
     232        if not data:
     233            return 0
     234
     235        read_size = len(data)
     236        self._received += read_size
     237
     238        if self._partial:
     239            data = self._partial + data
     240
     241        start = 0
     242        stop = len(data)
     243       
     244        while start < stop:
     245            boundary, end, next = self._find_boundary(data, start, stop)
     246
     247            if not boundary and read_size:
     248                # make sure we dont treat a partial boundary (and its separators) as data
     249                stop -= len(self._boundary) + 16
     250                end = next = stop
     251                if end <= start:
     252                    break # need more data
     253
     254            if self._state == 'PREAMBLE':
     255                # Preamble, just ignore it
     256                self._state = 'HEADER'
     257
     258            elif self._state == 'HEADER':
     259                # Beginning of header, look for end of header and parse it if found.
     260
     261                header_end = data.find('\r\n\r\n', start, stop)
     262                if header_end == -1:
     263                    break # need more data
     264
     265                header = data[start:header_end]
     266
     267                self._fieldname = None
     268                self._filename = None
     269                self._content_type = None
     270
     271                for line in header.split('\r\n'):
     272                    ctype, opts = self.parse_header(line)
     273                    if ctype == 'content-disposition: form-data':
     274                        self._fieldname = opts.get('name')
     275                        self._filename = opts.get('filename')
     276                    elif ctype.startswith('content-type: '):
     277                        self._content_type = ctype[14:]
     278
     279                if self._filename is not None:
     280                    # cleanup filename from IE full paths:
     281                    self._filename = self._filename[self._filename.rfind("\\")+1:].strip()
     282
     283                    if self._filename: # ignore files without filenames
     284                        if self._file_upload_dir:
     285                            try:
     286                                self._file = self.TemporaryFile(dir=self._file_upload_dir)
     287                            except:
     288                                raise MultiPartParserError("Failed to create temporary file.")
     289                        else:
     290                            self._file = StringIO()
     291                    else:
     292                        self._file = None
     293                    self._filesize = 0
     294                    self._state = 'FILE'
     295                else:
     296                    self._field = StringIO()
     297                    self._state = 'FIELD'
     298                next = header_end + 4
     299
     300            elif self._state == 'FIELD':
     301                # In a field, collect data until a boundary is found.
     302
     303                self._field.write(data[start:end])
     304                if boundary:
     305                    if self._fieldname:
     306                        self._post.appendlist(self._fieldname, self._field.getvalue())
     307                    self._field.close()
     308                    self._state = 'HEADER'
     309
     310            elif self._state == 'FILE':
     311                # In a file, collect data until a boundary is found.
     312
     313                if self._file:
     314                    try:
     315                        self._file.write(data[start:end])
     316                    except IOError, e:
     317                        raise MultiPartParserError("Failed to write to temporary file.")
     318                    self._filesize += end-start
     319
     320                    if self._progress_filename:
     321                        f = open(os.path.join(self._file_upload_dir, self._progress_filename), 'w')
     322                        pickle.dump({ 'received': self._received, 'size': self._size }, f)
     323                        f.close()
     324
     325                if boundary:
     326                    if self._file:
     327                        if self._file_upload_dir:
     328                            self._file.seek(0)
     329                            file = self.LazyContent({
     330                                'filename': self._filename,
     331                                'content-type':  self._content_type,
     332                                # 'content': is read on demand
     333                                'content-length': self._filesize,
     334                                'tmpfilename': self._file.name,
     335                                'tmpfile': self._file
     336                            })
     337                        else:
     338                            file = {
     339                                'filename': self._filename,
     340                                'content-type':  self._content_type,
     341                                'content': self._file.getvalue(),
     342                                'content-length': self._filesize
     343                            }
     344                            self._file.close()
     345
     346                        self._files.appendlist(self._fieldname, file)
     347
     348                    self._state = 'HEADER'
     349
     350            start = next
     351               
     352        self._partial = data[start:]
     353
     354        return read_size
     355
    def parse_header(self, line):
        "Parses a header line with cgi.parse_header() and returns its result."
        import cgi
        return cgi.parse_header(line)
     359
     360
    76361class QueryDict(MultiValueDict):
    77362    """A specialized MultiValueDict that takes a query string when initialized.
    78363    This is immutable unless you create a copy of it."""
     
    302587    if not host:
    303588        host = request.META.get('HTTP_HOST', '')
    304589    return host
     590
  • django/oldforms/__init__.py

     
    661661        self.validator_list = [self.isNonEmptyFile] + validator_list
    662662
    663663    def isNonEmptyFile(self, field_data, all_data):
    664         try:
    665             content = field_data['content']
    666         except TypeError:
     664        if field_data.has_key('_file_upload_error'):
     665            raise validators.CriticalValidationError, field_data['_file_upload_error']
     666        if not field_data.has_key('filename'):
    667667            raise validators.CriticalValidationError, gettext("No file was submitted. Check the encoding type on the form.")
    668         if not content:
     668        if not field_data['content-length']:
    669669            raise validators.CriticalValidationError, gettext("The submitted file is empty.")
    670670
    671671    def render(self, data):
    672672        return '<input type="file" id="%s" class="v%s" name="%s" />' % \
    673673            (self.get_id(), self.__class__.__name__, self.field_name)
    674674
     675    def prepare(self, new_data):
     676        if new_data.has_key('_file_upload_error'):
     677            # pretend we got something in the field to raise a validation error later
     678            new_data[self.field_name] = { '_file_upload_error': new_data['_file_upload_error'] }
     679
    675680    def html2python(data):
    676681        if data is None:
    677682            raise EmptyValue
  • django/db/models/base.py

     
    321321    def _get_FIELD_size(self, field):
    322322        return os.path.getsize(self._get_FIELD_filename(field))
    323323
    324     def _save_FIELD_file(self, field, filename, raw_contents):
     324    def _save_FIELD_file(self, field, filename, raw_field):
    325325        directory = field.get_directory_name()
    326326        try: # Create the date-based directory if it doesn't exist.
    327327            os.makedirs(os.path.join(settings.MEDIA_ROOT, directory))
     
    343343        setattr(self, field.attname, filename)
    344344
    345345        full_filename = self._get_FIELD_filename(field)
    346         fp = open(full_filename, 'wb')
    347         fp.write(raw_contents)
    348         fp.close()
     346        if raw_field.has_key('tmpfilename'):
     347            raw_field['tmpfile'].close()
     348            os.rename(raw_field['tmpfilename'], full_filename)
     349        else:
     350            fp = open(full_filename, 'wb')
     351            fp.write(raw_field['content'])
     352            fp.close()
    349353
    350354        # Save the width and/or height, if applicable.
    351355        if isinstance(field, ImageField) and (field.width_field or field.height_field):
  • django/db/models/fields/__init__.py

     
    625625        setattr(cls, 'get_%s_filename' % self.name, curry(cls._get_FIELD_filename, field=self))
    626626        setattr(cls, 'get_%s_url' % self.name, curry(cls._get_FIELD_url, field=self))
    627627        setattr(cls, 'get_%s_size' % self.name, curry(cls._get_FIELD_size, field=self))
    628         setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_contents: instance._save_FIELD_file(self, filename, raw_contents))
     628        setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_field: instance._save_FIELD_file(self, filename, raw_field))
    629629        dispatcher.connect(self.delete_file, signal=signals.post_delete, sender=cls)
    630630
    631631    def delete_file(self, instance):
     
    648648        if new_data.get(upload_field_name, False):
    649649            func = getattr(new_object, 'save_%s_file' % self.name)
    650650            if rel:
    651                 func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0]["content"])
     651                func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0])
    652652            else:
    653                 func(new_data[upload_field_name]["filename"], new_data[upload_field_name]["content"])
     653                func(new_data[upload_field_name]["filename"], new_data[upload_field_name])
    654654
    655655    def get_directory_name(self):
    656656        return os.path.normpath(datetime.datetime.now().strftime(self.upload_to))
  • django/core/handlers/wsgi.py

     
    111111            if self.environ.get('CONTENT_TYPE', '').startswith('multipart'):
    112112                header_dict = dict([(k, v) for k, v in self.environ.items() if k.startswith('HTTP_')])
    113113                header_dict['Content-Type'] = self.environ.get('CONTENT_TYPE', '')
    114                 self._post, self._files = http.parse_file_upload(header_dict, self.raw_post_data)
     114                header_dict['Content-Length'] = self.environ.get('CONTENT_LENGTH', '')
     115                header_dict['X-Progress-ID'] = self.environ.get('HTTP_X_PROGRESS_ID', '')
     116                try:
     117                    self._post, self._files = http.parse_file_upload(header_dict, self.environ['wsgi.input'])
     118                except:
     119                    self._post, self._files = {}, {} # make sure we dont read the input stream again
     120                    raise
     121                self._raw_post_data = None # raw data is not available for streamed multipart messages
    115122            else:
    116123                self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    117124        else:
  • django/core/handlers/modpython.py

     
    4747    def _load_post_and_files(self):
    4848        "Populates self._post and self._files"
    4949        if self._req.headers_in.has_key('content-type') and self._req.headers_in['content-type'].startswith('multipart'):
    50             self._post, self._files = http.parse_file_upload(self._req.headers_in, self.raw_post_data)
     50            self._raw_post_data = None # raw data is not available for streamed multipart messages
     51            try:
     52                self._post, self._files = http.parse_file_upload(self._req.headers_in, self._req)
     53            except:
     54                self._post, self._files = {}, {} # make sure we dont read the input stream again
     55                raise
    5156        else:
    5257            self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    5358
  • tests/modeltests/test_client/views.py

     
    2222       
    2323    return HttpResponse(t.render(c))
    2424   
     25def post_file_view(request):
     26    "A view that expects a multipart post and returns a file in the context"
     27    t = Template('File {{ file.filename }} received', name='POST Template')
     28    c = Context({'file': request.FILES['file_file']})
     29    return HttpResponse(t.render(c))
     30
    2531def redirect_view(request):
    2632    "A view that redirects all requests to the GET view"
    2733    return HttpResponseRedirect('/test_client/get_view/')
     
    3238    c = Context({'user': request.user})
    3339   
    3440    return HttpResponse(t.render(c))
    35 login_protected_view = login_required(login_protected_view)
    36  No newline at end of file
     41login_protected_view = login_required(login_protected_view)
  • tests/modeltests/test_client/models.py

     
    6666        self.assertEqual(response.template.name, 'POST Template')
    6767        self.failUnless('Data received' in response.content)
    6868       
    def test_post_file_view(self):
        "POST this python file to a view, both in memory and streamed to disk"
        import os, tempfile
        from django.conf import settings
        filename = __file__.replace('.pyc', '.py')

        # Remember the global settings so other tests are not affected.
        old_upload_dir = getattr(settings, 'FILE_UPLOAD_DIR', None)
        old_min_size = getattr(settings, 'FILE_UPLOAD_MIN_SIZE', 100000)
        try:
            # This file is far smaller than the default FILE_UPLOAD_MIN_SIZE,
            # so lower the threshold or the streaming branch would never run.
            settings.FILE_UPLOAD_MIN_SIZE = 0
            for upload_dir in [None, tempfile.gettempdir()]:
                settings.FILE_UPLOAD_DIR = upload_dir
                # Binary mode so the posted length matches os.path.getsize()
                # on every platform.
                upload = open(filename, 'rb')
                try:
                    post_data = { 'name': filename, 'file': upload }
                    response = self.client.post('/test_client/post_file_view/', post_data)
                finally:
                    upload.close()
                self.failUnless('models.py' in response.context['file']['filename'])
                self.failUnless(len(response.context['file']['content']) == os.path.getsize(filename))
                if upload_dir:
                    self.failUnless(response.context['file']['tmpfilename'])
        finally:
            settings.FILE_UPLOAD_DIR = old_upload_dir
            settings.FILE_UPLOAD_MIN_SIZE = old_min_size
     82       
    6983    def test_redirect(self):
    7084        "GET a URL that redirects elsewhere"
    7185        response = self.client.get('/test_client/redirect_view/')
  • tests/modeltests/test_client/urls.py

     
    44urlpatterns = patterns('',
    55    (r'^get_view/$', views.get_view),
    66    (r'^post_view/$', views.post_view),
     7    (r'^post_file_view/$', views.post_file_view),
    78    (r'^redirect_view/$', views.redirect_view),
    89    (r'^login_protected_view/$', views.login_protected_view),
    910)
  • docs/request_response.txt

     
    7272``FILES``
    7373    A dictionary-like object containing all uploaded files. Each key in
    7474    ``FILES`` is the ``name`` from the ``<input type="file" name="" />``. Each
    75     value in ``FILES`` is a standard Python dictionary with the following three
     75    value in ``FILES`` is a standard Python dictionary with the following four
    7676    keys:
    7777
    7878        * ``filename`` -- The name of the uploaded file, as a Python string.
    7979        * ``content-type`` -- The content type of the uploaded file.
    8080        * ``content`` -- The raw content of the uploaded file.
     81        * ``content-length`` -- The length of the content in bytes.
    8182
     83    If streaming file uploads are enabled two additional keys
     84    describing the uploaded file will be present:
     85
     86        * ``tmpfilename`` -- The filename for the temporary file.
     87        * ``tmpfile`` -- An open file object for the temporary file.
     88
     89    The temporary file will be removed when the request finishes.
     90
     91    Note that accessing ``content`` when streaming uploads are enabled
     92    will read the whole file into memory which may not be what you want.
     93
    8294    Note that ``FILES`` will only contain data if the request method was POST
    8395    and the ``<form>`` that posted to the request had
    8496    ``enctype="multipart/form-data"``. Otherwise, ``FILES`` will be a blank
  • docs/settings.txt

     
    409409or ``django.core.mail.mail_managers``. You'll probably want to include the
    410410trailing space.
    411411
     412FILE_UPLOAD_DIR
     413---------------
     414
     415Default: Not defined
     416
     417Path to a directory where temporary files should be written during
     418file uploads. Leaving this unset will read files into memory.
     419
     420FILE_UPLOAD_MIN_SIZE
     421--------------------
     422
     423Default: 100000
     424
     425An integer specifying the minimum number of bytes that has to be
     426received for file upload streaming to take place. Any request smaller
     427than this will be handled in memory. Note: ``FILE_UPLOAD_DIR`` has to
     428be defined to enable streaming.
     429
    412430IGNORABLE_404_ENDS
    413431------------------
    414432
  • docs/forms.txt

     
    454454   new_data = request.POST.copy()
    455455   new_data.update(request.FILES)
    456456
     457Streaming file uploads.
     458-----------------------
     459
     460File uploads will be read into memory by default. This works fine for
     461small to medium sized uploads (from 1MB to 100MB depending on your
     462setup and usage). If you want to support larger uploads you can enable
     463upload streaming where only a small part of the file will be in memory
     464at any time. To do this you need to specify the ``FILE_UPLOAD_DIR``
     465setting (see the settings_ document for more details).
     466
     467See `request object`_ for more details about ``request.FILES`` objects
     468with streaming file uploads enabled.
     469
    457470Validators
    458471==========
    459472
     
    668681.. _`generic views`: ../generic_views/
    669682.. _`models API`: ../model_api/
    670683.. _settings: ../settings/
     684.. _request object: ../request_response/#httprequest-objects
Back to Top