Ticket #2070: 4459-streaming-file-upload.diff

File 4459-streaming-file-upload.diff, 25.9 KB (added by Joakim Sernbrant <serbaut@…>, 8 years ago)

Simplified streaming uploads

  • django/http/__init__.py

     
    1 import os
     1import os, pickle
    22from Cookie import SimpleCookie
    33from pprint import pformat
    44from urllib import urlencode, quote
    55from django.utils.datastructures import MultiValueDict
    66
     7try:
     8    from cStringIO import StringIO
     9except ImportError:
     10    from StringIO import StringIO
     11
    712RESERVED_CHARS="!*'();:@&=+$,/?%#[]"
    813
    914try:
     
    4247    def is_secure(self):
    4348        return os.environ.get("HTTPS") == "on"
    4449
    45 def parse_file_upload(header_dict, post_data):
    46     "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"
    47     import email, email.Message
    48     from cgi import parse_header
    49     raw_message = '\r\n'.join(['%s:%s' % pair for pair in header_dict.items()])
    50     raw_message += '\r\n\r\n' + post_data
    51     msg = email.message_from_string(raw_message)
    52     POST = MultiValueDict()
    53     FILES = MultiValueDict()
    54     for submessage in msg.get_payload():
    55         if isinstance(submessage, email.Message.Message):
    56             name_dict = parse_header(submessage['Content-Disposition'])[1]
    57             # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
    58             # or {'name': 'blah'} for POST fields
    59             # We assume all uploaded files have a 'filename' set.
    60             if name_dict.has_key('filename'):
    61                 assert type([]) != type(submessage.get_payload()), "Nested MIME messages are not supported"
    62                 if not name_dict['filename'].strip():
    63                     continue
    64                 # IE submits the full path, so trim everything but the basename.
    65                 # (We can't use os.path.basename because it expects Linux paths.)
    66                 filename = name_dict['filename'][name_dict['filename'].rfind("\\")+1:]
    67                 FILES.appendlist(name_dict['name'], {
    68                     'filename': filename,
    69                     'content-type': (submessage.has_key('Content-Type') and submessage['Content-Type'] or None),
    70                     'content': submessage.get_payload(),
    71                 })
     50def parse_file_upload(headers, input):
     51    from django.conf import settings
     52
     53    # Only stream files to disk if FILE_UPLOAD_DIR is set
     54    file_upload_dir = getattr(settings, 'FILE_UPLOAD_DIR', None)
     55
     56    try:
     57        parser = MultiPartParser(headers, input, file_upload_dir)
     58        return parser.parse()
     59    except MultiPartParserError, e:
     60        return MultiValueDict({ '_file_upload_error': [e.message] }), {}
     61    except Exception:
     62        return MultiValueDict({ '_file_upload_error': ["An unexpected error occured."] }), {}
     63
     64class MultiPartParserError(Exception):
     65    def __init__(self, message):
     66        self.message = message
     67    def __str__(self):
     68        return repr(self.message)
     69       
     70class MultiPartParser(object):
     71    """
     72    An RFC 2388 multipart/form-data parser.
     73   
     74    parse() reads the input stream in chunk_size chunks and returns a
     75    tuple of (POST MultiValueDict, FILES MultiValueDict). If
     76    file_upload_dir is defined files will be streamed to temporary
     77    files in the specified directory.
     78
     79    The FILES dictionary will have 'filename', 'content-type',
     80    'content' and 'content-length' entries. For streamed files it will
     81    also have 'tmpfilename' and 'tmpfile'. The 'content' entry will
     82    only be read from disk when referenced for streamed files.
     83
     84    If the header X-Progress-ID is sent with a 32 character hex string
     85    a temporary file with the same name will be created in
     86    ``file_upload_dir`` with a pickled { 'received', 'size' }
     87    dictionary with the number of bytes received and the size expected
     88    respectively. The file will be unlinked when the parser finishes.
     89
     90    """
     91
     92    def __init__(self, headers, input, file_upload_dir=None, file_upload_max_size=None, chunk_size=1024*64):
     93        try:
     94            content_length = int(headers['Content-Length'])
     95        except:
     96            raise MultiPartParserError('Invalid Content-Length: %s' % headers.get('Content-Length'))
     97
     98        content_type = headers.get('Content-Type')
     99
     100        if not content_type or not content_type.startswith('multipart/'):
     101            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
     102           
     103        ctype, opts = self.parse_header(content_type)
     104        boundary = opts.get('boundary')
     105        from cgi import valid_boundary
     106        if not boundary or not valid_boundary(boundary):
     107            raise MultiPartParserError('Invalid boundary in multipart form: %s' % boundary)
     108
     109        # check if we got a valid X-Progress-ID id
     110        progress_id = headers.get('X-Progress-ID')
     111        if file_upload_dir and progress_id:
     112            import re
     113            if re.match(r'^[0-9a-zA-Z]{32}$', progress_id):
     114                self._progress_filename = os.path.join(file_upload_dir, progress_id)
    72115            else:
    73                 POST.appendlist(name_dict['name'], submessage.get_payload())
    74     return POST, FILES
     116                raise MultiPartParserError('Invalid X-Progress-ID: %s' % progress_id)
     117        else:
     118            self._progress_filename = None
    75119
     120        self._boundary = '--' + boundary
     121        self._input = input
     122        self._size = content_length
     123        self._received = 0
     124        self._file_upload_dir = file_upload_dir
     125        self._chunk_size = chunk_size
     126        self._state = 'PREAMBLE'
     127        self._partial = ''
     128        self._post = MultiValueDict()
     129        self._files = MultiValueDict()
     130
     131        try:
     132            # use mx fast string search if available
     133            from mx.TextTools import FS
     134            self._fs = FS(self._boundary)
     135        except ImportError:
     136            self._fs = None
     137
     138    def parse(self):
     139        try:
     140            self._parse()
     141        finally:
     142            if self._progress_filename:
     143                try:
     144                    os.unlink(self._progress_filename)
     145                except OSError:
     146                    pass
     147       
     148        return self._post, self._files
     149
     150    def _parse(self):
     151        size = self._size
     152
     153        try:
     154            while size > 0:
     155                n = self._read(self._input, min(self._chunk_size, size))
     156                if not n:
     157                    break
     158                size -= n
     159        except:
     160            # consume any remaining data so we don't generate a "Connection Reset" error
     161            size = self._size - self._received
     162            while size > 0:
     163                data = self._input.read(min(self._chunk_size, size))
     164                size -= len(data)
     165            raise
     166
     167    def _find_boundary(self, data, start, stop):
     168        """
     169        Find the next boundary and return the end of current part
     170        and start of next part.
     171        """
     172        if self._fs:
     173            boundary = self._fs.find(data, start, stop)
     174        else:
     175            boundary = data.find(self._boundary, start, stop)
     176        if boundary >= 0:
     177            end = boundary
     178            next = boundary + len(self._boundary)
     179
     180            # backup over CRLF
     181            if end > 0 and data[end-1] == '\n': end -= 1
     182            if end > 0 and data[end-1] == '\r': end -= 1
     183            # skip over --CRLF
     184            if next < stop and data[next] == '-': next += 1
     185            if next < stop and data[next] == '-': next += 1
     186            if next < stop and data[next] == '\r': next += 1
     187            if next < stop and data[next] == '\n': next += 1
     188
     189            return True, end, next
     190        else:
     191            return False, stop, stop
     192
     193    class TemporaryFile(object):
     194        "A temporary file that tries to delete itself when garbage collected."
     195        def __init__(self, dir):
     196            import tempfile
     197            (fd, name) = tempfile.mkstemp(suffix='.upload', dir=dir)
     198            self.file = os.fdopen(fd, 'w+b')
     199            self.name = name
     200
     201        def __getattr__(self, name):
     202            a = getattr(self.__dict__['file'], name)
     203            if type(a) != type(0):
     204                setattr(self, name, a)
     205            return a
     206
     207        def __del__(self):
     208            try:
     209                os.unlink(self.name)
     210            except OSError:
     211                pass
     212           
     213    class LazyContent(dict):
     214        """
     215        A lazy FILES dictionary entry that reads the contents from
     216        tmpfile only when referenced.
     217        """
     218        def __init__(self, data):
     219            dict.__init__(self, data)
     220       
     221        def __getitem__(self, key):
     222            if key == 'content' and not self.has_key(key):
     223                self['tmpfile'].seek(0)
     224                self['content'] = self['tmpfile'].read()
     225            return dict.__getitem__(self, key)
     226
     227    def _read(self, input, size):
     228        data = input.read(size)
     229
     230        if not data:
     231            return 0
     232
     233        read_size = len(data)
     234        self._received += read_size
     235
     236        if self._partial:
     237            data = self._partial + data
     238
     239        start = 0
     240        stop = len(data)
     241       
     242        while start < stop:
     243            boundary, end, next = self._find_boundary(data, start, stop)
     244
     245            if not boundary and read_size:
     246                # make sure we don't treat a partial boundary (and its separators) as data
     247                stop -= len(self._boundary) + 16
     248                end = next = stop
     249                if end <= start:
     250                    break # need more data
     251
     252            if self._state == 'PREAMBLE':
     253                # Preamble, just ignore it
     254                self._state = 'HEADER'
     255
     256            elif self._state == 'HEADER':
     257                # Beginning of header, look for end of header and parse it if found.
     258
     259                header_end = data.find('\r\n\r\n', start, stop)
     260                if header_end == -1:
     261                    break # need more data
     262
     263                header = data[start:header_end]
     264
     265                self._fieldname = None
     266                self._filename = None
     267                self._content_type = None
     268
     269                for line in header.split('\r\n'):
     270                    ctype, opts = self.parse_header(line)
     271                    if ctype == 'content-disposition: form-data':
     272                        self._fieldname = opts.get('name')
     273                        self._filename = opts.get('filename')
     274                    elif ctype.startswith('content-type: '):
     275                        self._content_type = ctype[14:]
     276
     277                if self._filename is not None:
     278                    # cleanup filename from IE full paths:
     279                    self._filename = self._filename[self._filename.rfind("\\")+1:].strip()
     280
     281                    if self._filename: # ignore files without filenames
     282                        if self._file_upload_dir:
     283                            try:
     284                                self._file = self.TemporaryFile(dir=self._file_upload_dir)
     285                            except:
     286                                raise MultiPartParserError("Failed to create temporary file.")
     287                        else:
     288                            self._file = StringIO()
     289                    else:
     290                        self._file = None
     291                    self._filesize = 0
     292                    self._state = 'FILE'
     293                else:
     294                    self._field = StringIO()
     295                    self._state = 'FIELD'
     296                next = header_end + 4
     297
     298            elif self._state == 'FIELD':
     299                # In a field, collect data until a boundary is found.
     300
     301                self._field.write(data[start:end])
     302                if boundary:
     303                    if self._fieldname:
     304                        self._post.appendlist(self._fieldname, self._field.getvalue())
     305                    self._field.close()
     306                    self._state = 'HEADER'
     307
     308            elif self._state == 'FILE':
     309                # In a file, collect data until a boundary is found.
     310
     311                if self._file:
     312                    try:
     313                        self._file.write(data[start:end])
     314                    except IOError, e:
     315                        raise MultiPartParserError("Failed to write to temporary file.")
     316                    self._filesize += end-start
     317
     318                    if self._progress_filename:
     319                        f = open(os.path.join(self._file_upload_dir, self._progress_filename), 'w')
     320                        pickle.dump({ 'received': self._received, 'size': self._size }, f)
     321                        f.close()
     322
     323                if boundary:
     324                    if self._file:
     325                        if self._file_upload_dir:
     326                            self._file.seek(0)
     327                            file = self.LazyContent({
     328                                'filename': self._filename,
     329                                'content-type':  self._content_type,
     330                                # 'content': is read on demand
     331                                'content-length': self._filesize,
     332                                'tmpfilename': self._file.name,
     333                                'tmpfile': self._file
     334                            })
     335                        else:
     336                            file = {
     337                                'filename': self._filename,
     338                                'content-type':  self._content_type,
     339                                'content': self._file.getvalue(),
     340                                'content-length': self._filesize
     341                            }
     342                            self._file.close()
     343
     344                        self._files.appendlist(self._fieldname, file)
     345
     346                    self._state = 'HEADER'
     347
     348            start = next
     349               
     350        self._partial = data[start:]
     351
     352        return read_size
     353
     354    def parse_header(self, line):
     355        from cgi import parse_header
     356        return parse_header(line)
     357
     358
    76359class QueryDict(MultiValueDict):
    77360    """A specialized MultiValueDict that takes a query string when initialized.
    78361    This is immutable unless you create a copy of it."""
     
    302585    if not host:
    303586        host = request.META.get('HTTP_HOST', '')
    304587    return host
     588
  • django/db/models/base.py

     
    321321    def _get_FIELD_size(self, field):
    322322        return os.path.getsize(self._get_FIELD_filename(field))
    323323
    324     def _save_FIELD_file(self, field, filename, raw_contents):
     324    def _save_FIELD_file(self, field, filename, raw_field):
    325325        directory = field.get_directory_name()
    326326        try: # Create the date-based directory if it doesn't exist.
    327327            os.makedirs(os.path.join(settings.MEDIA_ROOT, directory))
     
    343343        setattr(self, field.attname, filename)
    344344
    345345        full_filename = self._get_FIELD_filename(field)
    346         fp = open(full_filename, 'wb')
    347         fp.write(raw_contents)
    348         fp.close()
     346        if raw_field.has_key('tmpfilename'):
     347            raw_field['tmpfile'].close()
     348            os.rename(raw_field['tmpfilename'], full_filename)
     349        else:
     350            fp = open(full_filename, 'wb')
     351            fp.write(raw_field['content'])
     352            fp.close()
    349353
    350354        # Save the width and/or height, if applicable.
    351355        if isinstance(field, ImageField) and (field.width_field or field.height_field):
  • django/db/models/fields/__init__.py

     
    625625        setattr(cls, 'get_%s_filename' % self.name, curry(cls._get_FIELD_filename, field=self))
    626626        setattr(cls, 'get_%s_url' % self.name, curry(cls._get_FIELD_url, field=self))
    627627        setattr(cls, 'get_%s_size' % self.name, curry(cls._get_FIELD_size, field=self))
    628         setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_contents: instance._save_FIELD_file(self, filename, raw_contents))
     628        setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_field: instance._save_FIELD_file(self, filename, raw_field))
    629629        dispatcher.connect(self.delete_file, signal=signals.post_delete, sender=cls)
    630630
    631631    def delete_file(self, instance):
     
    648648        if new_data.get(upload_field_name, False):
    649649            func = getattr(new_object, 'save_%s_file' % self.name)
    650650            if rel:
    651                 func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0]["content"])
     651                func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0])
    652652            else:
    653                 func(new_data[upload_field_name]["filename"], new_data[upload_field_name]["content"])
     653                func(new_data[upload_field_name]["filename"], new_data[upload_field_name])
    654654
    655655    def get_directory_name(self):
    656656        return os.path.normpath(datetime.datetime.now().strftime(self.upload_to))
  • django/oldforms/__init__.py

     
    661661        self.validator_list = [self.isNonEmptyFile] + validator_list
    662662
    663663    def isNonEmptyFile(self, field_data, all_data):
    664         try:
    665             content = field_data['content']
    666         except TypeError:
     664        if field_data.has_key('_file_upload_error'):
     665            raise validators.CriticalValidationError, field_data['_file_upload_error']
     666        if not field_data.has_key('filename'):
    667667            raise validators.CriticalValidationError, gettext("No file was submitted. Check the encoding type on the form.")
    668         if not content:
     668        if not field_data['content-length']:
    669669            raise validators.CriticalValidationError, gettext("The submitted file is empty.")
    670670
    671671    def render(self, data):
    672672        return '<input type="file" id="%s" class="v%s" name="%s" />' % \
    673673            (self.get_id(), self.__class__.__name__, self.field_name)
    674674
     675    def prepare(self, new_data):
     676        if new_data.has_key('_file_upload_error'):
     677            # pretend we got something in the field to raise a validation error later
     678            new_data[self.field_name] = { '_file_upload_error': new_data['_file_upload_error'] }
     679
    675680    def html2python(data):
    676681        if data is None:
    677682            raise EmptyValue
  • django/core/handlers/wsgi.py

     
    111111            if self.environ.get('CONTENT_TYPE', '').startswith('multipart'):
    112112                header_dict = dict([(k, v) for k, v in self.environ.items() if k.startswith('HTTP_')])
    113113                header_dict['Content-Type'] = self.environ.get('CONTENT_TYPE', '')
    114                 self._post, self._files = http.parse_file_upload(header_dict, self.raw_post_data)
     114                header_dict['Content-Length'] = self.environ.get('CONTENT_LENGTH', '')
     115                header_dict['X-Progress-ID'] = self.environ.get('HTTP_X_PROGRESS_ID', '')
     116                self._post, self._files = http.parse_file_upload(header_dict, self.environ['wsgi.input'])
     117                self._raw_post_data = None # raw data is not available for streamed multipart messages
    115118            else:
    116119                self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    117120        else:
  • django/core/handlers/modpython.py

     
    4747    def _load_post_and_files(self):
    4848        "Populates self._post and self._files"
    4949        if self._req.headers_in.has_key('content-type') and self._req.headers_in['content-type'].startswith('multipart'):
    50             self._post, self._files = http.parse_file_upload(self._req.headers_in, self.raw_post_data)
     50            self._post, self._files = http.parse_file_upload(self._req.headers_in, self._req)
     51            self._raw_post_data = None # raw data is not available for streamed multipart messages
    5152        else:
    5253            self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    5354
  • tests/modeltests/test_client/views.py

     
    2222       
    2323    return HttpResponse(t.render(c))
    2424   
     25def post_file_view(request):
     26    "A view that expects a multipart post and returns a file in the context"
     27    t = Template('File {{ file.filename }} received', name='POST Template')
     28    c = Context({'file': request.FILES['file_file']})
     29    return HttpResponse(t.render(c))
     30
    2531def redirect_view(request):
    2632    "A view that redirects all requests to the GET view"
    2733    return HttpResponseRedirect('/test_client/get_view/')
     
    3238    c = Context({'user': request.user})
    3339   
    3440    return HttpResponse(t.render(c))
    35 login_protected_view = login_required(login_protected_view)
    36  No newline at end of file
     41login_protected_view = login_required(login_protected_view)
  • tests/modeltests/test_client/models.py

     
    6666        self.assertEqual(response.template.name, 'POST Template')
    6767        self.failUnless('Data received' in response.content)
    6868       
     69    def test_post_file_view(self):
     70        "POST this python file to a view"
     71        import os, tempfile
     72        from django.conf import settings
     73        file = __file__.replace('.pyc', '.py')
     74        for upload_dir in [None, tempfile.gettempdir()]:
     75            settings.FILE_UPLOAD_DIR = upload_dir
     76            post_data = { 'name': file, 'file': open(file) }
     77            response = self.client.post('/test_client/post_file_view/', post_data)
     78            self.failUnless('models.py' in response.context['file']['filename'])
     79            self.failUnless(len(response.context['file']['content']) == os.path.getsize(file))
     80            if upload_dir:
     81                self.failUnless(response.context['file']['tmpfilename'])
     82       
    6983    def test_redirect(self):
    7084        "GET a URL that redirects elsewhere"
    7185        response = self.client.get('/test_client/redirect_view/')
  • tests/modeltests/test_client/urls.py

     
    44urlpatterns = patterns('',
    55    (r'^get_view/$', views.get_view),
    66    (r'^post_view/$', views.post_view),
     7    (r'^post_file_view/$', views.post_file_view),
    78    (r'^redirect_view/$', views.redirect_view),
    89    (r'^login_protected_view/$', views.login_protected_view),
    910)
  • docs/request_response.txt

     
    7272``FILES``
    7373    A dictionary-like object containing all uploaded files. Each key in
    7474    ``FILES`` is the ``name`` from the ``<input type="file" name="" />``. Each
    75     value in ``FILES`` is a standard Python dictionary with the following three
     75    value in ``FILES`` is a standard Python dictionary with the following four
    7676    keys:
    7777
    7878        * ``filename`` -- The name of the uploaded file, as a Python string.
    7979        * ``content-type`` -- The content type of the uploaded file.
    8080        * ``content`` -- The raw content of the uploaded file.
     81        * ``content-length`` -- The length of the content in bytes.
    8182
     83    If streaming file uploads are enabled, two additional keys
     84    describing the uploaded file will be present:
     85
     86        * ``tmpfilename`` -- The filename for the temporary file.
     87        * ``tmpfile`` -- An open file object for the temporary file.
     88
     89    The temporary file will be removed when the request finishes.
     90
     91    Note that accessing ``content`` when streaming uploads are enabled
     92    will read the whole file into memory, which may not be what you want.
     93
    8294    Note that ``FILES`` will only contain data if the request method was POST
    8395    and the ``<form>`` that posted to the request had
    8496    ``enctype="multipart/form-data"``. Otherwise, ``FILES`` will be a blank
  • docs/settings.txt

     
    409409or ``django.core.mail.mail_managers``. You'll probably want to include the
    410410trailing space.
    411411
     412FILE_UPLOAD_DIR
     413---------------
     414
     415Default: Not defined
     416
     417Path to a directory where temporary files should be written during
     418file uploads. If this is left unset, uploaded files are read into memory.
     419
     420
    412421IGNORABLE_404_ENDS
    413422------------------
    414423
  • docs/forms.txt

     
    454454   new_data = request.POST.copy()
    455455   new_data.update(request.FILES)
    456456
     457Streaming file uploads.
     458-----------------------
     459
     460File uploads will be read into memory by default. This works fine for
     461small to medium sized uploads (from 1MB to 100MB depending on your
     462setup and usage). If you want to support larger uploads, you can enable
     463upload streaming, where only a small part of the file will be in memory
     464at any time. To do this you need to specify the ``FILE_UPLOAD_DIR``
     465setting (see the settings_ document for more details).
     466
     467See `request object`_ for more details about ``request.FILES`` objects
     468with streaming file uploads enabled.
     469
    457470Validators
    458471==========
    459472
     
    668681.. _`generic views`: ../generic_views/
    669682.. _`models API`: ../model_api/
    670683.. _settings: ../settings/
     684.. _request object: ../request_response/#httprequest-objects
Back to Top