Ticket #2070: 5070-streaming-file-upload.diff

File 5070-streaming-file-upload.diff, 26.7 KB (added by Øyvind Saltvik <oyvind@…>, 17 years ago)

Updated to trunk, without changes

  • django/http/__init__.py

     
    1 import os
     1import os, pickle
    22from Cookie import SimpleCookie
    33from pprint import pformat
    44from urllib import urlencode, quote
    55from django.utils.datastructures import MultiValueDict
    66
     7try:
     8    from cStringIO import StringIO
     9except ImportError:
     10    from StringIO import StringIO
     11
    712RESERVED_CHARS="!*'();:@&=+$,/?%#[]"
    813
    914try:
     
    4247    def is_secure(self):
    4348        return os.environ.get("HTTPS") == "on"
    4449
    45 def parse_file_upload(header_dict, post_data):
    46     "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"
    47     import email, email.Message
    48     from cgi import parse_header
    49     raw_message = '\r\n'.join(['%s:%s' % pair for pair in header_dict.items()])
    50     raw_message += '\r\n\r\n' + post_data
    51     msg = email.message_from_string(raw_message)
    52     POST = MultiValueDict()
    53     FILES = MultiValueDict()
    54     for submessage in msg.get_payload():
    55         if submessage and isinstance(submessage, email.Message.Message):
    56             name_dict = parse_header(submessage['Content-Disposition'])[1]
    57             # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
    58             # or {'name': 'blah'} for POST fields
    59             # We assume all uploaded files have a 'filename' set.
    60             if name_dict.has_key('filename'):
    61                 assert type([]) != type(submessage.get_payload()), "Nested MIME messages are not supported"
    62                 if not name_dict['filename'].strip():
    63                     continue
    64                 # IE submits the full path, so trim everything but the basename.
    65                 # (We can't use os.path.basename because it expects Linux paths.)
    66                 filename = name_dict['filename'][name_dict['filename'].rfind("\\")+1:]
    67                 FILES.appendlist(name_dict['name'], {
    68                     'filename': filename,
    69                     'content-type': (submessage.has_key('Content-Type') and submessage['Content-Type'] or None),
    70                     'content': submessage.get_payload(),
    71                 })
     50def parse_file_upload(headers, input):
     51    from django.conf import settings
     52
     53    # Only stream files to disk if FILE_UPLOAD_DIR is set
     54    file_upload_dir = getattr(settings, 'FILE_UPLOAD_DIR', None)
     55    file_upload_min_size = getattr(settings, 'FILE_UPLOAD_MIN_SIZE', 100000)
     56
     57    try:
     58        parser = MultiPartParser(headers, input, file_upload_dir, file_upload_min_size)
     59        return parser.parse()
     60    except MultiPartParserError, e:
     61        return MultiValueDict({ '_file_upload_error': [e.message] }), {}
     62
     63class MultiPartParserError(Exception):
     64    def __init__(self, message):
     65        self.message = message
     66    def __str__(self):
     67        return repr(self.message)
     68       
     69class MultiPartParser(object):
     70    """
     71    An RFC 2388 multipart/form-data parser.
     72   
     73    parse() reads the input stream in chunk_size chunks and returns a
     74    tuple of (POST MultiValueDict, FILES MultiValueDict). If
     75    file_upload_dir is defined files will be streamed to temporary
     76    files in the specified directory.
     77
     78    The FILES dictionary will have 'filename', 'content-type',
     79    'content' and 'content-length' entries. For streamed files it will
     80    also have 'tmpfilename' and 'tmpfile'. The 'content' entry will
     81    only be read from disk when referenced for streamed files.
     82
     83    If the header X-Progress-ID is sent with a 32 character alphanumeric
     84    string, a temporary file with the same name will be created in
     85    ``file_upload_dir`` with a pickled { 'received', 'size' }
     86    dictionary with the number of bytes received and the size expected
     87    respectively. The file will be unlinked when the parser finishes.
     88
     89    """
     90
     91    def __init__(self, headers, input, file_upload_dir=None, file_upload_min_size=None, chunk_size=1024*64):
     92        try:
     93            content_length = int(headers['Content-Length'])
     94        except:
     95            raise MultiPartParserError('Invalid Content-Length: %s' % headers.get('Content-Length'))
     96
     97        content_type = headers.get('Content-Type')
     98
     99        if not content_type or not content_type.startswith('multipart/'):
     100            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
     101           
     102        ctype, opts = self.parse_header(content_type)
     103        boundary = opts.get('boundary')
     104        from cgi import valid_boundary
     105        if not boundary or not valid_boundary(boundary):
     106            raise MultiPartParserError('Invalid boundary in multipart form: %s' % boundary)
     107
     108        # check if we got a valid X-Progress-ID id
     109        progress_id = headers.get('X-Progress-ID')
     110        if file_upload_dir and progress_id:
     111            import re
     112            if re.match(r'^[0-9a-zA-Z]{32}$', progress_id):
     113                self._progress_filename = os.path.join(file_upload_dir, progress_id)
    72114            else:
    73                 POST.appendlist(name_dict['name'], submessage.get_payload())
    74     return POST, FILES
     115                raise MultiPartParserError('Invalid X-Progress-ID: %s' % progress_id)
     116        else:
     117            self._progress_filename = None
    75118
     119        self._boundary = '--' + boundary
     120        self._input = input
     121        self._size = content_length
     122        self._received = 0
     123        self._file_upload_dir = file_upload_dir
     124        self._chunk_size = chunk_size
     125        self._state = 'PREAMBLE'
     126        self._partial = ''
     127        self._post = MultiValueDict()
     128        self._files = MultiValueDict()
     129
     130        if file_upload_min_size is not None and content_length < file_upload_min_size:
     131            self._file_upload_dir = None # disable file streaming for small request
     132
     133        try:
     134            # use mx fast string search if available
     135            from mx.TextTools import FS
     136            self._fs = FS(self._boundary)
     137        except ImportError:
     138            self._fs = None
     139
     140    def parse(self):
     141        try:
     142            self._parse()
     143        finally:
     144            if self._progress_filename:
     145                try:
     146                    os.unlink(self._progress_filename)
     147                except OSError:
     148                    pass
     149       
     150        return self._post, self._files
     151
     152    def _parse(self):
     153        size = self._size
     154
     155        try:
     156            while size > 0:
     157                n = self._read(self._input, min(self._chunk_size, size))
     158                if not n:
     159                    break
     160                size -= n
     161        except:
     162            # consume any remaining data so we don't generate a "Connection Reset" error
     163            size = self._size - self._received
     164            while size > 0:
     165                data = self._input.read(min(self._chunk_size, size))
     166                size -= len(data)
     167            raise
     168
     169    def _find_boundary(self, data, start, stop):
     170        """
     171        Find the next boundary and return the end of current part
     172        and start of next part.
     173        """
     174        if self._fs:
     175            boundary = self._fs.find(data, start, stop)
     176        else:
     177            boundary = data.find(self._boundary, start, stop)
     178        if boundary >= 0:
     179            end = boundary
     180            next = boundary + len(self._boundary)
     181
     182            # backup over CRLF
     183            if end > 0 and data[end-1] == '\n': end -= 1
     184            if end > 0 and data[end-1] == '\r': end -= 1
     185            # skip over --CRLF
     186            if next < stop and data[next] == '-': next += 1
     187            if next < stop and data[next] == '-': next += 1
     188            if next < stop and data[next] == '\r': next += 1
     189            if next < stop and data[next] == '\n': next += 1
     190
     191            return True, end, next
     192        else:
     193            return False, stop, stop
     194
     195    class TemporaryFile(object):
     196        "A temporary file that tries to delete itself when garbage collected."
     197        def __init__(self, dir):
     198            import tempfile
     199            (fd, name) = tempfile.mkstemp(suffix='.upload', dir=dir)
     200            self.file = os.fdopen(fd, 'w+b')
     201            self.name = name
     202
     203        def __getattr__(self, name):
     204            a = getattr(self.__dict__['file'], name)
     205            if type(a) != type(0):
     206                setattr(self, name, a)
     207            return a
     208
     209        def __del__(self):
     210            try:
     211                os.unlink(self.name)
     212            except OSError:
     213                pass
     214           
     215    class LazyContent(dict):
     216        """
     217        A lazy FILES dictionary entry that reads the contents from
     218        tmpfile only when referenced.
     219        """
     220        def __init__(self, data):
     221            dict.__init__(self, data)
     222       
     223        def __getitem__(self, key):
     224            if key == 'content' and not self.has_key(key):
     225                self['tmpfile'].seek(0)
     226                self['content'] = self['tmpfile'].read()
     227            return dict.__getitem__(self, key)
     228
     229    def _read(self, input, size):
     230        data = input.read(size)
     231
     232        if not data:
     233            return 0
     234
     235        read_size = len(data)
     236        self._received += read_size
     237
     238        if self._partial:
     239            data = self._partial + data
     240
     241        start = 0
     242        stop = len(data)
     243       
     244        while start < stop:
     245            boundary, end, next = self._find_boundary(data, start, stop)
     246
     247            if not boundary and read_size:
     248                # make sure we don't treat a partial boundary (and its separators) as data
     249                stop -= len(self._boundary) + 16
     250                end = next = stop
     251                if end <= start:
     252                    break # need more data
     253
     254            if self._state == 'PREAMBLE':
     255                # Preamble, just ignore it
     256                self._state = 'HEADER'
     257
     258            elif self._state == 'HEADER':
     259                # Beginning of header, look for end of header and parse it if found.
     260
     261                header_end = data.find('\r\n\r\n', start, stop)
     262                if header_end == -1:
     263                    break # need more data
     264
     265                header = data[start:header_end]
     266
     267                self._fieldname = None
     268                self._filename = None
     269                self._content_type = None
     270
     271                for line in header.split('\r\n'):
     272                    ctype, opts = self.parse_header(line)
     273                    if ctype == 'content-disposition: form-data':
     274                        self._fieldname = opts.get('name')
     275                        self._filename = opts.get('filename')
     276                    elif ctype.startswith('content-type: '):
     277                        self._content_type = ctype[14:]
     278
     279                if self._filename is not None:
     280                    # cleanup filename from IE full paths:
     281                    self._filename = self._filename[self._filename.rfind("\\")+1:].strip()
     282
     283                    if self._filename: # ignore files without filenames
     284                        if self._file_upload_dir:
     285                            try:
     286                                self._file = self.TemporaryFile(dir=self._file_upload_dir)
     287                            except:
     288                                raise MultiPartParserError("Failed to create temporary file.")
     289                        else:
     290                            self._file = StringIO()
     291                    else:
     292                        self._file = None
     293                    self._filesize = 0
     294                    self._state = 'FILE'
     295                else:
     296                    self._field = StringIO()
     297                    self._state = 'FIELD'
     298                next = header_end + 4
     299
     300            elif self._state == 'FIELD':
     301                # In a field, collect data until a boundary is found.
     302
     303                self._field.write(data[start:end])
     304                if boundary:
     305                    if self._fieldname:
     306                        self._post.appendlist(self._fieldname, self._field.getvalue())
     307                    self._field.close()
     308                    self._state = 'HEADER'
     309
     310            elif self._state == 'FILE':
     311                # In a file, collect data until a boundary is found.
     312
     313                if self._file:
     314                    try:
     315                        self._file.write(data[start:end])
     316                    except IOError, e:
     317                        raise MultiPartParserError("Failed to write to temporary file.")
     318                    self._filesize += end-start
     319
     320                    if self._progress_filename:
     321                        f = open(os.path.join(self._file_upload_dir, self._progress_filename), 'w')
     322                        pickle.dump({ 'received': self._received, 'size': self._size }, f)
     323                        f.close()
     324
     325                if boundary:
     326                    if self._file:
     327                        if self._file_upload_dir:
     328                            self._file.seek(0)
     329                            file = self.LazyContent({
     330                                'filename': self._filename,
     331                                'content-type':  self._content_type,
     332                                # 'content': is read on demand
     333                                'content-length': self._filesize,
     334                                'tmpfilename': self._file.name,
     335                                'tmpfile': self._file
     336                            })
     337                        else:
     338                            file = {
     339                                'filename': self._filename,
     340                                'content-type':  self._content_type,
     341                                'content': self._file.getvalue(),
     342                                'content-length': self._filesize
     343                            }
     344                            self._file.close()
     345
     346                        self._files.appendlist(self._fieldname, file)
     347
     348                    self._state = 'HEADER'
     349
     350            start = next
     351               
     352        self._partial = data[start:]
     353
     354        return read_size
     355
     356    def parse_header(self, line):
     357        from cgi import parse_header
     358        return parse_header(line)
     359
     360
    76361class QueryDict(MultiValueDict):
    77362    """A specialized MultiValueDict that takes a query string when initialized.
    78363    This is immutable unless you create a copy of it."""
     
    306591    if not host:
    307592        host = request.META.get('HTTP_HOST', '')
    308593    return host
     594
  • django/db/models/base.py

     
    361361    def _get_FIELD_size(self, field):
    362362        return os.path.getsize(self._get_FIELD_filename(field))
    363363
    364     def _save_FIELD_file(self, field, filename, raw_contents, save=True):
     364    def _save_FIELD_file(self, field, filename, raw_field):
    365365        directory = field.get_directory_name()
    366366        try: # Create the date-based directory if it doesn't exist.
    367367            os.makedirs(os.path.join(settings.MEDIA_ROOT, directory))
     
    383383        setattr(self, field.attname, filename)
    384384
    385385        full_filename = self._get_FIELD_filename(field)
    386         fp = open(full_filename, 'wb')
    387         fp.write(raw_contents)
    388         fp.close()
     386        if raw_field.has_key('tmpfilename'):
     387            raw_field['tmpfile'].close()
     388            os.rename(raw_field['tmpfilename'], full_filename)
     389        else:
     390            fp = open(full_filename, 'wb')
     391            fp.write(raw_field['content'])
     392            fp.close()
    389393
    390394        # Save the width and/or height, if applicable.
    391395        if isinstance(field, ImageField) and (field.width_field or field.height_field):
  • django/db/models/fields/__init__.py

     
    636636        setattr(cls, 'get_%s_filename' % self.name, curry(cls._get_FIELD_filename, field=self))
    637637        setattr(cls, 'get_%s_url' % self.name, curry(cls._get_FIELD_url, field=self))
    638638        setattr(cls, 'get_%s_size' % self.name, curry(cls._get_FIELD_size, field=self))
    639         setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_contents, save=True: instance._save_FIELD_file(self, filename, raw_contents, save))
     639        setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_field: instance._save_FIELD_file(self, filename, raw_field))
    640640        dispatcher.connect(self.delete_file, signal=signals.post_delete, sender=cls)
    641641
    642642    def delete_file(self, instance):
     
    659659        if new_data.get(upload_field_name, False):
    660660            func = getattr(new_object, 'save_%s_file' % self.name)
    661661            if rel:
    662                 func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0]["content"], save)
     662                func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0])
    663663            else:
    664                 func(new_data[upload_field_name]["filename"], new_data[upload_field_name]["content"], save)
     664                func(new_data[upload_field_name]["filename"], new_data[upload_field_name])
    665665
    666666    def get_directory_name(self):
    667667        return os.path.normpath(datetime.datetime.now().strftime(self.upload_to))
  • django/oldforms/__init__.py

     
    666666        self.validator_list = [self.isNonEmptyFile] + validator_list
    667667
    668668    def isNonEmptyFile(self, field_data, all_data):
    669         try:
    670             content = field_data['content']
    671         except TypeError:
     669        if field_data.has_key('_file_upload_error'):
     670            raise validators.CriticalValidationError, field_data['_file_upload_error']
     671        if not field_data.has_key('filename'):
    672672            raise validators.CriticalValidationError, gettext("No file was submitted. Check the encoding type on the form.")
    673         if not content:
     673        if not field_data['content-length']:
    674674            raise validators.CriticalValidationError, gettext("The submitted file is empty.")
    675675
    676676    def render(self, data):
    677677        return '<input type="file" id="%s" class="v%s" name="%s" />' % \
    678678            (self.get_id(), self.__class__.__name__, self.field_name)
    679679
     680    def prepare(self, new_data):
     681        if new_data.has_key('_file_upload_error'):
     682            # pretend we got something in the field to raise a validation error later
     683            new_data[self.field_name] = { '_file_upload_error': new_data['_file_upload_error'] }
     684
    680685    def html2python(data):
    681686        if data is None:
    682687            raise EmptyValue
  • django/core/handlers/wsgi.py

     
    111111            if self.environ.get('CONTENT_TYPE', '').startswith('multipart'):
    112112                header_dict = dict([(k, v) for k, v in self.environ.items() if k.startswith('HTTP_')])
    113113                header_dict['Content-Type'] = self.environ.get('CONTENT_TYPE', '')
    114                 self._post, self._files = http.parse_file_upload(header_dict, self.raw_post_data)
     114                header_dict['Content-Length'] = self.environ.get('CONTENT_LENGTH', '')
     115                header_dict['X-Progress-ID'] = self.environ.get('HTTP_X_PROGRESS_ID', '')
     116                try:
     117                    self._post, self._files = http.parse_file_upload(header_dict, self.environ['wsgi.input'])
     118                except:
     119                    self._post, self._files = {}, {} # make sure we dont read the input stream again
     120                    raise
     121                self._raw_post_data = None # raw data is not available for streamed multipart messages
    115122            else:
    116123                self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    117124        else:
  • django/core/handlers/modpython.py

     
    4747    def _load_post_and_files(self):
    4848        "Populates self._post and self._files"
    4949        if self._req.headers_in.has_key('content-type') and self._req.headers_in['content-type'].startswith('multipart'):
    50             self._post, self._files = http.parse_file_upload(self._req.headers_in, self.raw_post_data)
     50            self._raw_post_data = None # raw data is not available for streamed multipart messages
     51            try:
     52                self._post, self._files = http.parse_file_upload(self._req.headers_in, self._req)
     53            except:
     54                self._post, self._files = {}, {} # make sure we dont read the input stream again
     55                raise
    5156        else:
    5257            self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    5358
  • tests/modeltests/test_client/views.py

     
    4444
    4545    return HttpResponse(t.render(c))
    4646
     47def post_file_view(request):
     48    "A view that expects a multipart post and returns a file in the context"
     49    t = Template('File {{ file.filename }} received', name='POST Template')
     50    c = Context({'file': request.FILES['file_file']})
     51    return HttpResponse(t.render(c))
     52
    4753def redirect_view(request):
    4854    "A view that redirects all requests to the GET view"
    4955    return HttpResponseRedirect('/test_client/get_view/')
  • tests/modeltests/test_client/models.py

     
    7575        self.assertEqual(response.template.name, "Book template")
    7676        self.assertEqual(response.content, "Blink - Malcolm Gladwell")
    7777
     78    def test_post_file_view(self):
     79        "POST this python file to a view"
     80        import os, tempfile
     81        from django.conf import settings
     82        file = __file__.replace('.pyc', '.py')
     83        for upload_dir in [None, tempfile.gettempdir()]:
     84            settings.FILE_UPLOAD_DIR = upload_dir
     85            post_data = { 'name': file, 'file': open(file) }
     86            response = self.client.post('/test_client/post_file_view/', post_data)
     87            self.failUnless('models.py' in response.context['file']['filename'])
     88            self.failUnless(len(response.context['file']['content']) == os.path.getsize(file))
     89            if upload_dir:
     90                self.failUnless(response.context['file']['tmpfilename'])
     91
    7892    def test_redirect(self):
    7993        "GET a URL that redirects elsewhere"
    8094        response = self.client.get('/test_client/redirect_view/')
  • tests/modeltests/test_client/urls.py

     
    55    (r'^get_view/$', views.get_view),
    66    (r'^post_view/$', views.post_view),
    77    (r'^raw_post_view/$', views.raw_post_view),
     8    (r'^post_file_view/$', views.post_file_view),
    89    (r'^redirect_view/$', views.redirect_view),
    910    (r'^form_view/$', views.form_view),
    1011    (r'^login_protected_view/$', views.login_protected_view),
  • docs/request_response.txt

     
    7272``FILES``
    7373    A dictionary-like object containing all uploaded files. Each key in
    7474    ``FILES`` is the ``name`` from the ``<input type="file" name="" />``. Each
    75     value in ``FILES`` is a standard Python dictionary with the following three
     75    value in ``FILES`` is a standard Python dictionary with the following four
    7676    keys:
    7777
    7878        * ``filename`` -- The name of the uploaded file, as a Python string.
    7979        * ``content-type`` -- The content type of the uploaded file.
    8080        * ``content`` -- The raw content of the uploaded file.
     81        * ``content-length`` -- The length of the content in bytes.
    8182
     83    If streaming file uploads are enabled, two additional keys
     84    describing the uploaded file will be present:
     85
     86        * ``tmpfilename`` -- The filename for the temporary file.
     87        * ``tmpfile`` -- An open file object for the temporary file.
     88
     89    The temporary file will be removed when the request finishes.
     90
     91    Note that accessing ``content`` when streaming uploads are enabled
     92    will read the whole file into memory, which may not be what you want.
     93
    8294    Note that ``FILES`` will only contain data if the request method was POST
    8395    and the ``<form>`` that posted to the request had
    8496    ``enctype="multipart/form-data"``. Otherwise, ``FILES`` will be a blank
  • docs/settings.txt

     
    437437
    438438.. _Testing Django Applications: ../testing/
    439439
     440FILE_UPLOAD_DIR
     441---------------
     442
     443Default: Not defined
     444
     445Path to a directory where temporary files should be written during
     446file uploads. If this is unset, uploaded files are read into memory.
     447
     448FILE_UPLOAD_MIN_SIZE
     449--------------------
     450
     451Default: 100000
     452
     453An integer specifying the minimum request size, in bytes (taken from
     454the ``Content-Length`` header), for file upload streaming to take place.
     455Any smaller request will be handled in memory. Note: ``FILE_UPLOAD_DIR``
     456has to be defined to enable streaming.
     457
    440458IGNORABLE_404_ENDS
    441459------------------
    442460
  • docs/forms.txt

     
    475475   new_data = request.POST.copy()
    476476   new_data.update(request.FILES)
    477477
     478Streaming file uploads
     479----------------------
     480
     481File uploads will be read into memory by default. This works fine for
     482small to medium-sized uploads (from 1MB to 100MB, depending on your
     483setup and usage). If you want to support larger uploads, you can enable
     484upload streaming, where only a small part of the file is in memory
     485at any time. To do this, you need to specify the ``FILE_UPLOAD_DIR``
     486setting (see the settings_ document for more details).
     487
     488See `request object`_ for more details about ``request.FILES`` objects
     489with streaming file uploads enabled.
     490
    478491Validators
    479492==========
    480493
     
    689702    At validation time, the XML fragment is validated against the schema using
    690703    the executable specified in the ``JING_PATH`` setting (see the settings_
    691704    document for more details).
     705.. _request object: ../request_response/#httprequest-objects
    692706
    693707.. _`generic views`: ../generic_views/
    694708.. _`models API`: ../model-api/
Back to Top