Ticket #2070: 5065-streaming_file_upload_with_shutils_2.diff

File 5065-streaming_file_upload_with_shutils_2.diff, 27.5 KB (added by axiak@…, 17 years ago)

Works in [5065], renamed settings variable, uses global settings, defaults STREAMING_MIN_POST_SIZE to .5MB (please test though!)

  • django/http/__init__.py

     
    1 import os
     1import os, pickle
    22from Cookie import SimpleCookie
    33from pprint import pformat
    44from urllib import urlencode, quote
    55from django.utils.datastructures import MultiValueDict
    66
     7try:
     8    from cStringIO import StringIO
     9except ImportError:
     10    from StringIO import StringIO
     11
    712RESERVED_CHARS="!*'();:@&=+$,/?%#[]"
    813
    914try:
     
    4247    def is_secure(self):
    4348        return os.environ.get("HTTPS") == "on"
    4449
    45 def parse_file_upload(header_dict, post_data):
    46     "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"
    47     import email, email.Message
    48     from cgi import parse_header
    49     raw_message = '\r\n'.join(['%s:%s' % pair for pair in header_dict.items()])
    50     raw_message += '\r\n\r\n' + post_data
    51     msg = email.message_from_string(raw_message)
    52     POST = MultiValueDict()
    53     FILES = MultiValueDict()
    54     for submessage in msg.get_payload():
    55         if submessage and isinstance(submessage, email.Message.Message):
    56             name_dict = parse_header(submessage['Content-Disposition'])[1]
    57             # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
    58             # or {'name': 'blah'} for POST fields
    59             # We assume all uploaded files have a 'filename' set.
    60             if name_dict.has_key('filename'):
    61                 assert type([]) != type(submessage.get_payload()), "Nested MIME messages are not supported"
    62                 if not name_dict['filename'].strip():
    63                     continue
    64                 # IE submits the full path, so trim everything but the basename.
    65                 # (We can't use os.path.basename because it expects Linux paths.)
    66                 filename = name_dict['filename'][name_dict['filename'].rfind("\\")+1:]
    67                 FILES.appendlist(name_dict['name'], {
    68                     'filename': filename,
    69                     'content-type': (submessage.has_key('Content-Type') and submessage['Content-Type'] or None),
    70                     'content': submessage.get_payload(),
    71                 })
    72             else:
    73                 POST.appendlist(name_dict['name'], submessage.get_payload())
    74     return POST, FILES
     50def parse_file_upload(headers, input):
     51    from django.conf import settings
    7552
     53    # Only stream files to disk if FILE_STREAMING_DIR is set
     54    file_upload_dir = settings.FILE_UPLOAD_DIR
     55    streaming_min_post_size = settings.STREAMING_MIN_POST_SIZE
     56
     57    try:
     58        parser = MultiPartParser(headers, input, file_upload_dir, streaming_min_post_size)
     59        return parser.parse()
     60    except MultiPartParserError, e:
     61        return MultiValueDict({ '_file_upload_error': [e.message] }), {}
     62
     63class MultiPartParserError(Exception):
     64    def __init__(self, message):
     65        self.message = message
     66    def __str__(self):
     67        return repr(self.message)
     68       
     69class MultiPartParser(object):
     70    """
     71    A rfc2388 multipart/form-data parser.
     72   
     73    parse() reads the input stream in chunk_size chunks and returns a
     74    tuple of (POST MultiValueDict, FILES MultiValueDict). If
     75    file_upload_dir is defined files will be streamed to temporary
     76    files in the specified directory.
     77
     78    The FILES dictionary will have 'filename', 'content-type',
     79    'content' and 'content-length' entries. For streamed files it will
     80    also have 'tmpfilename' and 'tmpfile'. The 'content' entry will
     81    only be read from disk when referenced for streamed files.
     82
     83    If the header X-Progress-ID is sent with a 32 character hex string
     84    a temporary file with the same name will be created in
     85    `file_upload_dir`` with a pickled { 'received', 'size' }
     86    dictionary with the number of bytes received and the size expected
     87    respectively. The file will be unlinked when the parser finishes.
     88
     89    """
     90
     91    def __init__(self, headers, input, file_upload_dir=None, streaming_min_post_size=None, chunk_size=1024*64):
     92        try:
     93            content_length = int(headers['Content-Length'])
     94        except:
     95            raise MultiPartParserError('Invalid Content-Length: %s' % headers.get('Content-Length'))
     96
     97        content_type = headers.get('Content-Type')
     98
     99        if not content_type or not content_type.startswith('multipart/'):
     100            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
     101           
     102        ctype, opts = self.parse_header(content_type)
     103        boundary = opts.get('boundary')
     104        from cgi import valid_boundary
     105        if not boundary or not valid_boundary(boundary):
     106            raise MultiPartParserError('Invalid boundary in multipart form: %s' % boundary)
     107
     108        # check if we got a valid X-Progress-ID id
     109        progress_id = headers.get('X-Progress-ID')
     110        if file_upload_dir and progress_id:
     111            import re
     112            if re.match(r'^[0-9a-zA-Z]{32}$', progress_id):
     113                self._progress_filename = os.path.join(file_upload_dir, progress_id)
     114                raise MultiPartParserError('Invalid X-Progress-ID: %s' % progress_id)
     115        else:
     116            self._progress_filename = None
     117        self._boundary = '--' + boundary
     118        self._input = input
     119        self._size = content_length
     120        self._received = 0
     121        self._file_upload_dir = file_upload_dir
     122        self._chunk_size = chunk_size
     123        self._state = 'PREAMBLE'
     124        self._partial = ''
     125        self._post = MultiValueDict()
     126        self._files = MultiValueDict()
     127
     128        if streaming_min_post_size is not None and content_length < streaming_min_post_size:
     129            self._file_upload_dir = None # disable file streaming for small request
     130
     131        try:
     132            # use mx fast string search if available
     133            from mx.TextTools import FS
     134            self._fs = FS(self._boundary)
     135        except ImportError:
     136            self._fs = None
     137
     138    def parse(self):
     139        try:
     140            self._parse()
     141        finally:
     142            if self._progress_filename:
     143                try:
     144                    os.unlink(self._progress_filename)
     145                except OSError:
     146                    pass
     147       
     148        return self._post, self._files
     149
     150    def _parse(self):
     151        size = self._size
     152
     153        try:
     154            while size > 0:
     155                n = self._read(self._input, min(self._chunk_size, size))
     156                if not n:
     157                    break
     158                size -= n
     159        except:
     160            # consume any remaining data so we dont generate a "Connection Reset" error
     161            size = self._size - self._received
     162            while size > 0:
     163                data = self._input.read(min(self._chunk_size, size))
     164                size -= len(data)
     165            raise
     166
     167    def _find_boundary(self, data, start, stop):
     168        """
     169        Find the next boundary and return the end of current part
     170        and start of next part.
     171        """
     172        if self._fs:
     173            boundary = self._fs.find(data, start, stop)
     174        else:
     175            boundary = data.find(self._boundary, start, stop)
     176        if boundary >= 0:
     177            end = boundary
     178            next = boundary + len(self._boundary)
     179
     180            # backup over CRLF
     181            if end > 0 and data[end-1] == '\n': end -= 1
     182            if end > 0 and data[end-1] == '\r': end -= 1
     183            # skip over --CRLF
     184            if next < stop and data[next] == '-': next += 1
     185            if next < stop and data[next] == '-': next += 1
     186            if next < stop and data[next] == '\r': next += 1
     187            if next < stop and data[next] == '\n': next += 1
     188
     189            return True, end, next
     190        else:
     191            return False, stop, stop
     192
     193    class TemporaryFile(object):
     194        "A temporary file that tries to delete itself when garbage collected."
     195        def __init__(self, dir):
     196            import tempfile
     197            (fd, name) = tempfile.mkstemp(suffix='.upload', dir=dir)
     198            self.file = os.fdopen(fd, 'w+b')
     199            self.name = name
     200
     201        def __getattr__(self, name):
     202            a = getattr(self.__dict__['file'], name)
     203            if type(a) != type(0):
     204                setattr(self, name, a)
     205            return a
     206
     207        def __del__(self):
     208            try:
     209                os.unlink(self.name)
     210            except OSError:
     211                pass
     212           
     213    class LazyContent(dict):
     214        """
     215        A lazy FILES dictionary entry that reads the contents from
     216        tmpfile only when referenced.
     217        """
     218        def __init__(self, data):
     219            dict.__init__(self, data)
     220       
     221        def __getitem__(self, key):
     222            if key == 'content' and not self.has_key(key):
     223                self['tmpfile'].seek(0)
     224                self['content'] = self['tmpfile'].read()
     225            return dict.__getitem__(self, key)
     226
     227    def _read(self, input, size):
     228        data = input.read(size)
     229
     230        if not data:
     231            return 0
     232
     233        read_size = len(data)
     234        self._received += read_size
     235
     236        if self._partial:
     237            data = self._partial + data
     238
     239        start = 0
     240        stop = len(data)
     241       
     242        while start < stop:
     243            boundary, end, next = self._find_boundary(data, start, stop)
     244
     245            if not boundary and read_size:
     246                # make sure we dont treat a partial boundary (and its separators) as data
     247                stop -= len(self._boundary) + 16
     248                end = next = stop
     249                if end <= start:
     250                    break # need more data
     251
     252            if self._state == 'PREAMBLE':
     253                # Preamble, just ignore it
     254                self._state = 'HEADER'
     255
     256            elif self._state == 'HEADER':
     257                # Beginning of header, look for end of header and parse it if found.
     258
     259                header_end = data.find('\r\n\r\n', start, stop)
     260                if header_end == -1:
     261                    break # need more data
     262
     263                header = data[start:header_end]
     264
     265                self._fieldname = None
     266                self._filename = None
     267                self._content_type = None
     268
     269                for line in header.split('\r\n'):
     270                    ctype, opts = self.parse_header(line)
     271                    if ctype == 'content-disposition: form-data':
     272                        self._fieldname = opts.get('name')
     273                        self._filename = opts.get('filename')
     274                    elif ctype.startswith('content-type: '):
     275                        self._content_type = ctype[14:]
     276
     277                if self._filename is not None:
     278                    # cleanup filename from IE full paths:
     279                    self._filename = self._filename[self._filename.rfind("\\")+1:].strip()
     280
     281                    if self._filename: # ignore files without filenames
     282                        if self._file_upload_dir:
     283                            try:
     284                                self._file = self.TemporaryFile(dir=self._file_upload_dir)
     285                            except:
     286                                raise MultiPartParserError("Failed to create temporary file.")
     287                        else:
     288                            self._file = StringIO()
     289                    else:
     290                        self._file = None
     291                    self._filesize = 0
     292                    self._state = 'FILE'
     293                else:
     294                    self._field = StringIO()
     295                    self._state = 'FIELD'
     296                next = header_end + 4
     297
     298            elif self._state == 'FIELD':
     299                # In a field, collect data until a boundary is found.
     300
     301                self._field.write(data[start:end])
     302                if boundary:
     303                    if self._fieldname:
     304                        self._post.appendlist(self._fieldname, self._field.getvalue())
     305                    self._field.close()
     306                    self._state = 'HEADER'
     307
     308            elif self._state == 'FILE':
     309                # In a file, collect data until a boundary is found.
     310
     311                if self._file:
     312                    try:
     313                        self._file.write(data[start:end])
     314                    except IOError, e:
     315                        raise MultiPartParserError("Failed to write to temporary file.")
     316                    self._filesize += end-start
     317
     318                    if self._progress_filename:
     319                        f = open(os.path.join(self._file_upload_dir, self._progress_filename), 'w')
     320                        pickle.dump({ 'received': self._received, 'size': self._size }, f)
     321                        f.close()
     322
     323                if boundary:
     324                    if self._file:
     325                        if self._file_upload_dir:
     326                            self._file.seek(0)
     327                            file = self.LazyContent({
     328                                'filename': self._filename,
     329                                'content-type':  self._content_type,
     330                                # 'content': is read on demand
     331                                'content-length': self._filesize,
     332                                'tmpfilename': self._file.name,
     333                                'tmpfile': self._file
     334                            })
     335                        else:
     336                            file = {
     337                                'filename': self._filename,
     338                                'content-type':  self._content_type,
     339                                'content': self._file.getvalue(),
     340                                'content-length': self._filesize
     341                            }
     342                            self._file.close()
     343
     344                        self._files.appendlist(self._fieldname, file)
     345
     346                    self._state = 'HEADER'
     347
     348            start = next
     349               
     350        self._partial = data[start:]
     351
     352        return read_size
     353
     354    def parse_header(self, line):
     355        from cgi import parse_header
     356        return parse_header(line)
     357
     358
     359
    76360class QueryDict(MultiValueDict):
    77361    """A specialized MultiValueDict that takes a query string when initialized.
    78362    This is immutable unless you create a copy of it."""
     
    306590    if not host:
    307591        host = request.META.get('HTTP_HOST', '')
    308592    return host
     593
  • django/conf/global_settings.py

     
    240240# isExistingURL validator.
    241241URL_VALIDATOR_USER_AGENT = "Django/0.96pre (http://www.djangoproject.com)"
    242242
     243# The directory to place streamed file uploads. The web server needs write
     244# permissions on this directory.
     245# If this is None, streaming uploads are disabled.
     246FILE_UPLOAD_DIR = None
     247
     248
     249# The minimum size of a POST before file uploads are streamed to disk.
     250# Requests smaller than this are kept in memory instead.
     251# Size is in bytes.
     252STREAMING_MIN_POST_SIZE = 512 * (2**10)
     253
     254
     255
     256
    243257##############
    244258# MIDDLEWARE #
    245259##############
     
    329343
    330344# The list of directories to search for fixtures
    331345FIXTURE_DIRS = ()
     346
     347
  • django/db/models/base.py

     
    1717import types
    1818import sys
    1919import os
     20import shutil
    2021
    2122class ModelBase(type):
    2223    "Metaclass for all models"
     
    361362    def _get_FIELD_size(self, field):
    362363        return os.path.getsize(self._get_FIELD_filename(field))
    363364
     365    def _save_FIELD_file(self, field, filename, raw_field, save=True):
    364366    def _save_FIELD_file(self, field, filename, raw_contents, save=True):
    365367        directory = field.get_directory_name()
    366368        try: # Create the date-based directory if it doesn't exist.
     
    383385        setattr(self, field.attname, filename)
    384386
    385387        full_filename = self._get_FIELD_filename(field)
    386         fp = open(full_filename, 'wb')
    387         fp.write(raw_contents)
    388         fp.close()
     388        if raw_field.has_key('tmpfilename'):
     389            raw_field['tmpfile'].close()
     390            shutil.move(raw_field['tmpfilename'], full_filename)
     391        else:
     392            fp = open(full_filename, 'wb')
     393            fp.write(raw_field['content'])
     394            fp.close()
    389395
    390396        # Save the width and/or height, if applicable.
    391397        if isinstance(field, ImageField) and (field.width_field or field.height_field):
  • django/db/models/fields/__init__.py

     
    636636        setattr(cls, 'get_%s_filename' % self.name, curry(cls._get_FIELD_filename, field=self))
    637637        setattr(cls, 'get_%s_url' % self.name, curry(cls._get_FIELD_url, field=self))
    638638        setattr(cls, 'get_%s_size' % self.name, curry(cls._get_FIELD_size, field=self))
    639         setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_contents, save=True: instance._save_FIELD_file(self, filename, raw_contents, save))
     639        setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_field, save=True: instance._save_FIELD_file(self, filename, raw_field, save))
    640640        dispatcher.connect(self.delete_file, signal=signals.post_delete, sender=cls)
    641641
    642642    def delete_file(self, instance):
     
    659659        if new_data.get(upload_field_name, False):
    660660            func = getattr(new_object, 'save_%s_file' % self.name)
    661661            if rel:
    662                 func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0]["content"], save)
     662                func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0], save)
    663663            else:
    664                 func(new_data[upload_field_name]["filename"], new_data[upload_field_name]["content"], save)
     664                func(new_data[upload_field_name]["filename"], new_data[upload_field_name], save)
    665665
    666666    def get_directory_name(self):
    667667        return os.path.normpath(datetime.datetime.now().strftime(self.upload_to))
  • django/oldforms/__init__.py

     
    666666        self.validator_list = [self.isNonEmptyFile] + validator_list
    667667
    668668    def isNonEmptyFile(self, field_data, all_data):
    669         try:
    670             content = field_data['content']
    671         except TypeError:
     669        if field_data.has_key('_file_upload_error'):
     670            raise validators.CriticalValidationError, field_data['_file_upload_error']
     671        if not field_data.has_key('filename'):
    672672            raise validators.CriticalValidationError, gettext("No file was submitted. Check the encoding type on the form.")
    673         if not content:
     673        if not field_data['content-length']:
    674674            raise validators.CriticalValidationError, gettext("The submitted file is empty.")
    675675
    676676    def render(self, data):
    677677        return '<input type="file" id="%s" class="v%s" name="%s" />' % \
    678678            (self.get_id(), self.__class__.__name__, self.field_name)
    679679
     680    def prepare(self, new_data):
     681        if new_data.has_key('_file_upload_error'):
     682            # pretend we got something in the field to raise a validation error later
     683            new_data[self.field_name] = { '_file_upload_error': new_data['_file_upload_error'] }
     684
    680685    def html2python(data):
    681686        if data is None:
    682687            raise EmptyValue
  • django/core/handlers/wsgi.py

     
    111111            if self.environ.get('CONTENT_TYPE', '').startswith('multipart'):
    112112                header_dict = dict([(k, v) for k, v in self.environ.items() if k.startswith('HTTP_')])
    113113                header_dict['Content-Type'] = self.environ.get('CONTENT_TYPE', '')
    114                 self._post, self._files = http.parse_file_upload(header_dict, self.raw_post_data)
     114                header_dict['Content-Length'] = self.environ.get('CONTENT_LENGTH', '')
     115                header_dict['X-Progress-ID'] = self.environ.get('HTTP_X_PROGRESS_ID', '')
     116                try:
     117                    self._post, self._files = http.parse_file_upload(header_dict, self.environ['wsgi.input'])
     118                except:
     119                    self._post, self._files = {}, {} # make sure we dont read the input stream again
     120                    raise
     121                self._raw_post_data = None # raw data is not available for streamed multipart messages
    115122            else:
    116123                self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    117124        else:
  • django/core/handlers/modpython.py

     
    4747    def _load_post_and_files(self):
    4848        "Populates self._post and self._files"
    4949        if self._req.headers_in.has_key('content-type') and self._req.headers_in['content-type'].startswith('multipart'):
    50             self._post, self._files = http.parse_file_upload(self._req.headers_in, self.raw_post_data)
     50            self._raw_post_data = None # raw data is not available for streamed multipart messages
     51            try:
     52                self._post, self._files = http.parse_file_upload(self._req.headers_in, self._req)
     53            except:
     54                self._post, self._files = {}, {} # make sure we dont read the input stream again
     55                raise
    5156        else:
    5257            self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
    5358
  • tests/modeltests/test_client/views.py

     
    4444
    4545    return HttpResponse(t.render(c))
    4646
     47def post_file_view(request):
     48    "A view that expects a multipart post and returns a file in the context"
     49    t = Template('File {{ file.filename }} received', name='POST Template')
     50    c = Context({'file': request.FILES['file_file']})
     51    return HttpResponse(t.render(c))
     52
    4753def redirect_view(request):
    4854    "A view that redirects all requests to the GET view"
    4955    return HttpResponseRedirect('/test_client/get_view/')
  • tests/modeltests/test_client/models.py

     
    7575        self.assertEqual(response.template.name, "Book template")
    7676        self.assertEqual(response.content, "Blink - Malcolm Gladwell")
    7777
     78    def test_post_file_view(self):
     79        "POST this python file to a view"
     80        import os, tempfile
     81        from django.conf import settings
     82        file = __file__.replace('.pyc', '.py')
     83        for upload_dir in [None, tempfile.gettempdir()]:
     84            settings.FILE_UPLOAD_DIR = upload_dir
     85            post_data = { 'name': file, 'file': open(file) }
     86            response = self.client.post('/test_client/post_file_view/', post_data)
     87            self.failUnless('models.py' in response.context['file']['filename'])
     88            self.failUnless(len(response.context['file']['content']) == os.path.getsize(file))
     89            if upload_dir:
     90                self.failUnless(response.context['file']['tmpfilename'])
     91
     92
    7893    def test_redirect(self):
    7994        "GET a URL that redirects elsewhere"
    8095        response = self.client.get('/test_client/redirect_view/')
  • tests/modeltests/test_client/urls.py

     
    44urlpatterns = patterns('',
    55    (r'^get_view/$', views.get_view),
    66    (r'^post_view/$', views.post_view),
     7    (r'^post_file_view/$', views.post_file_view),
    78    (r'^raw_post_view/$', views.raw_post_view),
    89    (r'^redirect_view/$', views.redirect_view),
    910    (r'^form_view/$', views.form_view),
  • docs/request_response.txt

     
    7272``FILES``
    7373    A dictionary-like object containing all uploaded files. Each key in
    7474    ``FILES`` is the ``name`` from the ``<input type="file" name="" />``. Each
    75     value in ``FILES`` is a standard Python dictionary with the following three
     75    value in ``FILES`` is a standard Python dictionary with the following four
    7676    keys:
    7777
    7878        * ``filename`` -- The name of the uploaded file, as a Python string.
    7979        * ``content-type`` -- The content type of the uploaded file.
    8080        * ``content`` -- The raw content of the uploaded file.
     81        * ``content-length`` -- The length of the content in bytes.
    8182
     83    If streaming file uploads are enabled, two additional keys
     84    describing the uploaded file will be present:
     85
     86        * ``tmpfilename`` -- The filename for the temporary file.
     87        * ``tmpfile`` -- An open file object for the temporary file.
     88
     89    The temporary file will be removed when the request finishes.
     90
     91    Note that accessing ``content`` when streaming uploads are enabled
     92    will read the whole file into memory which may not be what you want.
     93
    8294    Note that ``FILES`` will only contain data if the request method was POST
    8395    and the ``<form>`` that posted to the request had
    8496    ``enctype="multipart/form-data"``. Otherwise, ``FILES`` will be a blank
  • docs/settings.txt

     
    437437
    438438.. _Testing Django Applications: ../testing/
    439439
     440FILE_UPLOAD_DIR
     441---------------
     442
     443Default: ``None``
     444
     445Path to a directory where temporary files should be written during
     446file uploads. Leaving this as ``None`` will disable streaming file uploads,
     447and cause all uploaded files to be stored (temporarily) in memory.
     448
     449STREAMING_MIN_POST_SIZE
     450-----------------------
     451
     452Default: 524288 (``512*1024``)
     453
     454An integer specifying the minimum number of bytes that has to be
     455received (in a POST) for file upload streaming to take place. Any
     456request smaller than this will be handled in memory.
     457Note: ``FILE_UPLOAD_DIR`` has to be defined to enable streaming.
     458
    440459IGNORABLE_404_ENDS
    441460------------------
    442461
  • docs/forms.txt

     
    475475   new_data = request.POST.copy()
    476476   new_data.update(request.FILES)
    477477
     478Streaming file uploads.
     479-----------------------
     480
     481File uploads will be read into memory by default. This works fine for
     482small to medium sized uploads (from 1MB to 100MB depending on your
     483setup and usage). If you want to support larger uploads you can enable
     484upload streaming where only a small part of the file will be in memory
     485at any time. To do this you need to specify the ``FILE_UPLOAD_DIR``
     486setting (see the settings_ document for more details).
     487
     488See `request object`_ for more details about ``request.FILES`` objects
     489with streaming file uploads enabled.
     490
    478491Validators
    479492==========
    480493
     
    693706.. _`generic views`: ../generic_views/
    694707.. _`models API`: ../model-api/
    695708.. _settings: ../settings/
     709.. _request object: ../request_response/#httprequest-objects
Back to Top