Code

Ticket #2070: 4459-streaming-file-upload.diff

File 4459-streaming-file-upload.diff, 25.9 KB (added by Joakim Sernbrant <serbaut@…>, 7 years ago)

Simplified streaming uploads

Line 
1Index: django/http/__init__.py
2===================================================================
3--- django/http/__init__.py     (revision 4459)
4+++ django/http/__init__.py     (working copy)
5@@ -1,9 +1,14 @@
6-import os
7+import os, pickle
8 from Cookie import SimpleCookie
9 from pprint import pformat
10 from urllib import urlencode, quote
11 from django.utils.datastructures import MultiValueDict
12 
13+try:
14+    from cStringIO import StringIO
15+except ImportError:
16+    from StringIO import StringIO
17+
18 RESERVED_CHARS="!*'();:@&=+$,/?%#[]"
19 
20 try:
21@@ -42,37 +47,315 @@
22     def is_secure(self):
23         return os.environ.get("HTTPS") == "on"
24 
25-def parse_file_upload(header_dict, post_data):
26-    "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"
27-    import email, email.Message
28-    from cgi import parse_header
29-    raw_message = '\r\n'.join(['%s:%s' % pair for pair in header_dict.items()])
30-    raw_message += '\r\n\r\n' + post_data
31-    msg = email.message_from_string(raw_message)
32-    POST = MultiValueDict()
33-    FILES = MultiValueDict()
34-    for submessage in msg.get_payload():
35-        if isinstance(submessage, email.Message.Message):
36-            name_dict = parse_header(submessage['Content-Disposition'])[1]
37-            # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
38-            # or {'name': 'blah'} for POST fields
39-            # We assume all uploaded files have a 'filename' set.
40-            if name_dict.has_key('filename'):
41-                assert type([]) != type(submessage.get_payload()), "Nested MIME messages are not supported"
42-                if not name_dict['filename'].strip():
43-                    continue
44-                # IE submits the full path, so trim everything but the basename.
45-                # (We can't use os.path.basename because it expects Linux paths.)
46-                filename = name_dict['filename'][name_dict['filename'].rfind("\\")+1:]
47-                FILES.appendlist(name_dict['name'], {
48-                    'filename': filename,
49-                    'content-type': (submessage.has_key('Content-Type') and submessage['Content-Type'] or None),
50-                    'content': submessage.get_payload(),
51-                })
52+def parse_file_upload(headers, input):
53+    from django.conf import settings
54+
55+    # Only stream files to disk if FILE_UPLOAD_DIR is set
56+    file_upload_dir = getattr(settings, 'FILE_UPLOAD_DIR', None)
57+
58+    try:
59+        parser = MultiPartParser(headers, input, file_upload_dir)
60+        return parser.parse()
61+    except MultiPartParserError, e:
62+        return MultiValueDict({ '_file_upload_error': [e.message] }), {}
63+    except Exception:
64+        return MultiValueDict({ '_file_upload_error': ["An unexpected error occured."] }), {}
65+
66+class MultiPartParserError(Exception):
67+    def __init__(self, message):
68+        self.message = message
69+    def __str__(self):
70+        return repr(self.message)
71+       
72+class MultiPartParser(object):
73+    """
74+    An RFC 2388 multipart/form-data parser.
75+   
76+    parse() reads the input stream in chunk_size chunks and returns a
77+    tuple of (POST MultiValueDict, FILES MultiValueDict). If
78+    file_upload_dir is defined files will be streamed to temporary
79+    files in the specified directory.
80+
81+    The FILES dictionary will have 'filename', 'content-type',
82+    'content' and 'content-length' entries. For streamed files it will
83+    also have 'tmpfilename' and 'tmpfile'. The 'content' entry will
84+    only be read from disk when referenced for streamed files.
85+
86+    If the header X-Progress-ID is sent with a 32 character alphanumeric
87+    string, a temporary file with the same name will be created in
88+    ``file_upload_dir`` with a pickled { 'received', 'size' }
89+    dictionary with the number of bytes received and the size expected
90+    respectively. The file will be unlinked when the parser finishes.
91+
92+    """
93+
94+    def __init__(self, headers, input, file_upload_dir=None, file_upload_max_size=None, chunk_size=1024*64):
95+        try:
96+            content_length = int(headers['Content-Length'])
97+        except:
98+            raise MultiPartParserError('Invalid Content-Length: %s' % headers.get('Content-Length'))
99+
100+        content_type = headers.get('Content-Type')
101+
102+        if not content_type or not content_type.startswith('multipart/'):
103+            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
104+           
105+        ctype, opts = self.parse_header(content_type)
106+        boundary = opts.get('boundary')
107+        from cgi import valid_boundary
108+        if not boundary or not valid_boundary(boundary):
109+            raise MultiPartParserError('Invalid boundary in multipart form: %s' % boundary)
110+
111+        # check if we got a valid X-Progress-ID
112+        progress_id = headers.get('X-Progress-ID')
113+        if file_upload_dir and progress_id:
114+            import re
115+            if re.match(r'^[0-9a-zA-Z]{32}$', progress_id):
116+                self._progress_filename = os.path.join(file_upload_dir, progress_id)
117             else:
118-                POST.appendlist(name_dict['name'], submessage.get_payload())
119-    return POST, FILES
120+                raise MultiPartParserError('Invalid X-Progress-ID: %s' % progress_id)
121+        else:
122+            self._progress_filename = None
123 
124+        self._boundary = '--' + boundary
125+        self._input = input
126+        self._size = content_length
127+        self._received = 0
128+        self._file_upload_dir = file_upload_dir
129+        self._chunk_size = chunk_size
130+        self._state = 'PREAMBLE'
131+        self._partial = ''
132+        self._post = MultiValueDict()
133+        self._files = MultiValueDict()
134+
135+        try:
136+            # use mx fast string search if available
137+            from mx.TextTools import FS
138+            self._fs = FS(self._boundary)
139+        except ImportError:
140+            self._fs = None
141+
142+    def parse(self):
143+        try:
144+            self._parse()
145+        finally:
146+            if self._progress_filename:
147+                try:
148+                    os.unlink(self._progress_filename)
149+                except OSError:
150+                    pass
151+       
152+        return self._post, self._files
153+
154+    def _parse(self):
155+        size = self._size
156+
157+        try:
158+            while size > 0:
159+                n = self._read(self._input, min(self._chunk_size, size))
160+                if not n:
161+                    break
162+                size -= n
163+        except:
164+            # consume any remaining data so we don't generate a "Connection Reset" error
165+            size = self._size - self._received
166+            while size > 0:
167+                data = self._input.read(min(self._chunk_size, size))
168+                size -= len(data)
169+            raise
170+
171+    def _find_boundary(self, data, start, stop):
172+        """
173+        Find the next boundary and return the end of current part
174+        and start of next part.
175+        """
176+        if self._fs:
177+            boundary = self._fs.find(data, start, stop)
178+        else:
179+            boundary = data.find(self._boundary, start, stop)
180+        if boundary >= 0:
181+            end = boundary
182+            next = boundary + len(self._boundary)
183+
184+            # backup over CRLF
185+            if end > 0 and data[end-1] == '\n': end -= 1
186+            if end > 0 and data[end-1] == '\r': end -= 1
187+            # skip over --CRLF
188+            if next < stop and data[next] == '-': next += 1
189+            if next < stop and data[next] == '-': next += 1
190+            if next < stop and data[next] == '\r': next += 1
191+            if next < stop and data[next] == '\n': next += 1
192+
193+            return True, end, next
194+        else:
195+            return False, stop, stop
196+
197+    class TemporaryFile(object):
198+        "A temporary file that tries to delete itself when garbage collected."
199+        def __init__(self, dir):
200+            import tempfile
201+            (fd, name) = tempfile.mkstemp(suffix='.upload', dir=dir)
202+            self.file = os.fdopen(fd, 'w+b')
203+            self.name = name
204+
205+        def __getattr__(self, name):
206+            a = getattr(self.__dict__['file'], name)
207+            if type(a) != type(0):
208+                setattr(self, name, a)
209+            return a
210+
211+        def __del__(self):
212+            try:
213+                os.unlink(self.name)
214+            except OSError:
215+                pass
216+           
217+    class LazyContent(dict):
218+        """
219+        A lazy FILES dictionary entry that reads the contents from
220+        tmpfile only when referenced.
221+        """
222+        def __init__(self, data):
223+            dict.__init__(self, data)
224+       
225+        def __getitem__(self, key):
226+            if key == 'content' and not self.has_key(key):
227+                self['tmpfile'].seek(0)
228+                self['content'] = self['tmpfile'].read()
229+            return dict.__getitem__(self, key)
230+
231+    def _read(self, input, size):
232+        data = input.read(size)
233+
234+        if not data:
235+            return 0
236+
237+        read_size = len(data)
238+        self._received += read_size
239+
240+        if self._partial:
241+            data = self._partial + data
242+
243+        start = 0
244+        stop = len(data)
245+       
246+        while start < stop:
247+            boundary, end, next = self._find_boundary(data, start, stop)
248+
249+            if not boundary and read_size:
250+                # make sure we don't treat a partial boundary (and its separators) as data
251+                stop -= len(self._boundary) + 16
252+                end = next = stop
253+                if end <= start:
254+                    break # need more data
255+
256+            if self._state == 'PREAMBLE':
257+                # Preamble, just ignore it
258+                self._state = 'HEADER'
259+
260+            elif self._state == 'HEADER':
261+                # Beginning of header, look for end of header and parse it if found.
262+
263+                header_end = data.find('\r\n\r\n', start, stop)
264+                if header_end == -1:
265+                    break # need more data
266+
267+                header = data[start:header_end]
268+
269+                self._fieldname = None
270+                self._filename = None
271+                self._content_type = None
272+
273+                for line in header.split('\r\n'):
274+                    ctype, opts = self.parse_header(line)
275+                    if ctype == 'content-disposition: form-data':
276+                        self._fieldname = opts.get('name')
277+                        self._filename = opts.get('filename')
278+                    elif ctype.startswith('content-type: '):
279+                        self._content_type = ctype[14:]
280+
281+                if self._filename is not None:
282+                    # cleanup filename from IE full paths:
283+                    self._filename = self._filename[self._filename.rfind("\\")+1:].strip()
284+
285+                    if self._filename: # ignore files without filenames
286+                        if self._file_upload_dir:
287+                            try:
288+                                self._file = self.TemporaryFile(dir=self._file_upload_dir)
289+                            except:
290+                                raise MultiPartParserError("Failed to create temporary file.")
291+                        else:
292+                            self._file = StringIO()
293+                    else:
294+                        self._file = None
295+                    self._filesize = 0
296+                    self._state = 'FILE'
297+                else:
298+                    self._field = StringIO()
299+                    self._state = 'FIELD'
300+                next = header_end + 4
301+
302+            elif self._state == 'FIELD':
303+                # In a field, collect data until a boundary is found.
304+
305+                self._field.write(data[start:end])
306+                if boundary:
307+                    if self._fieldname:
308+                        self._post.appendlist(self._fieldname, self._field.getvalue())
309+                    self._field.close()
310+                    self._state = 'HEADER'
311+
312+            elif self._state == 'FILE':
313+                # In a file, collect data until a boundary is found.
314+
315+                if self._file:
316+                    try:
317+                        self._file.write(data[start:end])
318+                    except IOError, e:
319+                        raise MultiPartParserError("Failed to write to temporary file.")
320+                    self._filesize += end-start
321+
322+                    if self._progress_filename:
323+                        f = open(os.path.join(self._file_upload_dir, self._progress_filename), 'w')
324+                        pickle.dump({ 'received': self._received, 'size': self._size }, f)
325+                        f.close()
326+
327+                if boundary:
328+                    if self._file:
329+                        if self._file_upload_dir:
330+                            self._file.seek(0)
331+                            file = self.LazyContent({
332+                                'filename': self._filename,
333+                                'content-type':  self._content_type,
334+                                # 'content': is read on demand
335+                                'content-length': self._filesize,
336+                                'tmpfilename': self._file.name,
337+                                'tmpfile': self._file
338+                            })
339+                        else:
340+                            file = {
341+                                'filename': self._filename,
342+                                'content-type':  self._content_type,
343+                                'content': self._file.getvalue(),
344+                                'content-length': self._filesize
345+                            }
346+                            self._file.close()
347+
348+                        self._files.appendlist(self._fieldname, file)
349+
350+                    self._state = 'HEADER'
351+
352+            start = next
353+               
354+        self._partial = data[start:]
355+
356+        return read_size
357+
358+    def parse_header(self, line):
359+        from cgi import parse_header
360+        return parse_header(line)
361+
362+
363 class QueryDict(MultiValueDict):
364     """A specialized MultiValueDict that takes a query string when initialized.
365     This is immutable unless you create a copy of it."""
366@@ -302,3 +585,4 @@
367     if not host:
368         host = request.META.get('HTTP_HOST', '')
369     return host
370+
371Index: django/db/models/base.py
372===================================================================
373--- django/db/models/base.py    (revision 4459)
374+++ django/db/models/base.py    (working copy)
375@@ -321,7 +321,7 @@
376     def _get_FIELD_size(self, field):
377         return os.path.getsize(self._get_FIELD_filename(field))
378 
379-    def _save_FIELD_file(self, field, filename, raw_contents):
380+    def _save_FIELD_file(self, field, filename, raw_field):
381         directory = field.get_directory_name()
382         try: # Create the date-based directory if it doesn't exist.
383             os.makedirs(os.path.join(settings.MEDIA_ROOT, directory))
384@@ -343,9 +343,13 @@
385         setattr(self, field.attname, filename)
386 
387         full_filename = self._get_FIELD_filename(field)
388-        fp = open(full_filename, 'wb')
389-        fp.write(raw_contents)
390-        fp.close()
391+        if raw_field.has_key('tmpfilename'):
392+            raw_field['tmpfile'].close()
393+            os.rename(raw_field['tmpfilename'], full_filename)
394+        else:
395+            fp = open(full_filename, 'wb')
396+            fp.write(raw_field['content'])
397+            fp.close()
398 
399         # Save the width and/or height, if applicable.
400         if isinstance(field, ImageField) and (field.width_field or field.height_field):
401Index: django/db/models/fields/__init__.py
402===================================================================
403--- django/db/models/fields/__init__.py (revision 4459)
404+++ django/db/models/fields/__init__.py (working copy)
405@@ -625,7 +625,7 @@
406         setattr(cls, 'get_%s_filename' % self.name, curry(cls._get_FIELD_filename, field=self))
407         setattr(cls, 'get_%s_url' % self.name, curry(cls._get_FIELD_url, field=self))
408         setattr(cls, 'get_%s_size' % self.name, curry(cls._get_FIELD_size, field=self))
409-        setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_contents: instance._save_FIELD_file(self, filename, raw_contents))
410+        setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_field: instance._save_FIELD_file(self, filename, raw_field))
411         dispatcher.connect(self.delete_file, signal=signals.post_delete, sender=cls)
412 
413     def delete_file(self, instance):
414@@ -648,9 +648,9 @@
415         if new_data.get(upload_field_name, False):
416             func = getattr(new_object, 'save_%s_file' % self.name)
417             if rel:
418-                func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0]["content"])
419+                func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0])
420             else:
421-                func(new_data[upload_field_name]["filename"], new_data[upload_field_name]["content"])
422+                func(new_data[upload_field_name]["filename"], new_data[upload_field_name])
423 
424     def get_directory_name(self):
425         return os.path.normpath(datetime.datetime.now().strftime(self.upload_to))
426Index: django/oldforms/__init__.py
427===================================================================
428--- django/oldforms/__init__.py (revision 4459)
429+++ django/oldforms/__init__.py (working copy)
430@@ -661,17 +661,22 @@
431         self.validator_list = [self.isNonEmptyFile] + validator_list
432 
433     def isNonEmptyFile(self, field_data, all_data):
434-        try:
435-            content = field_data['content']
436-        except TypeError:
437+        if field_data.has_key('_file_upload_error'):
438+            raise validators.CriticalValidationError, field_data['_file_upload_error']
439+        if not field_data.has_key('filename'):
440             raise validators.CriticalValidationError, gettext("No file was submitted. Check the encoding type on the form.")
441-        if not content:
442+        if not field_data['content-length']:
443             raise validators.CriticalValidationError, gettext("The submitted file is empty.")
444 
445     def render(self, data):
446         return '<input type="file" id="%s" class="v%s" name="%s" />' % \
447             (self.get_id(), self.__class__.__name__, self.field_name)
448 
449+    def prepare(self, new_data):
450+        if new_data.has_key('_file_upload_error'):
451+            # pretend we got something in the field to raise a validation error later
452+            new_data[self.field_name] = { '_file_upload_error': new_data['_file_upload_error'] }
453+
454     def html2python(data):
455         if data is None:
456             raise EmptyValue
457Index: django/core/handlers/wsgi.py
458===================================================================
459--- django/core/handlers/wsgi.py        (revision 4459)
460+++ django/core/handlers/wsgi.py        (working copy)
461@@ -111,7 +111,10 @@
462             if self.environ.get('CONTENT_TYPE', '').startswith('multipart'):
463                 header_dict = dict([(k, v) for k, v in self.environ.items() if k.startswith('HTTP_')])
464                 header_dict['Content-Type'] = self.environ.get('CONTENT_TYPE', '')
465-                self._post, self._files = http.parse_file_upload(header_dict, self.raw_post_data)
466+                header_dict['Content-Length'] = self.environ.get('CONTENT_LENGTH', '')
467+                header_dict['X-Progress-ID'] = self.environ.get('HTTP_X_PROGRESS_ID', '')
468+                self._post, self._files = http.parse_file_upload(header_dict, self.environ['wsgi.input'])
469+                self._raw_post_data = None # raw data is not available for streamed multipart messages
470             else:
471                 self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
472         else:
473Index: django/core/handlers/modpython.py
474===================================================================
475--- django/core/handlers/modpython.py   (revision 4459)
476+++ django/core/handlers/modpython.py   (working copy)
477@@ -47,7 +47,8 @@
478     def _load_post_and_files(self):
479         "Populates self._post and self._files"
480         if self._req.headers_in.has_key('content-type') and self._req.headers_in['content-type'].startswith('multipart'):
481-            self._post, self._files = http.parse_file_upload(self._req.headers_in, self.raw_post_data)
482+            self._post, self._files = http.parse_file_upload(self._req.headers_in, self._req)
483+            self._raw_post_data = None # raw data is not available for streamed multipart messages
484         else:
485             self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
486 
487Index: tests/modeltests/test_client/views.py
488===================================================================
489--- tests/modeltests/test_client/views.py       (revision 4459)
490+++ tests/modeltests/test_client/views.py       (working copy)
491@@ -22,6 +22,12 @@
492         
493     return HttpResponse(t.render(c))
494     
495+def post_file_view(request):
496+    "A view that expects a multipart post and returns a file in the context"
497+    t = Template('File {{ file.filename }} received', name='POST Template')
498+    c = Context({'file': request.FILES['file_file']})
499+    return HttpResponse(t.render(c))
500+
501 def redirect_view(request):
502     "A view that redirects all requests to the GET view"
503     return HttpResponseRedirect('/test_client/get_view/')
504@@ -32,4 +38,4 @@
505     c = Context({'user': request.user})
506     
507     return HttpResponse(t.render(c))
508-login_protected_view = login_required(login_protected_view)
509\ No newline at end of file
510+login_protected_view = login_required(login_protected_view)
511Index: tests/modeltests/test_client/models.py
512===================================================================
513--- tests/modeltests/test_client/models.py      (revision 4459)
514+++ tests/modeltests/test_client/models.py      (working copy)
515@@ -66,6 +66,20 @@
516         self.assertEqual(response.template.name, 'POST Template')
517         self.failUnless('Data received' in response.content)
518         
519+    def test_post_file_view(self):
520+        "POST this python file to a view"
521+        import os, tempfile
522+        from django.conf import settings
523+        file = __file__.replace('.pyc', '.py')
524+        for upload_dir in [None, tempfile.gettempdir()]:
525+            settings.FILE_UPLOAD_DIR = upload_dir
526+            post_data = { 'name': file, 'file': open(file) }
527+            response = self.client.post('/test_client/post_file_view/', post_data)
528+            self.failUnless('models.py' in response.context['file']['filename'])
529+            self.failUnless(len(response.context['file']['content']) == os.path.getsize(file))
530+            if upload_dir:
531+                self.failUnless(response.context['file']['tmpfilename'])
532+       
533     def test_redirect(self):
534         "GET a URL that redirects elsewhere"
535         response = self.client.get('/test_client/redirect_view/')
536Index: tests/modeltests/test_client/urls.py
537===================================================================
538--- tests/modeltests/test_client/urls.py        (revision 4459)
539+++ tests/modeltests/test_client/urls.py        (working copy)
540@@ -4,6 +4,7 @@
541 urlpatterns = patterns('',
542     (r'^get_view/$', views.get_view),
543     (r'^post_view/$', views.post_view),
544+    (r'^post_file_view/$', views.post_file_view),
545     (r'^redirect_view/$', views.redirect_view),
546     (r'^login_protected_view/$', views.login_protected_view),
547 )
548Index: docs/request_response.txt
549===================================================================
550--- docs/request_response.txt   (revision 4459)
551+++ docs/request_response.txt   (working copy)
552@@ -72,13 +72,25 @@
553 ``FILES``
554     A dictionary-like object containing all uploaded files. Each key in
555     ``FILES`` is the ``name`` from the ``<input type="file" name="" />``. Each
556-    value in ``FILES`` is a standard Python dictionary with the following three
557+    value in ``FILES`` is a standard Python dictionary with the following four
558     keys:
559 
560         * ``filename`` -- The name of the uploaded file, as a Python string.
561         * ``content-type`` -- The content type of the uploaded file.
562         * ``content`` -- The raw content of the uploaded file.
563+        * ``content-length`` -- The length of the content in bytes.
564 
565+    If streaming file uploads are enabled two additional keys
566+    describing the uploaded file will be present:
567+
568+       * ``tmpfilename`` -- The filename for the temporary file.
569+       * ``tmpfile`` -- An open file object for the temporary file.
570+
571+    The temporary file will be removed when the request finishes.
572+
573+    Note that accessing ``content`` when streaming uploads are enabled
574+    will read the whole file into memory which may not be what you want.
575+
576     Note that ``FILES`` will only contain data if the request method was POST
577     and the ``<form>`` that posted to the request had
578     ``enctype="multipart/form-data"``. Otherwise, ``FILES`` will be a blank
579Index: docs/settings.txt
580===================================================================
581--- docs/settings.txt   (revision 4459)
582+++ docs/settings.txt   (working copy)
583@@ -409,6 +409,15 @@
584 or ``django.core.mail.mail_managers``. You'll probably want to include the
585 trailing space.
586 
587+FILE_UPLOAD_DIR
588+---------------
589+
590+Default: Not defined
591+
592+Path to a directory where temporary files should be written during
593+file uploads. Leaving this unset will read files into memory.
594+
595+
596 IGNORABLE_404_ENDS
597 ------------------
598 
599Index: docs/forms.txt
600===================================================================
601--- docs/forms.txt      (revision 4459)
602+++ docs/forms.txt      (working copy)
603@@ -454,6 +454,19 @@
604    new_data = request.POST.copy()
605    new_data.update(request.FILES)
606 
607+Streaming file uploads
608+----------------------
609+
610+File uploads will be read into memory by default. This works fine for
611+small to medium sized uploads (from 1MB to 100MB depending on your
612+setup and usage). If you want to support larger uploads you can enable
613+upload streaming where only a small part of the file will be in memory
614+at any time. To do this you need to specify the ``FILE_UPLOAD_DIR``
615+setting (see the settings_ document for more details).
616+
617+See `request object`_ for more details about ``request.FILES`` objects
618+with streaming file uploads enabled.
619+
620 Validators
621 ==========
622 
623@@ -668,3 +681,4 @@
624 .. _`generic views`: ../generic_views/
625 .. _`models API`: ../model_api/
626 .. _settings: ../settings/
627+.. _request object: ../request_response/#httprequest-objects