| | 1 | "streaming upload middleware" |
| | 2 | import cgi |
| | 3 | import os |
| | 4 | import tempfile |
| | 5 | import re |
| | 6 | import email, email.Message, email.FeedParser |
| | 7 | from email import Errors |
| | 8 | from email import Message |
| | 9 | from email.FeedParser import NLCRE, NLCRE_bol, NLCRE_eol, NLCRE_crack,headerRE, EMPTYSTRING, NL, NeedMoreData |
| | 10 | |
| | 11 | from django.conf import settings |
| | 12 | from django.utils.datastructures import MultiValueDict |
| | 13 | from django.utils import simplejson |
| | 14 | |
| | 15 | from django.core import signals |
| | 16 | from django.dispatch import dispatcher |
| | 17 | |
def delete_tempfile(sender):
    sender.delete()

class TempFileDict(dict):
    "Keeps uploaded file as a file-like object and reads its content on demand"

    def __init__(self):
        self['name'] = tempfile.mktemp()
        self['file'] = open(self['name'], "w+b")
        # Remove the temporary file once the request has been processed.
        dispatcher.connect(delete_tempfile, sender=self, signal=signals.request_finished)

    def write(self, data):
        self['file'].write(data)

    def close(self):
        self['file'].close()

    def delete(self):
        if os.path.exists(self['name']):
            os.remove(self['name'])

    def tell(self):
        return os.path.getsize(self['name'])

    def __repr__(self):
        return '<TempFileDict>'


class FileDict(dict):
    "Describes one uploaded file; its 'content' is only read from disk when asked for"

    def __init__(self, filename, contenttype, payload):
        self['filename'], self['content-type'], self['payload'] = filename, contenttype, payload

    def __getitem__(self, name):
        # Read the contents of the temporary file the first time 'content'
        # is requested, then cache it in the dict.
        if name == 'content' and 'content' not in self:
            size = self.tell()
            f = open(self['payload']['name'], 'rb')
            self['content'] = f.read(size)
            f.close()
        return dict.__getitem__(self, name)

    def tell(self):
        return self['payload'].tell()

    def __repr__(self):
        return '<FileDict>'
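
# A rough sketch (not part of the middleware) of what one FILES entry built by
# parse_streaming_file_upload() below looks like; 'upload' is a hypothetical
# field name:
#
#     file_dict = files['upload']        # files is the FILES MultiValueDict
#     file_dict['filename']              # client-supplied name, path stripped
#     file_dict['content-type']          # None if the client sent no type
#     file_dict['payload']['name']       # path of the temporary file on disk
#     file_dict['content']               # read from disk on first access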


class StreamingFileFeedParser(email.FeedParser.FeedParser):
    """
    FeedParser subclass whose _parsegen() mirrors the standard library
    implementation, except at the very end: the payload of a non-multipart
    part whose Content-Disposition carries a filename is streamed into a
    TempFileDict on disk instead of being accumulated in memory.
    """

    def _parsegen(self):
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload. This is where we diverge from the
        # stock FeedParser: parts that carry a filename are streamed into a
        # temporary file on disk instead of being collected in memory.
        name_dict = cgi.parse_header(self._cur['Content-Disposition'])[1]
        if name_dict.has_key('filename'):
            tmpfile = TempFileDict()
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                tmpfile.write(line)
            tmpfile.close()
            self._cur.set_payload(tmpfile)
        else:
            lines = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))

def parse_streaming_file_upload(req):
    "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"

    try:
        BUFFER_SIZE = settings.UPLOAD_BUFFER_SIZE
    except AttributeError:
        BUFFER_SIZE = 200000

    if hasattr(req, 'upload_state'):
        upload_state = req.upload_state(req)
    else:
        upload_state = None

    raw_headers = '\r\n'.join(['%s:%s' % pair for pair in req.header_dict.items()])
    raw_headers += '\r\n\r\n'
    parser = StreamingFileFeedParser()
    parser.feed(raw_headers)
    while 1:
        # make this a non-blocking read
        line = req.raw_request.read(BUFFER_SIZE)
        if upload_state:
            upload_state.addlen(len(line))
        parser.feed(line)
        if line == '':
            break
    msg = parser.close()
    POST = MultiValueDict()
    FILES = MultiValueDict()
    for submessage in msg.get_payload():
        if isinstance(submessage, email.Message.Message):
            name_dict = cgi.parse_header(submessage['Content-Disposition'])[1]
            # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
            # or {'name': 'blah'} for POST fields.
            # We assume all uploaded files have a 'filename' set.
            if name_dict.has_key('filename'):
                assert not isinstance(submessage.get_payload(), list), "Nested MIME messages are not supported"
                if not name_dict['filename'].strip():
                    continue
                # IE submits the full path, so trim everything but the basename.
                # (We can't use os.path.basename because it expects Linux paths.)
                filename = name_dict['filename'][name_dict['filename'].rfind("\\") + 1:]
                FILES.appendlist(name_dict['name'], FileDict(
                    filename,
                    (submessage.has_key('Content-Type') and submessage['Content-Type'] or None),
                    submessage.get_payload()
                ))
            else:
                POST.appendlist(name_dict['name'], submessage.get_payload())
    return POST, FILES
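
# Hedged usage sketch: with StreamingUploadMiddleware installed, the request
# is expected to call this function in place of the stock upload parser,
# roughly:
#
#     post, files = request.parse_file_upload(request)
#
# where post holds plain form fields and files holds FileDict entries as
# described above.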

class StreamingUploadMiddleware:

    def process_request(self, request):
        request.parse_file_upload = parse_streaming_file_upload
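
# A minimal sketch of how this might be wired up in settings.py; the module
# path 'streamingupload' is an assumption, use whatever path this file lives
# under:
#
#     MIDDLEWARE_CLASSES = (
#         'streamingupload.UploadStateMiddleware',
#         'streamingupload.StreamingUploadMiddleware',
#         ...
#     )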

def get_temp_file(identifier):
    return os.path.join(tempfile.gettempdir(), identifier)

class UploadState:

    def __init__(self, req):
        self.identifier = req.META['QUERY_STRING']
        self.state = {'size': int(req.header_dict.get('content-length')),
                      'state': 'starting', 'received': 0}
        self.save()

    def addlen(self, toadd):
        self.state['received'] += toadd
        if self.state['size'] - 1 <= self.state['received']:
            self.state['state'] = 'done'
        else:
            self.state['state'] = 'uploading'
        self.save()

    def save(self):
        f = open(get_temp_file(self.identifier), 'w')
        simplejson.dump(self.state, f)
        f.close()
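
# The progress file is plain JSON, so a finished upload would look roughly
# like this on disk (the numbers are made up):
#
#     {"received": 123456, "state": "done", "size": 123456}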

class UploadStateMiddleware:

    def process_request(self, request):

        try:
            progress_url = settings.PROGRESS_URL
        except AttributeError:
            progress_url = '/progress/'

        if request.META['QUERY_STRING']:
            request.upload_state = UploadState

        if request.path == progress_url:
            # progress_id is expected to be set on the request elsewhere; it
            # is not set anywhere in this module.
            progress_id = request.progress_id

            try:
                content = open(get_temp_file(progress_id), 'r').read()
            except IOError:
                content = "{}"
            if not content:
                content = "{}"

            from django.http import HttpResponse
            return HttpResponse(content=content, mimetype='text/plain')
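
# Rough sketch of the expected flow, pieced together from the code above and
# therefore partly an assumption:
#
#   1. The client POSTs a multipart/form-data upload to a URL carrying a
#      unique identifier in its query string; UploadStateMiddleware attaches
#      UploadState to the request, and parse_streaming_file_upload records
#      progress under that identifier in a temp file as it reads the body.
#   2. While the upload runs, the client polls PROGRESS_URL (default
#      '/progress/') and receives the JSON state written by UploadState.save();
#      how request.progress_id gets set on that request is left to other code.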