| 1 | "streaming upload middleware" |
| 2 | import cgi |
| 3 | import os |
| 4 | import tempfile |
| 5 | import re |
| 6 | import email, email.Message, email.FeedParser |
| 7 | from email import Errors |
| 8 | from email import Message |
| 9 | from email.FeedParser import NLCRE, NLCRE_bol, NLCRE_eol, NLCRE_crack,headerRE, EMPTYSTRING, NL, NeedMoreData |
| 10 | |
| 11 | from django.conf import settings |
| 12 | from django.utils.datastructures import MultiValueDict |
| 13 | from django.utils import simplejson |
| 14 | |
| 15 | from django.core import signals |
| 16 | from django.dispatch import dispatcher |
| 17 | |
def delete_tempfile(sender):
    sender.delete()

class TempFileDict(dict):
    "Keeps an uploaded file as a file-like object and reads its content on demand"

    def __init__(self):
        # mkstemp avoids the filename race inherent in tempfile.mktemp().
        fd, self['name'] = tempfile.mkstemp()
        self['file'] = os.fdopen(fd, "w+b")
        # Remove the temporary file once the request has been handled.
        dispatcher.connect(delete_tempfile, sender=self, signal=signals.request_finished)

    def write(self, data):
        self['file'].write(data)

    def close(self):
        self['file'].close()

    def delete(self):
        if os.path.exists(self['name']):
            os.remove(self['name'])

    def tell(self):
        return os.path.getsize(self['name'])

    def __repr__(self):
        return '<TempFileDict>'

class FileDict(dict):
    "Describes an uploaded file; its body is read lazily from the temporary file"

    def __init__(self, filename, contenttype, payload):
        self['filename'], self['content-type'], self['payload'] = filename, contenttype, payload

    def __getitem__(self, name):
        if name == 'content' and 'content' not in self:
            # Read the whole temporary file the first time the content is requested.
            size = self.tell()
            f = open(self['payload']['name'], 'rb')
            self['content'] = f.read(size)
            f.close()
        return dict.__getitem__(self, name)

    def tell(self):
        return self['payload'].tell()

    def __repr__(self):
        return '<FileDict>'
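# Rough usage sketch (illustrative only; the field name is hypothetical): a
# FileDict built by parse_streaming_file_upload() below behaves like the dicts
# Django normally puts in request.FILES, except that 'content' is only read
# back from the temporary file when it is first accessed:
#
#     upload = FILES['attachment']     # a FileDict instance
#     upload['filename']               # e.g. 'report.txt'
#     upload['content-type']           # e.g. 'text/plain', or None
#     data = upload['content']         # reads the temp file on first access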

class StreamingFileFeedParser(email.FeedParser.FeedParser):
    """A FeedParser whose _parsegen streams non-multipart parts that carry a
    filename in their Content-Disposition into a TempFileDict on disk instead
    of keeping them in memory (see the end of _parsegen)."""

    def _parsegen(self):
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.  This is where this parser differs
        # from the stock FeedParser: parts that carry a filename are streamed
        # into a temporary file instead of being accumulated in memory.
        name_dict = cgi.parse_header(self._cur.get('Content-Disposition', ''))[1]
        if 'filename' in name_dict:
            tmpfile = TempFileDict()
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                tmpfile.write(line)
            tmpfile.close()
            self._cur.set_payload(tmpfile)
        else:
            lines = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))

def parse_streaming_file_upload(req):
    "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"

    BUFFER_SIZE = getattr(settings, 'UPLOAD_BUFFER_SIZE', 200000)

    if hasattr(req, 'upload_state'):
        upload_state = req.upload_state(req)
    else:
        upload_state = None

    raw_headers = '\r\n'.join(['%s: %s' % pair for pair in req.header_dict.items()])
    raw_headers += '\r\n\r\n'
    parser = StreamingFileFeedParser()
    parser.feed(raw_headers)
    while 1:
        # TODO: make this a non-blocking read
        line = req.raw_request.read(BUFFER_SIZE)
        if upload_state:
            upload_state.addlen(len(line))
        parser.feed(line)
        if line == '':
            break
    msg = parser.close()
    POST = MultiValueDict()
    FILES = MultiValueDict()
    for submessage in msg.get_payload():
        if isinstance(submessage, email.Message.Message):
            name_dict = cgi.parse_header(submessage['Content-Disposition'])[1]
            # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
            # or {'name': 'blah'} for POST fields.
            # We assume all uploaded files have a 'filename' set.
            if 'filename' in name_dict:
                assert not isinstance(submessage.get_payload(), list), "Nested MIME messages are not supported"
                if not name_dict['filename'].strip():
                    continue
                # IE submits the full path, so trim everything but the basename.
                # (os.path.basename can't be used because it splits on the
                # server's path separator, not the client's backslashes.)
                filename = name_dict['filename'][name_dict['filename'].rfind("\\") + 1:]
                FILES.appendlist(name_dict['name'], FileDict(
                    filename,
                    submessage.get('Content-Type'),
                    submessage.get_payload()
                ))
            else:
                POST.appendlist(name_dict['name'], submessage.get_payload())
    return POST, FILES
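# Rough usage sketch (illustrative; assumes a request object that exposes
# header_dict and raw_request, which is what this function expects):
#
#     POST, FILES = parse_streaming_file_upload(request)
#     for upload in FILES.getlist('attachment'):      # 'attachment' is a hypothetical field name
#         print upload['filename'], upload.tell()     # name and size on disk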

class StreamingUploadMiddleware:

    def process_request(self, request):
        request.parse_file_upload = parse_streaming_file_upload
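# Wiring sketch (an assumption, not part of this module): the middleware would
# be listed in MIDDLEWARE_CLASSES, and UPLOAD_BUFFER_SIZE is the optional
# setting read by parse_streaming_file_upload() above, e.g.
#
#     MIDDLEWARE_CLASSES = (
#         'path.to.this.module.StreamingUploadMiddleware',   # hypothetical path
#         ...
#     )
#     UPLOAD_BUFFER_SIZE = 200000    # bytes read per chunk (the default used above)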

def get_temp_file(identifier):
    return os.path.join(tempfile.gettempdir(), identifier)

class UploadState:

    def __init__(self, req):
        self.identifier = req.META['QUERY_STRING']
        self.state = {'size': int(req.header_dict.get('content-length', 0)),
                      'state': 'starting', 'received': 0}
        self.save()

    def addlen(self, toadd):
        self.state['received'] = self.state['received'] + toadd
        if self.state['size'] - 1 <= self.state['received']:
            self.state['state'] = 'done'
        else:
            self.state['state'] = 'uploading'
        self.save()

    def save(self):
        state_file = open(get_temp_file(self.identifier), 'w')
        simplejson.dump(self.state, state_file)
        state_file.close()
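# The progress file holds a small JSON document built from the keys above, for
# example (illustrative values):
#
#     {"size": 1048576, "state": "uploading", "received": 200000}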

class UploadStateMiddleware:

    def process_request(self, request):
        progress_url = getattr(settings, 'PROGRESS_URL', '/progress/')

        if request.META['QUERY_STRING']:
            request.upload_state = UploadState

        if request.path == progress_url:
            progress_id = request.progress_id

            try:
                content = open(get_temp_file(progress_id), 'r').read()
            except IOError:
                content = "{}"
            if not content:
                content = "{}"

            from django.http import HttpResponse
            return HttpResponse(content=content, mimetype='text/plain')
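# How the pieces fit together (a sketch under assumptions, since the code that
# sets request.progress_id is not shown here): the upload POST carries an
# identifier in its query string, UploadState writes progress JSON to a temp
# file named after that identifier, and a client polls PROGRESS_URL (default
# '/progress/') to read it back, e.g.
#
#     POST /upload/?a1b2c3              -> progress written to <tempdir>/a1b2c3   (URL and id are illustrative)
#     GET  /progress/   (progress_id)   -> {"size": ..., "state": ..., "received": ...}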