diff --git a/django/utils/html.py b/django/utils/html.py
index 650e848..21992dc 100644
a
|
b
|
from django.utils.functional import allow_lazy
|
16 | 16 | from django.utils import six |
17 | 17 | from django.utils.text import normalize_newlines |
18 | 18 | |
| 19 | from .html_parser import HTMLParser |
| 20 | |
| 21 | |
19 | 22 | # Configuration for urlize() function. |
20 | 23 | TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)'] |
21 | 24 | WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')] |
… |
… |
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
|
33 | 36 | html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) |
34 | 37 | hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) |
35 | 38 | trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z') |
36 | | strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE) |
37 | 39 | |
38 | 40 | |
39 | 41 | def escape(text): |
… |
… |
def linebreaks(value, autoescape=False):
|
116 | 118 | return '\n\n'.join(paras) |
117 | 119 | linebreaks = allow_lazy(linebreaks, six.text_type) |
118 | 120 | |
| 121 | |
| 122 | class MLStripper(HTMLParser): |
| 123 | def __init__(self): |
| 124 | HTMLParser.__init__(self) |
| 125 | self.reset() |
| 126 | self.fed = [] |
| 127 | def handle_data(self, d): |
| 128 | self.fed.append(d) |
| 129 | def handle_entityref(self, name): |
| 130 | self.fed.append('&%s;' % name) |
| 131 | def handle_charref(self, name): |
| 132 | self.fed.append('&#%s;' % name) |
| 133 | def get_data(self): |
| 134 | return ''.join(self.fed) |
| 135 | |
119 | 136 | def strip_tags(value): |
120 | 137 | """Returns the given HTML with all tags stripped.""" |
121 | | return strip_tags_re.sub('', force_text(value)) |
| 138 | s = MLStripper() |
| 139 | s.feed(value) |
| 140 | return s.get_data() |
122 | 141 | strip_tags = allow_lazy(strip_tags) |
123 | 142 | |
124 | 143 | def remove_tags(html, tags): |
diff --git a/tests/utils_tests/html.py b/tests/utils_tests/html.py
index 090cc32..04c1569 100644
a
|
b
|
import os
|
5 | 5 | |
6 | 6 | from django.utils import html |
7 | 7 | from django.utils._os import upath |
| 8 | from django.utils.encoding import force_text |
8 | 9 | from django.utils.unittest import TestCase |
9 | 10 | |
10 | 11 | |
… |
… |
class TestUtilsHtml(TestCase):
|
63 | 64 | def test_strip_tags(self): |
64 | 65 | f = html.strip_tags |
65 | 66 | items = ( |
| 67 | ('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>', |
| 68 | 'See: &#39;&eacute; is an apostrophe followed by e acute'), |
66 | 69 | ('<adf>a', 'a'), |
67 | 70 | ('</adf>a', 'a'), |
68 | 71 | ('<asdf><asdf>e', 'e'), |
69 | | ('<f', '<f'), |
| 72 | ('<f x', '<f x'), |
70 | 73 | ('</fe', '</fe'), |
71 | 74 | ('<x>b<y>', 'b'), |
72 | 75 | ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'), |
… |
… |
class TestUtilsHtml(TestCase):
|
81 | 84 | for filename in ('strip_tags1.html', 'strip_tags2.txt'): |
82 | 85 | path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename) |
83 | 86 | with open(path, 'r') as fp: |
| 87 | content = force_text(fp.read()) |
84 | 88 | start = datetime.now() |
85 | | stripped = html.strip_tags(fp.read()) |
| 89 | stripped = html.strip_tags(content) |
86 | 90 | elapsed = datetime.now() - start |
87 | 91 | self.assertEqual(elapsed.seconds, 0) |
88 | 92 | self.assertIn("Please try again.", stripped) |