Context Navigation

Back to Ticket #19237

Ticket #19237: 19237-parser-3.diff

File 19237-parser-3.diff, 3.4 KB (added by Claude Paroz, 12 years ago)
HTMLParser: Don't swallow unclosed tags

django/utils/html.py

diff --git a/django/utils/html.py b/django/utils/html.py
index 650e848..86d581d 100644

@@ … @@ from django.utils.functional import allow_lazy
 from django.utils import six
 from django.utils.text import normalize_newlines
+from .html_parser import HTMLParser
 # Configuration for urlize() function.
 TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
 WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
@@ … @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
 trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
-strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
 def escape(text):
@@ … @@ def linebreaks(value, autoescape=False):
     return '\n\n'.join(paras)
 linebreaks = allow_lazy(linebreaks, six.text_type)
+class MLStripper(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.reset()
+        self.fed = []
+    def handle_data(self, d):
+        self.fed.append(d)
+    def handle_entityref(self, name):
+        self.fed.append('&%s;' % name)
+    def handle_charref(self, name):
+        self.fed.append('&#%s;' % name)
+    def get_data(self):
+        return ''.join(self.fed)
 def strip_tags(value):
     """Returns the given HTML with all tags stripped."""
-    return strip_tags_re.sub('', force_text(value))
+    s = MLStripper()
+    s.feed(value)
+    data = s.get_data()
+    try:
+        res = s.close()
+    except Exception as e:
+        data += s.rawdata
+    return data
 strip_tags = allow_lazy(strip_tags)
 def remove_tags(html, tags):

tests/utils_tests/html.py

diff --git a/tests/utils_tests/html.py b/tests/utils_tests/html.py
index 090cc32..c3e9f7c 100644

@@ … @@ import os
 from django.utils import html
 from django.utils._os import upath
+from django.utils.encoding import force_text
 from django.utils.unittest import TestCase
@@ … @@ class TestUtilsHtml(TestCase):
     def test_strip_tags(self):
         f = html.strip_tags
         items = (
+            ('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>',
+             'See: &#39;&eacute; is an apostrophe followed by e acute'),
             ('<adf>a', 'a'),
             ('</adf>a', 'a'),
             ('<asdf><asdf>e', 'e'),
             ('<f', '<f'),
+            ('hi, <f x', 'hi, <f x'),
             ('</fe', '</fe'),
             ('<x>b<y>', 'b'),
             ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
@@ … @@ class TestUtilsHtml(TestCase):
         for filename in ('strip_tags1.html', 'strip_tags2.txt'):
             path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
             with open(path, 'r') as fp:
+                content = force_text(fp.read())
                 start = datetime.now()
-                stripped = html.strip_tags(fp.read())
+                stripped = html.strip_tags(content)
                 elapsed = datetime.now() - start
             self.assertEqual(elapsed.seconds, 0)
             self.assertIn("Please try again.", stripped)

Download in other formats:

Original Format

Issues

Context Navigation

Ticket #19237: 19237-parser-3.diff

django/utils/html.py

tests/utils_tests/html.py

Download in other formats:

Django Links

Learn More

Get Involved

Get Help

Follow Us

Support Us