Ticket #19237: 19237-parser-3.diff

File 19237-parser-3.diff, 3.4 KB (added by Claude Paroz, 12 years ago)

HTMLParser: Don't swallow unclosed tags

  • django/utils/html.py

    diff --git a/django/utils/html.py b/django/utils/html.py
    index 650e848..86d581d 100644
    --- a/django/utils/html.py
    +++ b/django/utils/html.py
    @@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy
     from django.utils import six
     from django.utils.text import normalize_newlines
     
    +from .html_parser import HTMLParser
    +
    +
     # Configuration for urlize() function.
     TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
     WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
    @@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
     html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
     hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
     trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
    -strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
     
     
     def escape(text):
    @@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
         return '\n\n'.join(paras)
     linebreaks = allow_lazy(linebreaks, six.text_type)
     
    +
    +class MLStripper(HTMLParser):
    +    def __init__(self):
    +        HTMLParser.__init__(self)
    +        self.reset()
    +        self.fed = []
    +    def handle_data(self, d):
    +        self.fed.append(d)
    +    def handle_entityref(self, name):
    +        self.fed.append('&%s;' % name)
    +    def handle_charref(self, name):
    +        self.fed.append('&#%s;' % name)
    +    def get_data(self):
    +        return ''.join(self.fed)
    +
     def strip_tags(value):
         """Returns the given HTML with all tags stripped."""
    -    return strip_tags_re.sub('', force_text(value))
    +    s = MLStripper()
    +    s.feed(value)
    +    data = s.get_data()
    +    try:
    +        res = s.close()
    +    except Exception as e:
    +        data += s.rawdata
    +    return data
     strip_tags = allow_lazy(strip_tags)
     
     def remove_tags(html, tags):
  • tests/utils_tests/html.py

    diff --git a/tests/utils_tests/html.py b/tests/utils_tests/html.py
    index 090cc32..c3e9f7c 100644
    --- a/tests/utils_tests/html.py
    +++ b/tests/utils_tests/html.py
    @@ -5,6 +5,7 @@ import os
     
     from django.utils import html
     from django.utils._os import upath
    +from django.utils.encoding import force_text
     from django.utils.unittest import TestCase
     
     
    @@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase):
         def test_strip_tags(self):
             f = html.strip_tags
             items = (
    +            ('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>',
    +             'See: &#39;&eacute; is an apostrophe followed by e acute'),
                 ('<adf>a', 'a'),
                 ('</adf>a', 'a'),
                 ('<asdf><asdf>e', 'e'),
    -            ('<f', '<f'),
    +            ('hi, <f x', 'hi, <f x'),
                 ('</fe', '</fe'),
                 ('<x>b<y>', 'b'),
                 ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
    @@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase):
             for filename in ('strip_tags1.html', 'strip_tags2.txt'):
                 path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
                 with open(path, 'r') as fp:
    +                content = force_text(fp.read())
                     start = datetime.now()
    -                stripped = html.strip_tags(fp.read())
    +                stripped = html.strip_tags(content)
                     elapsed = datetime.now() - start
                 self.assertEqual(elapsed.seconds, 0)
                 self.assertIn("Please try again.", stripped)
Back to Top