Ticket #19237: strip_tags_parser.diff

File strip_tags_parser.diff, 1.2 KB (added by Simon Litchfield, 6 years ago)

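Parser-based version of the strip_tags filter: replaces the strip_tags_re regular expression with an HTMLParser subclass (MLStripper) that collects only the text data.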

  • html.py

    old new  
    3333html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
    3434hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
    3535trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
    36 strip_tags_re = re.compile(r'</?\S([^=]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
    3736
    3837
    3938def escape(text):
     
    116115    return '\n\n'.join(paras)
    117116linebreaks = allow_lazy(linebreaks, six.text_type)
    118117
     118from HTMLParser import HTMLParser
     119
     120class MLStripper(HTMLParser):
     121    def __init__(self):
     122        self.reset()
     123        self.fed = []
     124    def handle_data(self, d):
     125        self.fed.append(d)
     126    def get_data(self):
     127        return ''.join(self.fed)
     128
    119129def strip_tags(value):
    120130    """Returns the given HTML with all tags stripped."""
    121     return strip_tags_re.sub('', force_text(value))
     131    s = MLStripper()
     132    s.feed(value)
     133    return s.get_data()
    122134strip_tags = allow_lazy(strip_tags)
    123135
    124136def remove_tags(html, tags):
Back to Top