Ticket #19237: strip_tags_parser.diff
File strip_tags_parser.diff, 1.2 KB (added 12 years ago)
---|
-
--- html.py
+++ html.py
@@ -33,7 +33,6 @@
 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
 trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z')
-strip_tags_re = re.compile(r'</?\S([^=]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
 
 
 def escape(text):
@@ -116,9 +115,22 @@
     return '\n\n'.join(paras)
 linebreaks = allow_lazy(linebreaks, six.text_type)
 
+from HTMLParser import HTMLParser
+
+class MLStripper(HTMLParser):
+    def __init__(self):
+        self.reset()
+        self.fed = []
+    def handle_data(self, d):
+        self.fed.append(d)
+    def get_data(self):
+        return ''.join(self.fed)
+
 def strip_tags(value):
     """Returns the given HTML with all tags stripped."""
-    return strip_tags_re.sub('', force_text(value))
+    s = MLStripper()
+    s.feed(value)
+    return s.get_data()
 strip_tags = allow_lazy(strip_tags)
 
 def remove_tags(html, tags):