Context Navigation

Ticket #7267: 7267.patch

File 7267.patch, 2.3 KB (added by Aymeric Augustin, 14 years ago)

+        )
         for value, output in items:
             self.check_output(f, value, output)
+    def test_clean_html(self):
+        f = html.clean_html
+        items = (
+            (u'<p>I <i>believe</i> in <b>semantic markup</b>!</p>', u'<p>I <em>believe</em> in <strong>semantic markup</strong>!</p>'),
+            (u'I escape & I don\'t <a href="#" target="_blank">target</a>', u'I escape &amp; I don\'t <a href="#" >target</a>'),
+            (u'<p>I kill<br />whitespace</p><p>&nbsp;</p>', u'<p>I kill whitespace</p>'),
+            # also a regression test for #7267: this used to raise a UnicodeDecodeError
+            (u'<p>* foo</p><p>* bar</p>', u'<ul>\n<li> foo</li><li> bar</li>\n</ul>'),
+        )
+        for value, output in items:
+            self.check_output(f, value, output)

 TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
 # List of possible strings used for bullets in bulleted lists.
 DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
+DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;']
 unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
 word_split_re = re.compile(r'(\s+)')
 …
     text = html_gunk_re.sub('', text)
     # Convert hard-coded bullets into HTML unordered lists.
     def replace_p_tags(match):
         s = match.group().replace('</p>', '</li>')
+        s = match.group().replace(u'</p>', u'</li>')
         for d in DOTS:
             s = s.replace('<p>%s' % d, '<li>')
+            s = s.replace(u'<p>%s' % d, u'<li>')
         return u'<ul>\n%s\n</ul>' % s
     text = hard_coded_bullets_re.sub(replace_p_tags, text)
     # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom
     # of the text.
     text = trailing_empty_content_re.sub('', text)
+    text = trailing_empty_content_re.sub(u'', text)
     return text
 clean_html = allow_lazy(clean_html, unicode)