Context Navigation

Back to Ticket #5821

Ticket #5821: 5821_decode_entities.patch

File 5821_decode_entities.patch, 4.1 KB (added by Samuel Adam <samuel.adam@…>, 17 years ago)
decode_entities and tests

django/utils/html.py

 import re
 import string
+from htmlentitydefs import name2codepoint
 from django.utils.encoding import force_unicode
 from django.utils.functional import allow_lazy
 …
 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
 trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
+entity_re = re.compile("&(#?)([Xx]?)(\d+|[A-Fa-f0-9]+|%s);" % '|'.join(name2codepoint))
+entity_no_escape_chars_re = re.compile(r"&(#?)([Xx]?)((?!39)(\d+|[A-Fa-f0-9]+)|%s);" % '|'.join([k for k in name2codepoint if k not in ('amp', 'lt', 'gt', 'quot')]))
 del x # Temporary variable
 def escape(html):
 …
     return re.sub(r'&(?:\w+|#\d+);', '', force_unicode(value))
 strip_entities = allow_lazy(strip_entities, unicode)
+def _replace_entity(m):
+    entity = m.group(3)
+    if m.group(1) == '#':
+        val = int(entity, m.group(2) == '' and 10 or 16)
+    else:
+        val = name2codepoint[entity]
+    return unichr(val)
+def decode_entities(html, decode_all=False):
+    """
+    Replaces HTML entities with unicode equivalents.
+    Ampersands, quotes and carets are not replaced by default.
+    """
+    regexp = decode_all and entity_re or entity_no_escape_chars_re
+    return regexp.sub(_replace_entity, force_unicode(html))
+decode_entities = allow_lazy(decode_entities, unicode)
 def fix_ampersands(value):
     "Return the given HTML with all unencoded ampersands encoded correctly."
     return unencoded_ampersands_re.sub('&amp;', force_unicode(value))

tests/regressiontests/utils/tests.py

         for entity in entities:
             for in_pattern, output in patterns:
                 self.check_output(f, in_pattern % {'entity': entity}, output)
+    def test_decode_entities(self):
+        f = html.decode_entities
+        items = (
+            ('&#x00D0;', u'\xd0'),
+            ('&#950;', u'\u03b6'),
+            ('&#X00000e5;', u'\xe5'),
+            ('&ntilde;', u'\xf1'),
+            ('&thetasym;', u'\u03d1'),
+            ('&#0000248;', u'\xf8'),
+            ('&amp;', '&amp;'),
+            ('&lt;', '&lt;'),
+            ('&gt;', '&gt;'),
+            ('&quot;', '&quot;'),
+            ('&#39;', '&#39;'),
+        )
+        # Substitution patterns for testing the above items.
+        patterns = ("%s", "asdf%sfdsa", "%s1", "1%sb")
+        for value, output in items:
+            for pattern in patterns:
+                self.check_output(f, pattern % value, pattern % output)
+            # Check repeated values.
+            self.check_output(f, value * 2, output * 2)
+        # Turn the decode_all flag on
+        f2 = lambda s: f(s,decode_all=True)
+        items = (
+            ('&#x00D0;', u'\xd0'),
+            ('&#950;', u'\u03b6'),
+            ('&#X00000e5;', u'\xe5'),
+            ('&ntilde;', u'\xf1'),
+            ('&thetasym;', u'\u03d1'),
+            ('&#0000248;', u'\xf8'),
+            ('&amp;', '&'),
+            ('&lt;', '<'),
+            ('&gt;', '>'),
+            ('&quot;', '"'),
+            ('&#39;', '\''),
+        )
+        for value, output in items:
+            for pattern in patterns:
+                self.check_output(f2, pattern % value, pattern % output)
+            # Check repeated values.
+            self.check_output(f2, value * 2, output * 2)
     def test_fix_ampersands(self):
         f = html.fix_ampersands
         # Strings without ampersands or with ampersands already encoded.

Download in other formats:

Original Format

Issues

Context Navigation

Ticket #5821: 5821_decode_entities.patch

django/utils/html.py

tests/regressiontests/utils/tests.py

Download in other formats:

Django Links

Learn More

Get Involved

Get Help

Follow Us

Support Us