Ticket #5821: 5821_decode_entities.patch

File 5821_decode_entities.patch, 4.1 KB (added by Samuel Adam <samuel.adam@…>, 8 years ago)

decode_entities and tests

  • django/utils/html.py

     
    22
    33import re
    44import string
     5from htmlentitydefs import name2codepoint
    56
    67from django.utils.encoding import force_unicode
    78from django.utils.functional import allow_lazy
     
    2324html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
    2425hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
    2526trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
     27entity_re = re.compile("&(#?)([Xx]?)(\d+|[A-Fa-f0-9]+|%s);" % '|'.join(name2codepoint))
     28entity_no_escape_chars_re = re.compile(r"&(#?)([Xx]?)((?!39)(\d+|[A-Fa-f0-9]+)|%s);" % '|'.join([k for k in name2codepoint if k not in ('amp', 'lt', 'gt', 'quot')]))
    2629del x # Temporary variable
    2730
    2831def escape(html):
     
    5356    return re.sub(r'&(?:\w+|#\d+);', '', force_unicode(value))
    5457strip_entities = allow_lazy(strip_entities, unicode)
    5558
     59def _replace_entity(m):
     60    entity = m.group(3)
     61    if m.group(1) == '#':
     62        val = int(entity, m.group(2) == '' and 10 or 16)
     63    else:
     64        val = name2codepoint[entity]
     65    return unichr(val)
     66
     67def decode_entities(html, decode_all=False):
     68    """
     69    Replaces HTML entities with unicode equivalents.
     70   
     71    Ampersands, quotes and carets are not replaced by default.
     72    """
     73    regexp = decode_all and entity_re or entity_no_escape_chars_re
     74    return regexp.sub(_replace_entity, force_unicode(html))
     75decode_entities = allow_lazy(decode_entities, unicode)
     76
    5677def fix_ampersands(value):
    5778    "Return the given HTML with all unencoded ampersands encoded correctly."
    5879    return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
  • tests/regressiontests/utils/tests.py

     
    9494        for entity in entities:
    9595            for in_pattern, output in patterns:
    9696                self.check_output(f, in_pattern % {'entity': entity}, output)
     97               
     98    def test_decode_entities(self):
     99        f = html.decode_entities
     100        items = (
     101            ('&#x00D0;', u'\xd0'),
     102            ('&#950;', u'\u03b6'),
     103            ('&#X00000e5;', u'\xe5'),
     104            ('&ntilde;', u'\xf1'),
     105            ('&thetasym;', u'\u03d1'),
     106            ('&#0000248;', u'\xf8'),
     107            ('&amp;', '&amp;'),
     108            ('&lt;', '&lt;'),
     109            ('&gt;', '&gt;'),
     110            ('&quot;', '&quot;'),
     111            ('&#39;', '&#39;'),
     112        )
     113        # Substitution patterns for testing the above items.
     114        patterns = ("%s", "asdf%sfdsa", "%s1", "1%sb")
     115        for value, output in items:
     116            for pattern in patterns:
     117                self.check_output(f, pattern % value, pattern % output)
     118            # Check repeated values.
     119            self.check_output(f, value * 2, output * 2)
    97120
     121        # Turn the decode_all flag on
     122        f2 = lambda s: f(s,decode_all=True)
     123        items = (
     124            ('&#x00D0;', u'\xd0'),
     125            ('&#950;', u'\u03b6'),
     126            ('&#X00000e5;', u'\xe5'),
     127            ('&ntilde;', u'\xf1'),
     128            ('&thetasym;', u'\u03d1'),
     129            ('&#0000248;', u'\xf8'),
     130            ('&amp;', '&'),
     131            ('&lt;', '<'),
     132            ('&gt;', '>'),
     133            ('&quot;', '"'),
     134            ('&#39;', '\''),
     135        )
     136        for value, output in items:
     137            for pattern in patterns:
     138                self.check_output(f2, pattern % value, pattern % output)
     139            # Check repeated values.
     140            self.check_output(f2, value * 2, output * 2)
     141           
    98142    def test_fix_ampersands(self):
    99143        f = html.fix_ampersands
    100144        # Strings without ampersands or with ampersands already encoded.
Back to Top