Ticket #5821: 5821_decode_entities.2.patch

File 5821_decode_entities.2.patch, 4.1 KB (added by Samuel Adam <samuel.adam@…>, 8 years ago)

decode_entities and test (fixed regexp)

  • django/utils/html.py

     
    22
    33import re
    44import string
     5from htmlentitydefs import name2codepoint
    56
    67from django.utils.encoding import force_unicode
    78from django.utils.functional import allow_lazy
     
    2324html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
    2425hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
    2526trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
     27entity_re = re.compile("&(#?)([Xx]?)(\d+|[A-Fa-f0-9]+|%s);" % '|'.join(name2codepoint))
     28entity_no_escape_chars_re = re.compile(r"&(#?)([Xx]?)((?!39;)(\d+|[A-Fa-f0-9]+)|%s);" % '|'.join([k for k in name2codepoint if k not in ('amp', 'lt', 'gt', 'quot')]))
    2629del x # Temporary variable
    2730
    2831def escape(html):
     
    5356    return re.sub(r'&(?:\w+|#\d+);', '', force_unicode(value))
    5457strip_entities = allow_lazy(strip_entities, unicode)
    5558
     59def _replace_entity(m):
     60    entity = m.group(3)
     61    if m.group(1) == '#':
     62        val = int(entity, m.group(2) == '' and 10 or 16)
     63    else:
     64        val = name2codepoint[entity]
     65    return unichr(val)
     66
     67def decode_entities(html, decode_all=False):
     68    """
     69    Replaces HTML entities with unicode equivalents.
     70    Ampersands, quotes and carets are not replaced by default.
     71   
     72    """
     73    regexp = decode_all and entity_re or entity_no_escape_chars_re
     74    return regexp.sub(_replace_entity, force_unicode(html))
     75decode_entities = allow_lazy(decode_entities, unicode)
     76
    5677def fix_ampersands(value):
    5778    "Return the given HTML with all unencoded ampersands encoded correctly."
    5879    return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
  • tests/regressiontests/utils/tests.py

     
    9494        for entity in entities:
    9595            for in_pattern, output in patterns:
    9696                self.check_output(f, in_pattern % {'entity': entity}, output)
     97               
    def test_decode_entities(self):
        """
        decode_entities() keeps the escape entities by default and decodes
        every entity when called with decode_all=True.
        """
        decode = html.decode_entities
        # Substitution patterns used to embed each item in surrounding text.
        patterns = ("%s", "asdf%sfdsa", "%s1", "1%sb")

        def check_items(func, items):
            for value, output in items:
                for pattern in patterns:
                    self.check_output(func, pattern % value, pattern % output)
                # Check repeated values.
                self.check_output(func, value * 2, output * 2)

        # Default behaviour: the escape entities must come back untouched.
        check_items(decode, (
            ('&#x00D0;', u'\xd0'),
            ('&#950;', u'\u03b6'),
            ('&#X00000e5;', u'\xe5'),
            ('&ntilde;', u'\xf1'),
            ('&thetasym;', u'\u03d1'),
            ('&#0000248;', u'\xf8'),
            ('&#390;', u'\u0186'),
            ('&#39;;', '&#39;;'),
            ('&#39;', '&#39;'),
            ('&amp;', '&amp;'),
            ('&lt;', '&lt;'),
            ('&gt;', '&gt;'),
            ('&quot;', '&quot;'),
        ))

        # Decode all entities, including the escape entities.
        def decode_everything(s):
            return decode(s, decode_all=True)

        check_items(decode_everything, (
            ('&#x00D0;', u'\xd0'),
            ('&#950;', u'\u03b6'),
            ('&#390;', u'\u0186'),
            ('&amp;', '&'),
            ('&lt;', '<'),
            ('&gt;', '>'),
            ('&quot;', '"'),
            ('&#39;', '\''),
            ('&#39;;', '\';'),
        ))

     141           
    98142    def test_fix_ampersands(self):
    99143        f = html.fix_ampersands
    100144        # Strings without ampersands or with ampersands already encoded.
Back to Top