Ticket #5821: 5821_decode_entities.patch
File 5821_decode_entities.patch, 4.1 KB (added by , 17 years ago) |
---|
-
django/utils/html.py
import re
import string
from htmlentitydefs import name2codepoint

from django.utils.encoding import force_unicode
from django.utils.functional import allow_lazy

# … unchanged lines elided in the patch view …

html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')

def _entity_names(exclude=()):
    "Returns a regex alternation of the known entity names, minus `exclude`."
    # Built inside a function so the loop variable does not leak into the
    # module namespace the way a module-level list comprehension would.
    return '|'.join([name for name in name2codepoint if name not in exclude])

# A character reference is exactly one of: decimal (&#38;), hexadecimal
# (&#x26;) or named (&amp;). Each form gets its own group so that malformed
# mixes such as "&abc;", "&123;" or "&#amp;" do not match at all, instead of
# matching and then blowing up in int() or the name2codepoint lookup.
#   group 1: decimal digits, group 2: hex digits, group 3: entity name
_entity_pattern = r'&(?:#(\d+)|#[Xx]([A-Fa-f0-9]+)|(%s));'
entity_re = re.compile(_entity_pattern % _entity_names())
entity_no_escape_chars_re = re.compile(
    _entity_pattern % _entity_names(exclude=('amp', 'lt', 'gt', 'quot')))

# References to these characters stay encoded unless decode_all is set, so
# that decoding cannot introduce new markup. The check is made on the decoded
# character, which covers every spelling (&#39;, &#x27;, &#38;, ...) and does
# not reject unrelated references such as &#390; or &#x39;.
_escape_chars = u'&<>"\''
del x # Temporary variable

def escape(html):
    # … body and following lines elided in the patch view …

    # (tail of strip_entities(), whose header is among the elided lines)
    return re.sub(r'&(?:\w+|#\d+);', '', force_unicode(value))
strip_entities = allow_lazy(strip_entities, unicode)

def _entity_char(m):
    """
    Returns the unicode character for a match of one of the entity regexes,
    or None if the reference names a code point that unichr() rejects.
    """
    decimal, hexadecimal, name = m.groups()
    if name is not None:
        return unichr(name2codepoint[name])
    try:
        if decimal is not None:
            return unichr(int(decimal))
        return unichr(int(hexadecimal, 16))
    except (ValueError, OverflowError):
        # Code point outside the range this Python build can represent.
        return None

def _replace_entity(m):
    "Decodes the matched reference; undecodable ones are left as written."
    char = _entity_char(m)
    if char is None:
        return m.group(0)
    return char

def _replace_unescaped_entity(m):
    """
    Like _replace_entity(), but also leaves references to the characters in
    _escape_chars as written.
    """
    char = _entity_char(m)
    if char is None or char in _escape_chars:
        return m.group(0)
    return char

def decode_entities(html, decode_all=False):
    """
    Replaces HTML character references with their unicode equivalents.

    By default, references to ampersands, angle brackets and quotes (named
    or numeric) are left encoded; pass decode_all=True to decode those too.
    Malformed references, and numeric references to code points that
    unichr() cannot represent, are left untouched.
    """
    html = force_unicode(html)
    if decode_all:
        return entity_re.sub(_replace_entity, html)
    return entity_no_escape_chars_re.sub(_replace_unescaped_entity, html)
decode_entities = allow_lazy(decode_entities, unicode)

def fix_ampersands(value):
    "Return the given HTML with all unencoded ampersands encoded correctly."
    return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
tests/regressiontests/utils/tests.py
# NOTE: the definitions below are methods of the test case class whose header
# lies outside the patch view; the view carries no indentation, so they are
# shown here at column 0.

# (tail of the preceding test method; its header is outside the patch view)
for entity in entities:
    for in_pattern, output in patterns:
        self.check_output(f, in_pattern % {'entity': entity}, output)

def test_decode_entities(self):
    """
    decode_entities() decodes ordinary references, leaves references to
    ampersands, angle brackets and quotes encoded by default, and decodes
    those as well when decode_all=True.
    """
    f = html.decode_entities
    # NOTE(review): the patch view showed these inputs with their entities
    # already decoded (e.g. a literal u'\xd0' as the input). The named
    # spellings of the first six are reconstructed from the decoded
    # characters -- confirm against the original attachment.
    items = (
        ('&ETH;', u'\xd0'),
        ('&zeta;', u'\u03b6'),
        ('&aring;', u'\xe5'),
        ('&ntilde;', u'\xf1'),
        ('&thetasym;', u'\u03d1'),
        ('&oslash;', u'\xf8'),
        # Markup characters must come back unchanged by default.
        ('&amp;', '&amp;'),
        ('&lt;', '&lt;'),
        ('&gt;', '&gt;'),
        ('&quot;', '&quot;'),
        ('&#39;', '&#39;'),
    )
    # Substitution patterns for testing the above items.
    patterns = ("%s", "asdf%sfdsa", "%s1", "1%sb")
    for value, output in items:
        for pattern in patterns:
            self.check_output(f, pattern % value, pattern % output)
        # Check repeated values.
        self.check_output(f, value * 2, output * 2)

    # Turn the decode_all flag on
    f2 = lambda s: f(s, decode_all=True)
    items = (
        ('&ETH;', u'\xd0'),
        ('&zeta;', u'\u03b6'),
        ('&aring;', u'\xe5'),
        ('&ntilde;', u'\xf1'),
        ('&thetasym;', u'\u03d1'),
        ('&oslash;', u'\xf8'),
        # With decode_all the markup characters are decoded too.
        ('&amp;', '&'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&#39;', '\''),
    )
    for value, output in items:
        for pattern in patterns:
            self.check_output(f2, pattern % value, pattern % output)
        # Check repeated values.
        self.check_output(f2, value * 2, output * 2)

def test_fix_ampersands(self):
    f = html.fix_ampersands
    # Strings without ampersands or with ampersands already encoded.
    # … remainder of the method lies outside the patch view …