Ticket #5821: 5821_decode_entities.2.patch
File 5821_decode_entities.2.patch, 4.1 KB (added 17 years ago) |
---|
-
django/utils/html.py
2 2 3 3 import re 4 4 import string 5 from htmlentitydefs import name2codepoint 5 6 6 7 from django.utils.encoding import force_unicode 7 8 from django.utils.functional import allow_lazy … … 23 24 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) 24 25 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) 25 26 trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z') 27 entity_re = re.compile("&(#?)([Xx]?)(\d+|[A-Fa-f0-9]+|%s);" % '|'.join(name2codepoint)) 28 entity_no_escape_chars_re = re.compile(r"&(#?)([Xx]?)((?!39;)(\d+|[A-Fa-f0-9]+)|%s);" % '|'.join([k for k in name2codepoint if k not in ('amp', 'lt', 'gt', 'quot')])) 26 29 del x # Temporary variable 27 30 28 31 def escape(html): … … 53 56 return re.sub(r'&(?:\w+|#\d+);', '', force_unicode(value)) 54 57 strip_entities = allow_lazy(strip_entities, unicode) 55 58 59 def _replace_entity(m): 60 entity = m.group(3) 61 if m.group(1) == '#': 62 val = int(entity, m.group(2) == '' and 10 or 16) 63 else: 64 val = name2codepoint[entity] 65 return unichr(val) 66 67 def decode_entities(html, decode_all=False): 68 """ 69 Replaces HTML entities with unicode equivalents. 70 Ampersands, quotes and carets are not replaced by default. 71 72 """ 73 regexp = decode_all and entity_re or entity_no_escape_chars_re 74 return regexp.sub(_replace_entity, force_unicode(html)) 75 decode_entities = allow_lazy(decode_entities, unicode) 76 56 77 def fix_ampersands(value): 57 78 "Return the given HTML with all unencoded ampersands encoded correctly." 58 79 return unencoded_ampersands_re.sub('&amp;', force_unicode(value)) -
tests/regressiontests/utils/tests.py
94 94 for entity in entities: 95 95 for in_pattern, output in patterns: 96 96 self.check_output(f, in_pattern % {'entity': entity}, output) 97 98 def test_decode_entities(self): 99 f = html.decode_entities 100 items = ( 101 ('&ETH;', u'\xd0'), 102 ('&zeta;', u'\u03b6'), 103 ('&aring;', u'\xe5'), 104 ('&ntilde;', u'\xf1'), 105 ('&thetasym;', u'\u03d1'), 106 ('&oslash;', u'\xf8'), 107 ('&#x186;', u'\u0186'), 108 ('&#39;;', '&#39;;'), 109 ('&#39;', '&#39;'), 110 ('&amp;', '&amp;'), 111 ('&lt;', '&lt;'), 112 ('&gt;', '&gt;'), 113 ('&quot;', '&quot;'), 114 ) 115 # Substitution patterns for testing the above items. 116 patterns = ("%s", "asdf%sfdsa", "%s1", "1%sb") 117 for value, output in items: 118 for pattern in patterns: 119 self.check_output(f, pattern % value, pattern % output) 120 # Check repeated values. 121 self.check_output(f, value * 2, output * 2) 97 122 123 # Decode all entities, including ampersands, quotes and carets. 124 f2 = lambda s: f(s,decode_all=True) 125 items = ( 126 ('&ETH;', u'\xd0'), 127 ('&zeta;', u'\u03b6'), 128 ('&#x186;', u'\u0186'), 129 ('&amp;', '&'), 130 ('&lt;', '<'), 131 ('&gt;', '>'), 132 ('&quot;', '"'), 133 ('&#39;', '\''), 134 ('&#39;;', '\';'), 135 ) 136 for value, output in items: 137 for pattern in patterns: 138 self.check_output(f2, pattern % value, pattern % output) 139 # Check repeated values. 140 self.check_output(f2, value * 2, output * 2) 141 98 142 def test_fix_ampersands(self): 99 143 f = html.fix_ampersands 100 144 # Strings without ampersands or with ampersands already encoded.