Code

Ticket #18239: 01_use_stdlib_htmlparser_when_possible.diff

File 01_use_stdlib_htmlparser_when_possible.diff, 8.9 KB (added by rhertzog, 2 years ago)

Patch for Django 1.4.1

Line 
1Description: Do not use Django's custom HTMLParser when the stdlib version works
2 Django provides its own HTMLParser class, derived from the stdlib's
3 HTMLParser, to work around a bug (http://bugs.python.org/issue670664).
4 Unfortunately, this derived class breaks once the stdlib's HTMLParser is fixed.
5 .
6 Thus we modify Django to use the derived class only with Python
7 versions that are known to be affected by the problem.
8Author: David Watson <david@planetwatson.co.uk>
9Reviewed-by: Raphaël Hertzog <hertzog@debian.org>
10Bug: https://code.djangoproject.com/ticket/18239
11Bug-Debian: http://bugs.debian.org/683648
12
13--- python-django-1.4.1.orig/django/utils/html_parser.py
14+++ python-django-1.4.1/django/utils/html_parser.py
15@@ -1,98 +1,109 @@
16 import HTMLParser as _HTMLParser
17 import re
18 
19+import sys
20+current_version = sys.version_info
21 
22-class HTMLParser(_HTMLParser.HTMLParser):
23-    """
24-    Patched version of stdlib's HTMLParser with patch from:
25-    http://bugs.python.org/issue670664
26-    """
27-    def __init__(self):
28-        _HTMLParser.HTMLParser.__init__(self)
29-        self.cdata_tag = None
30-
31-    def set_cdata_mode(self, tag):
32-        try:
33-            self.interesting = _HTMLParser.interesting_cdata
34-        except AttributeError:
35-            self.interesting = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)
36-        self.cdata_tag = tag.lower()
37-
38-    def clear_cdata_mode(self):
39-        self.interesting = _HTMLParser.interesting_normal
40-        self.cdata_tag = None
41-
42-    # Internal -- handle starttag, return end or -1 if not terminated
43-    def parse_starttag(self, i):
44-        self.__starttag_text = None
45-        endpos = self.check_for_whole_start_tag(i)
46-        if endpos < 0:
47+use_workaround = (
48+    (current_version < (2, 6, 8)) or
49+    (current_version >= (2, 7) and current_version < (2, 7, 3)) or
50+    (current_version >= (3, 0) and current_version < (3, 3))
51+    )
52+
53+if not use_workaround:
54+    HTMLParser = _HTMLParser.HTMLParser
55+else:
56+    class HTMLParser(_HTMLParser.HTMLParser):
57+        """
58+        Patched version of stdlib's HTMLParser with patch from:
59+        http://bugs.python.org/issue670664
60+        """
61+        def __init__(self):
62+            _HTMLParser.HTMLParser.__init__(self)
63+            self.cdata_tag = None
64+
65+        def set_cdata_mode(self, tag):
66+            try:
67+                self.interesting = _HTMLParser.interesting_cdata
68+            except AttributeError:
69+                self.interesting = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)
70+            self.cdata_tag = tag.lower()
71+
72+        def clear_cdata_mode(self):
73+            self.interesting = _HTMLParser.interesting_normal
74+            self.cdata_tag = None
75+
76+        # Internal -- handle starttag, return end or -1 if not terminated
77+        def parse_starttag(self, i):
78+            self.__starttag_text = None
79+            endpos = self.check_for_whole_start_tag(i)
80+            if endpos < 0:
81+                return endpos
82+            rawdata = self.rawdata
83+            self.__starttag_text = rawdata[i:endpos]
84+
85+            # Now parse the data between i+1 and j into a tag and attrs
86+            attrs = []
87+            match = _HTMLParser.tagfind.match(rawdata, i + 1)
88+            assert match, 'unexpected call to parse_starttag()'
89+            k = match.end()
90+            self.lasttag = tag = rawdata[i + 1:k].lower()
91+
92+            while k < endpos:
93+                m = _HTMLParser.attrfind.match(rawdata, k)
94+                if not m:
95+                    break
96+                attrname, rest, attrvalue = m.group(1, 2, 3)
97+                if not rest:
98+                    attrvalue = None
99+                elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
100+                     attrvalue[:1] == '"' == attrvalue[-1:]:
101+                    attrvalue = attrvalue[1:-1]
102+                    attrvalue = self.unescape(attrvalue)
103+                attrs.append((attrname.lower(), attrvalue))
104+                k = m.end()
105+
106+            end = rawdata[k:endpos].strip()
107+            if end not in (">", "/>"):
108+                lineno, offset = self.getpos()
109+                if "\n" in self.__starttag_text:
110+                    lineno = lineno + self.__starttag_text.count("\n")
111+                    offset = len(self.__starttag_text) \
112+                             - self.__starttag_text.rfind("\n")
113+                else:
114+                    offset = offset + len(self.__starttag_text)
115+                self.error("junk characters in start tag: %r"
116+                           % (rawdata[k:endpos][:20],))
117+            if end.endswith('/>'):
118+                # XHTML-style empty tag: <span attr="value" />
119+                self.handle_startendtag(tag, attrs)
120+            else:
121+                self.handle_starttag(tag, attrs)
122+                if tag in self.CDATA_CONTENT_ELEMENTS:
123+                    self.set_cdata_mode(tag) # <--------------------------- Changed
124             return endpos
125-        rawdata = self.rawdata
126-        self.__starttag_text = rawdata[i:endpos]
127 
128-        # Now parse the data between i+1 and j into a tag and attrs
129-        attrs = []
130-        match = _HTMLParser.tagfind.match(rawdata, i + 1)
131-        assert match, 'unexpected call to parse_starttag()'
132-        k = match.end()
133-        self.lasttag = tag = rawdata[i + 1:k].lower()
134-
135-        while k < endpos:
136-            m = _HTMLParser.attrfind.match(rawdata, k)
137-            if not m:
138-                break
139-            attrname, rest, attrvalue = m.group(1, 2, 3)
140-            if not rest:
141-                attrvalue = None
142-            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
143-                 attrvalue[:1] == '"' == attrvalue[-1:]:
144-                attrvalue = attrvalue[1:-1]
145-                attrvalue = self.unescape(attrvalue)
146-            attrs.append((attrname.lower(), attrvalue))
147-            k = m.end()
148-
149-        end = rawdata[k:endpos].strip()
150-        if end not in (">", "/>"):
151-            lineno, offset = self.getpos()
152-            if "\n" in self.__starttag_text:
153-                lineno = lineno + self.__starttag_text.count("\n")
154-                offset = len(self.__starttag_text) \
155-                         - self.__starttag_text.rfind("\n")
156-            else:
157-                offset = offset + len(self.__starttag_text)
158-            self.error("junk characters in start tag: %r"
159-                       % (rawdata[k:endpos][:20],))
160-        if end.endswith('/>'):
161-            # XHTML-style empty tag: <span attr="value" />
162-            self.handle_startendtag(tag, attrs)
163-        else:
164-            self.handle_starttag(tag, attrs)
165-            if tag in self.CDATA_CONTENT_ELEMENTS:
166-                self.set_cdata_mode(tag) # <--------------------------- Changed
167-        return endpos
168-
169-    # Internal -- parse endtag, return end or -1 if incomplete
170-    def parse_endtag(self, i):
171-        rawdata = self.rawdata
172-        assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
173-        match = _HTMLParser.endendtag.search(rawdata, i + 1) # >
174-        if not match:
175-            return -1
176-        j = match.end()
177-        match = _HTMLParser.endtagfind.match(rawdata, i) # </ + tag + >
178-        if not match:
179-            if self.cdata_tag is not None: # *** add ***
180-                self.handle_data(rawdata[i:j]) # *** add ***
181-                return j # *** add ***
182-            self.error("bad end tag: %r" % (rawdata[i:j],))
183-        # --- changed start ---------------------------------------------------
184-        tag = match.group(1).strip()
185-        if self.cdata_tag is not None:
186-            if tag.lower() != self.cdata_tag:
187-                self.handle_data(rawdata[i:j])
188-                return j
189-        # --- changed end -----------------------------------------------------
190-        self.handle_endtag(tag.lower())
191-        self.clear_cdata_mode()
192-        return j
193+        # Internal -- parse endtag, return end or -1 if incomplete
194+        def parse_endtag(self, i):
195+            rawdata = self.rawdata
196+            assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
197+            match = _HTMLParser.endendtag.search(rawdata, i + 1) # >
198+            if not match:
199+                return -1
200+            j = match.end()
201+            match = _HTMLParser.endtagfind.match(rawdata, i) # </ + tag + >
202+            if not match:
203+                if self.cdata_tag is not None: # *** add ***
204+                    self.handle_data(rawdata[i:j]) # *** add ***
205+                    return j # *** add ***
206+                self.error("bad end tag: %r" % (rawdata[i:j],))
207+            # --- changed start ---------------------------------------------------
208+            tag = match.group(1).strip()
209+            if self.cdata_tag is not None:
210+                if tag.lower() != self.cdata_tag:
211+                    self.handle_data(rawdata[i:j])
212+                    return j
213+            # --- changed end -----------------------------------------------------
214+            self.handle_endtag(tag.lower())
215+            self.clear_cdata_mode()
216+            return j