5 | | class HTMLParser(_HTMLParser.HTMLParser): |
6 | | """ |
7 | | Patched version of stdlib's HTMLParser with patch from: |
8 | | http://bugs.python.org/issue670664 |
9 | | """ |
def __init__(self):
    """Initialise the underlying stdlib parser with no CDATA element open."""
    _HTMLParser.HTMLParser.__init__(self)
    # Lower-cased name of the element whose raw content is currently being
    # read (assigned by set_cdata_mode); None while parsing ordinary markup.
    self.cdata_tag = None
13 | | |
def set_cdata_mode(self, tag):
    """Enter CDATA mode for the element named ``tag``.

    ``self.interesting`` is pointed at the stdlib module's
    ``interesting_cdata`` pattern when the module defines one; otherwise a
    case-insensitive pattern matching the closing tag of ``tag`` is built.
    The lower-cased tag name is remembered in ``self.cdata_tag``.
    """
    try:
        pattern = _HTMLParser.interesting_cdata
    except AttributeError:
        # The module has no ready-made CDATA pattern: look for "</tag>"
        # (optional whitespace allowed around the name) instead.
        pattern = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)
    self.interesting = pattern
    self.cdata_tag = tag.lower()
20 | | |
def clear_cdata_mode(self):
    """Leave CDATA mode and go back to scanning for ordinary markup."""
    # Forget which element was open, then restore the normal scan pattern.
    self.cdata_tag = None
    self.interesting = _HTMLParser.interesting_normal
24 | | |
25 | | # Internal -- handle starttag, return end or -1 if not terminated |
26 | | def parse_starttag(self, i): |
27 | | self.__starttag_text = None |
28 | | endpos = self.check_for_whole_start_tag(i) |
29 | | if endpos < 0: |
# The patched parser is only needed on interpreters older than 2.6.8,
# on the 2.7 series before 2.7.3, and on the 3.x series before 3.3.
use_workaround = (
    current_version < (2, 6, 8)
    or (2, 7) <= current_version < (2, 7, 3)
    or (3, 0) <= current_version < (3, 3)
)
| 12 | |
| 13 | if not use_workaround: |
| 14 | HTMLParser = _HTMLParser.HTMLParser |
| 15 | else: |
| 16 | class HTMLParser(_HTMLParser.HTMLParser): |
| 17 | """ |
| 18 | Patched version of stdlib's HTMLParser with patch from: |
| 19 | http://bugs.python.org/issue670664 |
| 20 | """ |
def __init__(self):
    """Initialise the underlying stdlib parser with no CDATA element open."""
    _HTMLParser.HTMLParser.__init__(self)
    # Lower-cased name of the element whose raw content is currently being
    # read (assigned by set_cdata_mode); None while parsing ordinary markup.
    self.cdata_tag = None
| 24 | |
def set_cdata_mode(self, tag):
    """Enter CDATA mode for the element named ``tag``.

    ``self.interesting`` is pointed at the stdlib module's
    ``interesting_cdata`` pattern when the module defines one; otherwise a
    case-insensitive pattern matching the closing tag of ``tag`` is built.
    The lower-cased tag name is remembered in ``self.cdata_tag``.
    """
    try:
        pattern = _HTMLParser.interesting_cdata
    except AttributeError:
        # The module has no ready-made CDATA pattern: look for "</tag>"
        # (optional whitespace allowed around the name) instead.
        pattern = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)
    self.interesting = pattern
    self.cdata_tag = tag.lower()
| 31 | |
def clear_cdata_mode(self):
    """Leave CDATA mode and go back to scanning for ordinary markup."""
    # Forget which element was open, then restore the normal scan pattern.
    self.cdata_tag = None
    self.interesting = _HTMLParser.interesting_normal
| 35 | |
| 36 | # Internal -- handle starttag, return end or -1 if not terminated |
| 37 | def parse_starttag(self, i): |
| 38 | self.__starttag_text = None |
| 39 | endpos = self.check_for_whole_start_tag(i) |
| 40 | if endpos < 0: |
| 41 | return endpos |
| 42 | rawdata = self.rawdata |
| 43 | self.__starttag_text = rawdata[i:endpos] |
| 44 | |
| 45 | # Now parse the data between i+1 and j into a tag and attrs |
| 46 | attrs = [] |
| 47 | match = _HTMLParser.tagfind.match(rawdata, i + 1) |
| 48 | assert match, 'unexpected call to parse_starttag()' |
| 49 | k = match.end() |
| 50 | self.lasttag = tag = rawdata[i + 1:k].lower() |
| 51 | |
| 52 | while k < endpos: |
| 53 | m = _HTMLParser.attrfind.match(rawdata, k) |
| 54 | if not m: |
| 55 | break |
| 56 | attrname, rest, attrvalue = m.group(1, 2, 3) |
| 57 | if not rest: |
| 58 | attrvalue = None |
| 59 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ |
| 60 | attrvalue[:1] == '"' == attrvalue[-1:]: |
| 61 | attrvalue = attrvalue[1:-1] |
| 62 | attrvalue = self.unescape(attrvalue) |
| 63 | attrs.append((attrname.lower(), attrvalue)) |
| 64 | k = m.end() |
| 65 | |
| 66 | end = rawdata[k:endpos].strip() |
| 67 | if end not in (">", "/>"): |
| 68 | lineno, offset = self.getpos() |
| 69 | if "\n" in self.__starttag_text: |
| 70 | lineno = lineno + self.__starttag_text.count("\n") |
| 71 | offset = len(self.__starttag_text) \ |
| 72 | - self.__starttag_text.rfind("\n") |
| 73 | else: |
| 74 | offset = offset + len(self.__starttag_text) |
| 75 | self.error("junk characters in start tag: %r" |
| 76 | % (rawdata[k:endpos][:20],)) |
| 77 | if end.endswith('/>'): |
| 78 | # XHTML-style empty tag: <span attr="value" /> |
| 79 | self.handle_startendtag(tag, attrs) |
| 80 | else: |
| 81 | self.handle_starttag(tag, attrs) |
| 82 | if tag in self.CDATA_CONTENT_ELEMENTS: |
| 83 | self.set_cdata_mode(tag) # <--------------------------- Changed |
34 | | # Now parse the data between i+1 and j into a tag and attrs |
35 | | attrs = [] |
36 | | match = _HTMLParser.tagfind.match(rawdata, i + 1) |
37 | | assert match, 'unexpected call to parse_starttag()' |
38 | | k = match.end() |
39 | | self.lasttag = tag = rawdata[i + 1:k].lower() |
40 | | |
41 | | while k < endpos: |
42 | | m = _HTMLParser.attrfind.match(rawdata, k) |
43 | | if not m: |
44 | | break |
45 | | attrname, rest, attrvalue = m.group(1, 2, 3) |
46 | | if not rest: |
47 | | attrvalue = None |
48 | | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ |
49 | | attrvalue[:1] == '"' == attrvalue[-1:]: |
50 | | attrvalue = attrvalue[1:-1] |
51 | | attrvalue = self.unescape(attrvalue) |
52 | | attrs.append((attrname.lower(), attrvalue)) |
53 | | k = m.end() |
54 | | |
55 | | end = rawdata[k:endpos].strip() |
56 | | if end not in (">", "/>"): |
57 | | lineno, offset = self.getpos() |
58 | | if "\n" in self.__starttag_text: |
59 | | lineno = lineno + self.__starttag_text.count("\n") |
60 | | offset = len(self.__starttag_text) \ |
61 | | - self.__starttag_text.rfind("\n") |
62 | | else: |
63 | | offset = offset + len(self.__starttag_text) |
64 | | self.error("junk characters in start tag: %r" |
65 | | % (rawdata[k:endpos][:20],)) |
66 | | if end.endswith('/>'): |
67 | | # XHTML-style empty tag: <span attr="value" /> |
68 | | self.handle_startendtag(tag, attrs) |
69 | | else: |
70 | | self.handle_starttag(tag, attrs) |
71 | | if tag in self.CDATA_CONTENT_ELEMENTS: |
72 | | self.set_cdata_mode(tag) # <--------------------------- Changed |
73 | | return endpos |
74 | | |
75 | | # Internal -- parse endtag, return end or -1 if incomplete |
def parse_endtag(self, i):
    """Handle the end tag starting at index ``i`` of ``self.rawdata``.

    Returns the index just past the terminating ``>``, or -1 when no ``>``
    has been seen yet. While a CDATA element is open (``self.cdata_tag`` is
    set), anything other than that element's own end tag is passed to
    ``handle_data`` as literal text instead of being treated as markup.
    """
    rawdata = self.rawdata
    assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
    closer = _HTMLParser.endendtag.search(rawdata, i + 1)  # >
    if not closer:
        return -1
    j = closer.end()
    in_cdata = self.cdata_tag is not None
    found = _HTMLParser.endtagfind.match(rawdata, i)  # </ + tag + >
    if not found:
        if in_cdata:
            # Malformed "end tag" inside CDATA content: plain text.
            self.handle_data(rawdata[i:j])
            return j
        self.error("bad end tag: %r" % (rawdata[i:j],))
    tag = found.group(1).strip().lower()
    if in_cdata and tag != self.cdata_tag:
        # Well-formed end tag, but not the one closing the CDATA element.
        self.handle_data(rawdata[i:j])
        return j
    self.handle_endtag(tag)
    self.clear_cdata_mode()
    return j
| 86 | # Internal -- parse endtag, return end or -1 if incomplete |
def parse_endtag(self, i):
    """Handle the end tag starting at index ``i`` of ``self.rawdata``.

    Returns the index just past the terminating ``>``, or -1 when no ``>``
    has been seen yet. While a CDATA element is open (``self.cdata_tag`` is
    set), anything other than that element's own end tag is passed to
    ``handle_data`` as literal text instead of being treated as markup.
    """
    rawdata = self.rawdata
    assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
    closer = _HTMLParser.endendtag.search(rawdata, i + 1)  # >
    if not closer:
        return -1
    j = closer.end()
    in_cdata = self.cdata_tag is not None
    found = _HTMLParser.endtagfind.match(rawdata, i)  # </ + tag + >
    if not found:
        if in_cdata:
            # Malformed "end tag" inside CDATA content: plain text.
            self.handle_data(rawdata[i:j])
            return j
        self.error("bad end tag: %r" % (rawdata[i:j],))
    tag = found.group(1).strip().lower()
    if in_cdata and tag != self.cdata_tag:
        # Well-formed end tag, but not the one closing the CDATA element.
        self.handle_data(rawdata[i:j])
        return j
    self.handle_endtag(tag)
    self.clear_cdata_mode()
    return j