class LimitingHTMLParser(html5lib.HTMLParser):
    """html5lib parser that truncates the text content of a document.

    The parser counts text as it streams through the tokenizer and, once
    ``length`` characters (or ``length`` words when ``words`` is true) have
    been seen, strips the remainder of the current text token and stops
    emitting further tokens. Markup stays well formed because the tree
    builder closes any still-open elements.
    """

    def __init__(self, length, words=False):
        # Maximum number of characters (or words) of text to keep.
        self.length = length
        # Count whitespace-separated words instead of characters when true.
        self.words = words
        self._counter = 0
        self._limit_reached = False
        super().__init__()

    def reset(self):
        """Reset truncation state along with the base parser state."""
        self._counter = 0
        self._limit_reached = False
        super().reset()

    def normalizedTokens(self):
        """Yield tokenizer tokens, truncating text once the limit is hit."""
        types = html5lib.constants.tokenTypes
        character_type = types["Characters"]
        space_type = types["SpaceCharacters"]
        for token in self.tokenizer:
            if token["type"] in {character_type, space_type}:
                if self.words:
                    if token["type"] == space_type:
                        # Whitespace separates words but isn't counted.
                        # ``continue`` is required here: without it the
                        # token fell through and was yielded a second
                        # time at the bottom of this branch.
                        yield token
                        continue
                    words = token["data"].split()  # Very simple word counter
                    self._counter += len(words)
                    if self._counter > self.length:  # Strip extra words
                        words = words[:-(self._counter - self.length)]
                        token["data"] = " ".join(words)
                        # Truncation happened; nothing after this token
                        # should be emitted.
                        self._limit_reached = True
                else:
                    self._counter += len(token["data"])
                    if self._counter > self.length:  # Strip off extra data
                        token["data"] = token["data"][:-(self._counter - self.length)]
                        self._limit_reached = True
                yield token
            else:
                yield self.normalizeToken(token)

            # Stop consuming the tokenizer once the limit has been
            # reached (previously this flag was never set, making the
            # check dead code).
            if self._limit_reached:
                return
| 66 | |
| 67 | |
# NOTE(review): this is the body of a truncation method whose signature is
# not visible here. It reads ``text``, ``truncate``, ``length``, ``words``
# and ``truncate_len`` from the enclosing scope, plus module-level compiled
# patterns ``re_words``, ``re_chars`` and ``re_tag`` — confirm against the
# enclosing definition.
# A word limit of zero (or less) trivially yields the empty string.
if words and length <= 0:
    return ''

# HTML4 elements that are always empty — they never appear in the open-tag
# stack and never need a closing tag appended.
html4_singlets = (
    'br', 'col', 'link', 'base', 'img',
    'param', 'area', 'hr', 'input'
)

# Count non-HTML chars/words and keep note of open tags
pos = 0            # Search cursor into ``text``.
end_text_pos = 0   # Position in ``text`` where the truncated output ends.
current_len = 0    # Chars/words of visible (non-markup) content seen so far.
open_tags = []     # Stack of open tag names, most recent first.

# ``re_words`` matches a word (or a tag) per step; ``re_chars`` a character
# (or a tag) — group(1) is set only for actual visible content.
regex = re_words if words else re_chars

while current_len <= length:
    m = regex.search(text, pos)
    if not m:
        # Checked through whole string
        break
    pos = m.end(0)
    if m.group(1):
        # It's an actual non-HTML word or char
        current_len += 1
        # NOTE(review): the loop bound uses ``length`` but the cut point
        # uses ``truncate_len`` — presumably truncate_len accounts for the
        # truncation suffix; confirm they agree in the enclosing method.
        if current_len == truncate_len:
            end_text_pos = pos
        continue
    # Check for tag
    tag = re_tag.match(m.group(0))
    if not tag or current_len >= truncate_len:
        # Don't worry about non tags or tags after our truncate point
        continue
    closing_tag, tagname, self_closing = tag.groups()
    # Element names are always case-insensitive
    tagname = tagname.lower()
    if self_closing or tagname in html4_singlets:
        # Void/self-closing elements never need balancing — skip.
        pass
    elif closing_tag:
        # Check for match in open tags list
        try:
            i = open_tags.index(tagname)
        except ValueError:
            # Stray close tag with no matching opener — ignore it.
            pass
        else:
            # SGML: An end tag closes, back to the matching start tag,
            # all unclosed intervening start tags with omitted end tags
            open_tags = open_tags[i + 1:]
    else:
        # Add it to the start of the open tags list
        open_tags.insert(0, tagname)

# Entire text fit within the limit — return it untouched.
if current_len <= length:
    return text
# Otherwise cut at the recorded position and append the truncation suffix
# (e.g. an ellipsis) if one is configured.
out = text[:end_text_pos]
truncate_text = self.add_truncation_text('', truncate)
if truncate_text:
    out += truncate_text
# Close any tags still open
for tag in open_tags:
    out += '</%s>' % tag
# Return string
return out
# Tail of a truncation routine whose ``def`` is outside this view: parse
# ``text`` through the limiting parser (which drops content beyond the
# limit) and re-serialize the resulting tree back to an HTML string.
tree = LimitingHTMLParser(length, words).parse(text)
return html5lib.serializer.serialize(tree)