class LimitingHTMLParser(html5lib.HTMLParser):
    """html5lib parser that truncates the text content of a document.

    The parser counts text as it streams through the tokenizer and, once
    ``length`` characters (or ``length`` words when ``words`` is true) have
    been seen, strips the remainder of the current text token and stops
    emitting further tokens. Markup stays well formed because the tree
    builder closes any still-open elements.
    """

    def __init__(self, length, words=False):
        # Maximum number of characters (or words) of text to keep.
        self.length = length
        # Count whitespace-separated words instead of characters when true.
        self.words = words
        self._counter = 0
        self._limit_reached = False
        super().__init__()

    def reset(self):
        """Reset truncation state along with the base parser state."""
        self._counter = 0
        self._limit_reached = False
        super().reset()

    def normalizedTokens(self):
        """Yield tokenizer tokens, truncating text once the limit is hit."""
        types = html5lib.constants.tokenTypes
        character_type = types["Characters"]
        space_type = types["SpaceCharacters"]
        for token in self.tokenizer:
            if token["type"] in {character_type, space_type}:
                if self.words:
                    if token["type"] == space_type:
                        # Whitespace separates words but isn't counted.
                        # ``continue`` is required here: without it the
                        # token fell through and was yielded a second
                        # time at the bottom of this branch.
                        yield token
                        continue
                    words = token["data"].split()  # Very simple word counter
                    self._counter += len(words)
                    if self._counter > self.length:  # Strip extra words
                        words = words[:-(self._counter - self.length)]
                        token["data"] = " ".join(words)
                        # Truncation happened; nothing after this token
                        # should be emitted.
                        self._limit_reached = True
                else:
                    self._counter += len(token["data"])
                    if self._counter > self.length:  # Strip off extra data
                        token["data"] = token["data"][:-(self._counter - self.length)]
                        self._limit_reached = True
                yield token
            else:
                yield self.normalizeToken(token)

            # Stop consuming the tokenizer once the limit has been
            # reached (previously this flag was never set, making the
            # check dead code).
            if self._limit_reached:
                return
| 66 | |
| 67 | |
# NOTE(review): this is the body of a truncation method whose signature is
# not visible here. It reads ``text``, ``truncate``, ``length``, ``words``
# and ``truncate_len`` from the enclosing scope, plus module-level compiled
# patterns ``re_words``, ``re_chars`` and ``re_tag`` — confirm against the
# enclosing definition.
# A word limit of zero (or less) trivially yields the empty string.
if words and length <= 0:
    return ''

# HTML4 elements that are always empty — they never appear in the open-tag
# stack and never need a closing tag appended.
html4_singlets = (
    'br', 'col', 'link', 'base', 'img',
    'param', 'area', 'hr', 'input'
)

# Count non-HTML chars/words and keep note of open tags
pos = 0            # Search cursor into ``text``.
end_text_pos = 0   # Position in ``text`` where the truncated output ends.
current_len = 0    # Chars/words of visible (non-markup) content seen so far.
open_tags = []     # Stack of open tag names, most recent first.

# ``re_words`` matches a word (or a tag) per step; ``re_chars`` a character
# (or a tag) — group(1) is set only for actual visible content.
regex = re_words if words else re_chars

while current_len <= length:
    m = regex.search(text, pos)
    if not m:
        # Checked through whole string
        break
    pos = m.end(0)
    if m.group(1):
        # It's an actual non-HTML word or char
        current_len += 1
        # NOTE(review): the loop bound uses ``length`` but the cut point
        # uses ``truncate_len`` — presumably truncate_len accounts for the
        # truncation suffix; confirm they agree in the enclosing method.
        if current_len == truncate_len:
            end_text_pos = pos
        continue
    # Check for tag
    tag = re_tag.match(m.group(0))
    if not tag or current_len >= truncate_len:
        # Don't worry about non tags or tags after our truncate point
        continue
    closing_tag, tagname, self_closing = tag.groups()
    # Element names are always case-insensitive
    tagname = tagname.lower()
    if self_closing or tagname in html4_singlets:
        # Void/self-closing elements never need balancing — skip.
        pass
    elif closing_tag:
        # Check for match in open tags list
        try:
            i = open_tags.index(tagname)
        except ValueError:
            # Stray close tag with no matching opener — ignore it.
            pass
        else:
            # SGML: An end tag closes, back to the matching start tag,
            # all unclosed intervening start tags with omitted end tags
            open_tags = open_tags[i + 1:]
    else:
        # Add it to the start of the open tags list
        open_tags.insert(0, tagname)

# Entire text fit within the limit — return it untouched.
if current_len <= length:
    return text
# Otherwise cut at the recorded position and append the truncation suffix
# (e.g. an ellipsis) if one is configured.
out = text[:end_text_pos]
truncate_text = self.add_truncation_text('', truncate)
if truncate_text:
    out += truncate_text
# Close any tags still open
for tag in open_tags:
    out += '</%s>' % tag
# Return string
return out
# Tail of a truncation routine whose ``def`` is outside this view: parse
# ``text`` through the limiting parser (which drops content beyond the
# limit) and re-serialize the resulting tree back to an HTML string.
tree = LimitingHTMLParser(length, words).parse(text)
return html5lib.serializer.serialize(tree)