Ticket #2027: better_truncatewords_html.2.patch

File better_truncatewords_html.2.patch, 4.6 KB (added by Chris Beaven, 18 years ago)

Teensy change ensuring that self-closing tags are properly identified

  • django/template/defaultfilters.py

     
    101101        value = str(value)
    102102    return truncate_words(value, length)
    103103
     104def truncatewords_html(value, arg):
     105    """
     106    Truncates HTML after a certain number of words
     107
     108    Argument: Number of words to truncate after
     109    """
     110    from django.utils.text import truncate_html_words
     111    try:
     112        length = int(arg)
     113    except ValueError: # invalid literal for int()
     114        return value # Fail silently.
     115    if not isinstance(value, basestring):
     116        value = str(value)
     117    return truncate_html_words(value, length)
     118
    def upper(value):
        """Return ``value`` with every character converted to uppercase."""
        uppercased = value.upper()
        return uppercased
     
    481496register.filter(timesince)
    482497register.filter(title)
    483498register.filter(truncatewords)
     499register.filter(truncatewords_html)
    484500register.filter(unordered_list)
    485501register.filter(upper)
    486502register.filter(urlencode)
  • django/utils/text.py

     
    3030            words.append('...')
    3131    return ' '.join(words)
    3232
     33def truncate_html_words(s, num):
     34    """
     35    Truncates html to a certain number of words (not counting tags and comments).
     36    Closes opened tags if they were correctly closed in the given html.
     37    """
     38    length = int(num)
     39    if length <= 0:
     40        return ''
     41    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
     42    # Set up regular expressions
     43    re_words = re.compile(r'&.*?;|<.*?>|([A-Za-z0-9][\w-]*)')
     44    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
     45    # Count non-HTML words and keep note of open tags
     46    pos = 0
     47    ellipsis_pos = 0
     48    words = 0
     49    open_tags = []
     50    while words <= length:
     51        m = re_words.search(s, pos)
     52        if not m:
     53            # Checked through whole string
     54            break
     55        pos = m.end(0)
     56        if m.group(1):
     57            # It's an actual non-HTML word
     58            words += 1
     59            if words == length:
     60                ellipsis_pos = pos
     61            continue
     62        # Check for tag
     63        tag = re_tag.match(m.group(0))
     64        if not tag or ellipsis_pos:
     65            # Don't worry about non tags or tags after our truncate point
     66            continue
     67        closing_tag, tagname, self_closing = tag.groups()
     68        tagname = tagname.lower()  # Element names are always case-insensitive
     69        if self_closing or tagname in html4_singlets:
     70            pass
     71        elif closing_tag:
     72            # Check for match in open tags list
     73            try:
     74                i = open_tags.index(tagname)
     75            except ValueError:
     76                pass
     77            else:
     78                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
     79                open_tags = open_tags[i+1:]
     80        else:
     81            # Add it to the start of the open tags list
     82            open_tags.insert(0, tagname)
     83    if words <= length:
     84        # Don't try to close tags if we don't need to truncate
     85        return s
     86    out = s[:ellipsis_pos] + ' ...'
     87    # Close any tags still open
     88    for tag in open_tags:
     89        out += '</%s>' % tag
     90    # Return string
     91    return out
     92
    3393def get_valid_filename(s):
    3494    """
    3595    Returns the given string converted to a string that can be used for a clean
  • tests/othertests/defaultfilters.py

     
    6464>>> truncatewords('A sentence with a few words in it', 'not a number')
    6565'A sentence with a few words in it'
    6666
     67>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 0)
     68''
    6769
     70>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 2)
     71'<p>one <a href="#">two ...</a></p>'
     72
     73>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 4)
     74'<p>one <a href="#">two - three <br>four ...</a></p>'
     75
     76>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 5)
     77'<p>one <a href="#">two - three <br>four</a> five</p>'
     78
     79>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 100)
     80'<p>one <a href="#">two - three <br>four</a> five</p>'
     81
    6882>>> upper('Mixed case input')
    6983'MIXED CASE INPUT'
    7084
Back to Top