Code

Ticket #2027: better_truncatewords_html.2.patch

File better_truncatewords_html.2.patch, 4.6 KB (added by SmileyChris, 8 years ago)

Teensy change ensuring that self-closing tags are properly identified

  • django/template/defaultfilters.py

     
    101101        value = str(value) 
    102102    return truncate_words(value, length) 
    103103 
     104def truncatewords_html(value, arg): 
     105    """ 
     106    Truncates HTML after a certain number of words 
     107 
     108    Argument: Number of words to truncate after 
     109    """ 
     110    from django.utils.text import truncate_html_words 
     111    try: 
     112        length = int(arg) 
     113    except ValueError: # invalid literal for int() 
     114        return value # Fail silently. 
     115    if not isinstance(value, basestring): 
     116        value = str(value) 
     117    return truncate_html_words(value, length) 
     118 
    def upper(value):
        """Converts a string into all uppercase"""
        uppercased = value.upper()
        return uppercased
     
    481496register.filter(timesince) 
    482497register.filter(title) 
    483498register.filter(truncatewords) 
     499register.filter(truncatewords_html) 
    484500register.filter(unordered_list) 
    485501register.filter(upper) 
    486502register.filter(urlencode) 
  • django/utils/text.py

     
    3030            words.append('...') 
    3131    return ' '.join(words) 
    3232 
     33def truncate_html_words(s, num): 
     34    """ 
     35    Truncates html to a certain number of words (not counting tags and comments). 
     36    Closes opened tags if they were correctly closed in the given html. 
     37    """ 
     38    length = int(num) 
     39    if length <= 0: 
     40        return '' 
     41    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input') 
     42    # Set up regular expressions 
     43    re_words = re.compile(r'&.*?;|<.*?>|([A-Za-z0-9][\w-]*)') 
     44    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>') 
     45    # Count non-HTML words and keep note of open tags 
     46    pos = 0 
     47    ellipsis_pos = 0 
     48    words = 0 
     49    open_tags = [] 
     50    while words <= length: 
     51        m = re_words.search(s, pos) 
     52        if not m: 
     53            # Checked through whole string 
     54            break 
     55        pos = m.end(0) 
     56        if m.group(1): 
     57            # It's an actual non-HTML word 
     58            words += 1 
     59            if words == length: 
     60                ellipsis_pos = pos 
     61            continue 
     62        # Check for tag 
     63        tag = re_tag.match(m.group(0)) 
     64        if not tag or ellipsis_pos: 
     65            # Don't worry about non tags or tags after our truncate point 
     66            continue 
     67        closing_tag, tagname, self_closing = tag.groups() 
     68        tagname = tagname.lower()  # Element names are always case-insensitive 
     69        if self_closing or tagname in html4_singlets: 
     70            pass 
     71        elif closing_tag: 
     72            # Check for match in open tags list 
     73            try: 
     74                i = open_tags.index(tagname) 
     75            except ValueError: 
     76                pass 
     77            else: 
     78                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags 
     79                open_tags = open_tags[i+1:] 
     80        else: 
     81            # Add it to the start of the open tags list 
     82            open_tags.insert(0, tagname) 
     83    if words <= length: 
     84        # Don't try to close tags if we don't need to truncate 
     85        return s 
     86    out = s[:ellipsis_pos] + ' ...' 
     87    # Close any tags still open 
     88    for tag in open_tags: 
     89        out += '</%s>' % tag 
     90    # Return string 
     91    return out 
     92 
    3393def get_valid_filename(s): 
    3494    """ 
    3595    Returns the given string converted to a string that can be used for a clean 
  • tests/othertests/defaultfilters.py

     
    6464>>> truncatewords('A sentence with a few words in it', 'not a number') 
    6565'A sentence with a few words in it' 
    6666 
     67>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 0) 
     68'' 
    6769 
     70>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 2) 
     71'<p>one <a href="#">two ...</a></p>' 
     72 
     73>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 4) 
     74'<p>one <a href="#">two - three <br>four ...</a></p>' 
     75 
     76>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 5) 
     77'<p>one <a href="#">two - three <br>four</a> five</p>' 
     78 
     79>>> truncatewords_html('<p>one <a href="#">two - three <br>four</a> five</p>', 100) 
     80'<p>one <a href="#">two - three <br>four</a> five</p>' 
     81 
    6882>>> upper('Mixed case input') 
    6983'MIXED CASE INPUT' 
    7084