Django

Code

Ticket #6965: faster_urlize.diff

File faster_urlize.diff, 4.9 kB (added by floguy, 9 months ago)

Makes urlize() about 50x faster. Tests still pass.

  • django/utils/html.py

    old new  
    1111# Configuration for urlize() function. 
    1212LEADING_PUNCTUATION  = ['(', '<', '&lt;'] 
    1313TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;'] 
    14  
     14url_re = re.compile(r'[A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?') 
     15email_re = re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.I) 
    1516# List of possible strings used for bullets in bulleted lists. 
    1617DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;'] 
    1718 
     
    7273    return unencoded_ampersands_re.sub('&amp;', force_unicode(value)) 
    7374fix_ampersands = allow_lazy(fix_ampersands, unicode) 
    7475 
     76def curried_repl(trim_url_limit, nofollow, email=False): 
     77    """ 
     78    Curries a callback function for replacing emails or urls in text. 
     79    """ 
     80    def repl(match): 
     81        url_snippet = match.group() 
     82        if url_snippet[-1] in TRAILING_PUNCTUATION: 
     83            trailing_punctuation = url_snippet[-1] 
     84            url_snippet = url_snippet[:-1] 
     85        else: 
     86            trailing_punctuation = False 
     87        if url_snippet[0] in LEADING_PUNCTUATION: 
     88            leading_punctuation = url_snippet[0] 
     89            url_snippet = url_snippet[1:] 
     90        else: 
     91            leading_punctuation = False 
     92        if trim_url_limit is not None and len(url_snippet) > trim_url_limit: 
     93            display_url = '%s...' % url_snippet[:max(0, trim_url_limit - 3)] 
     94        else: 
     95            display_url = url_snippet 
     96        return u'%s<a href="%s%s"%s>%s</a>%s' % (leading_punctuation or '', email and 'mailto:' or '', url_snippet, nofollow and not '@' in url_snippet and ' rel="nofollow"' or '', display_url, trailing_punctuation or '') 
     97    return repl 
     98 
    7599def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): 
    76100    """ 
    77101    Converts any URLs in text into clickable links. 
     
    86110    If nofollow is True, the URLs in link text will get a rel="nofollow" 
    87111    attribute. 
    88112    """ 
    89     if autoescape: 
    90         trim_url = lambda x, limit=trim_url_limit: conditional_escape(limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x) 
     113    if autoescape and not isinstance(text, SafeData): 
     114        text = escape(text) 
    91115    else: 
    92         trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x 
    93     safe_input = isinstance(text, SafeData) 
    94     words = word_split_re.split(force_unicode(text)) 
    95     nofollow_attr = nofollow and ' rel="nofollow"' or '' 
    96     for i, word in enumerate(words): 
    97         match = punctuation_re.match(word) 
    98         if match: 
    99             lead, middle, trail = match.groups() 
    100             if safe_input: 
    101                 middle = mark_safe(middle) 
    102             if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \ 
    103                     len(middle) > 0 and middle[0] in string.ascii_letters + string.digits and \ 
    104                     (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): 
    105                 middle = 'http://%s' % middle 
    106             if middle.startswith('http://') or middle.startswith('https://'): 
    107                 url = urlquote(middle, safe='/&=:;#?+*') 
    108                 if autoescape and not safe_input: 
    109                     url = escape(url) 
    110                 trimmed_url = trim_url(middle) 
    111                 middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr, 
    112                         trimmed_url) 
    113             elif '@' in middle and not middle.startswith('www.') and \ 
    114                       not ':' in middle and simple_email_re.match(middle): 
    115                 if autoescape: 
    116                     middle = conditional_escape(middle) 
    117                 middle = '<a href="mailto:%s">%s</a>' % (middle, middle) 
    118             if lead + middle + trail != word: 
    119                 if autoescape and not safe_input: 
    120                     lead, trail = escape(lead), escape(trail) 
    121                 words[i] = mark_safe('%s%s%s' % (lead, middle, trail)) 
    122             elif autoescape and not safe_input: 
    123                 words[i] = escape(word) 
    124         elif safe_input: 
    125             words[i] = mark_safe(word) 
    126         elif autoescape: 
    127             words[i] = escape(word) 
    128     return u''.join(words) 
     116        text = mark_safe(text) 
     117    # Check if there are any links in the text at all using Python's built-in 
     118    # fast substring search, and short-circuit the algorithm if there are 
     119    # no links. 
     120    if "http://" in text or "https://" in text or "www." in text: 
     121        text = url_re.sub(curried_repl(trim_url_limit, nofollow), text) 
     122    if "@" in text: 
     123        text = email_re.sub(curried_repl(trim_url_limit, nofollow, email=True), text) 
     124    return text 
    129125urlize = allow_lazy(urlize, unicode) 
    130126 
    131127def clean_html(text):