Ticket #6965: faster_urlize.diff

File faster_urlize.diff, 4.9 KB (added by floguy, 16 years ago)

Makes urlize() about 50x faster. The existing tests still pass.

  • django/utils/html.py

     
    1111# Configuration for urlize() function.
    1212LEADING_PUNCTUATION  = ['(', '<', '&lt;']
    1313TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
    14 
     14url_re = re.compile(r'[A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?')
     15email_re = re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.I)
    1516# List of possible strings used for bullets in bulleted lists.
    1617DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
    1718
     
    7273    return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
    7374fix_ampersands = allow_lazy(fix_ampersands, unicode)
    7475
     76def curried_repl(trim_url_limit, nofollow, email=False):
     77    """
     78    Curries a callback function for replacing emails or urls in text.
     79    """
     80    def repl(match):
     81        url_snippet = match.group()
     82        if url_snippet[-1] in TRAILING_PUNCTUATION:
     83            trailing_punctuation = url_snippet[-1]
     84            url_snippet = url_snippet[:-1]
     85        else:
     86            trailing_punctuation = False
     87        if url_snippet[0] in LEADING_PUNCTUATION:
     88            leading_punctuation = url_snippet[0]
     89            url_snippet = url_snippet[1:]
     90        else:
     91            leading_punctuation = False
     92        if trim_url_limit is not None and len(url_snippet) > trim_url_limit:
     93            display_url = '%s...' % url_snippet[:max(0, trim_url_limit - 3)]
     94        else:
     95            display_url = url_snippet
     96        return u'%s<a href="%s%s"%s>%s</a>%s' % (leading_punctuation or '', email and 'mailto:' or '', url_snippet, nofollow and not '@' in url_snippet and ' rel="nofollow"' or '', display_url, trailing_punctuation or '')
     97    return repl
     98
    7599def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    76100    """
    77101    Converts any URLs in text into clickable links.
     
    86110    If nofollow is True, the URLs in link text will get a rel="nofollow"
    87111    attribute.
    88112    """
    89     if autoescape:
    90         trim_url = lambda x, limit=trim_url_limit: conditional_escape(limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x)
     113    if autoescape and not isinstance(text, SafeData):
     114        text = escape(text)
    91115    else:
    92         trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x
    93     safe_input = isinstance(text, SafeData)
    94     words = word_split_re.split(force_unicode(text))
    95     nofollow_attr = nofollow and ' rel="nofollow"' or ''
    96     for i, word in enumerate(words):
    97         match = punctuation_re.match(word)
    98         if match:
    99             lead, middle, trail = match.groups()
    100             if safe_input:
    101                 middle = mark_safe(middle)
    102             if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
    103                     len(middle) > 0 and middle[0] in string.ascii_letters + string.digits and \
    104                     (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
    105                 middle = 'http://%s' % middle
    106             if middle.startswith('http://') or middle.startswith('https://'):
    107                 url = urlquote(middle, safe='/&=:;#?+*')
    108                 if autoescape and not safe_input:
    109                     url = escape(url)
    110                 trimmed_url = trim_url(middle)
    111                 middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr,
    112                         trimmed_url)
    113             elif '@' in middle and not middle.startswith('www.') and \
    114                       not ':' in middle and simple_email_re.match(middle):
    115                 if autoescape:
    116                     middle = conditional_escape(middle)
    117                 middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
    118             if lead + middle + trail != word:
    119                 if autoescape and not safe_input:
    120                     lead, trail = escape(lead), escape(trail)
    121                 words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
    122             elif autoescape and not safe_input:
    123                 words[i] = escape(word)
    124         elif safe_input:
    125             words[i] = mark_safe(word)
    126         elif autoescape:
    127             words[i] = escape(word)
    128     return u''.join(words)
     116        text = mark_safe(text)
     117    # Check whether the text contains any link markers at all using Python's
     118    # fast built-in substring search, and skip the regex substitution
     119    # entirely if it contains none.
     120    if "http://" in text or "https://" in text or "www." in text:
     121        text = url_re.sub(curried_repl(trim_url_limit, nofollow), text)
     122    if "@" in text:
     123        text = email_re.sub(curried_repl(trim_url_limit, nofollow, email=True), text)
     124    return text
    129125urlize = allow_lazy(urlize, unicode)
    130126
    131127def clean_html(text):
Back to Top