Context Navigation

Back to Ticket #6965

Ticket #6965: faster_urlize.diff

File faster_urlize.diff, 4.9 KB (added by floguy, 17 years ago)
Makes urlize about 50x faster. Tests still pass.

django/utils/html.py

 # Configuration for urlize() function.
 LEADING_PUNCTUATION  = ['(', '<', '&lt;']
 TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
+# URL pattern: a scheme (one letter followed by 1-120 letters, digits, '+',
+# '.' or '-') and a ':', then one letter, digit or '/', then 1-333 URL
+# characters or %XX escapes, then an optional '#fragment' (one alphanumeric
+# followed by up to 1000 further characters). '.' and ',' are in the allowed
+# character classes, so a match can end with either of them.
+url_re = re.compile(r'[A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?')
+# E-mail pattern, case-insensitive (re.I): local part, '@', domain, '.', and
+# a final label of 2-4 letters.
+email_re = re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.I)
 # List of possible strings used for bullets in bulleted lists.
 DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
 …
     return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
 fix_ampersands = allow_lazy(fix_ampersands, unicode)
+def curried_repl(trim_url_limit, nofollow, email=False):
+    """
+    Curries a callback function for replacing emails or urls in text.
+
+    Returns ``repl``, a function that takes a regex match object and returns
+    the matched text wrapped in an ``<a>`` tag.
+
+    ``trim_url_limit``: if not None, link text longer than this many
+    characters is cut to ``trim_url_limit - 3`` characters followed by
+    ``...``; the href itself is not trimmed.
+    ``nofollow``: if true, `` rel="nofollow"`` is added to the tag, except
+    when the matched text contains ``@``.
+    ``email``: if true, the href is prefixed with ``mailto:``.
+    """
+    def repl(match):
+        url_snippet = match.group()
+        # Peel one trailing punctuation character off the match so that it is
+        # emitted after the closing </a> instead of inside the link.
+        # NOTE(review): only the last single character is tested, so the
+        # multi-character entry '&gt;' in TRAILING_PUNCTUATION can never
+        # match here.
+        if url_snippet[-1] in TRAILING_PUNCTUATION:
+            trailing_punctuation = url_snippet[-1]
+            url_snippet = url_snippet[:-1]
+        else:
+            trailing_punctuation = False
+        # Same treatment for one leading punctuation character, which is
+        # emitted before the opening <a>.
+        # NOTE(review): '&lt;' can never equal a single character, and with
+        # url_re / email_re as defined in this patch a match cannot begin
+        # with '(' or '<' either -- confirm whether this branch is reachable.
+        if url_snippet[0] in LEADING_PUNCTUATION:
+            leading_punctuation = url_snippet[0]
+            url_snippet = url_snippet[1:]
+        else:
+            leading_punctuation = False
+        # Shorten only the visible link text; max(0, ...) keeps the slice
+        # bound non-negative when trim_url_limit is smaller than 3.
+        if trim_url_limit is not None and len(url_snippet) > trim_url_limit:
+            display_url = '%s...' % url_snippet[:max(0, trim_url_limit - 3)]
+        else:
+            display_url = url_snippet
+        # Each "x and y or ''" below yields y when x is true and '' otherwise.
+        # NOTE(review): url_snippet and display_url are interpolated without
+        # any escaping in this function -- presumably the caller has already
+        # escaped the text; verify against urlize().
+        return u'%s<a href="%s%s"%s>%s</a>%s' % (leading_punctuation or '', email and 'mailto:' or '', url_snippet, nofollow and not '@' in url_snippet and ' rel="nofollow"' or '', display_url, trailing_punctuation or '')
+    return repl
 def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
     """
     Converts any URLs in text into clickable links.
 …
     If nofollow is True, the URLs in link text will get a rel="nofollow"
     attribute.
     """
     if autoescape:
         trim_url = lambda x, limit=trim_url_limit: conditional_escape(limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x)
+    if autoescape and not isinstance(text, SafeData):
+        text = escape(text)
     else:
+        trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x
+    safe_input = isinstance(text, SafeData)
+    words = word_split_re.split(force_unicode(text))
+    nofollow_attr = nofollow and ' rel="nofollow"' or ''
+    for i, word in enumerate(words):
+        match = punctuation_re.match(word)
+        if match:
+            lead, middle, trail = match.groups()
+            if safe_input:
+                middle = mark_safe(middle)
+            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
+                    len(middle) > 0 and middle[0] in string.ascii_letters + string.digits and \
+                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
+                middle = 'http://%s' % middle
+            if middle.startswith('http://') or middle.startswith('https://'):
+                url = urlquote(middle, safe='/&=:;#?+*')
+                if autoescape and not safe_input:
+                    url = escape(url)
+                trimmed_url = trim_url(middle)
+                middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr,
+                        trimmed_url)
+            elif '@' in middle and not middle.startswith('www.') and \
+                      not ':' in middle and simple_email_re.match(middle):
+                if autoescape:
+                    middle = conditional_escape(middle)
+                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
+            if lead + middle + trail != word:
+                if autoescape and not safe_input:
+                    lead, trail = escape(lead), escape(trail)
+                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
+            elif autoescape and not safe_input:
+                words[i] = escape(word)
+        elif safe_input:
+            words[i] = mark_safe(word)
+        elif autoescape:
+            words[i] = escape(word)
+    return u''.join(words)
+        text = mark_safe(text)
+    # Check whether the text can contain any links at all using Python's
+    # built-in fast substring search, and skip the regex pass entirely if
+    # it cannot.
+    if "http://" in text or "https://" in text or "www." in text:
+        text = url_re.sub(curried_repl(trim_url_limit, nofollow), text)
+    if "@" in text:
+        text = email_re.sub(curried_repl(trim_url_limit, nofollow, email=True), text)
+    return text
 urlize = allow_lazy(urlize, unicode)
 def clean_html(text):

Download in other formats:

Original Format

Issues

Context Navigation

Ticket #6965: faster_urlize.diff

django/utils/html.py

Download in other formats:

Django Links

Learn More

Get Involved

Get Help

Follow Us

Support Us