Code

Ticket #6965: faster_urlize.diff

File faster_urlize.diff, 4.9 KB (added by floguy, 6 years ago)

Makes urlize() about 50x faster. Tests still pass.

Line 
1Index: django/utils/html.py
2===================================================================
3--- django/utils/html.py        (revision 7374)
4+++ django/utils/html.py        (working copy)
5@@ -11,7 +11,8 @@
6 # Configuration for urlize() function.
7 LEADING_PUNCTUATION  = ['(', '<', '&lt;']
8 TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
9-
10+url_re = re.compile(r'[A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?')
11+email_re = re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.I)
12 # List of possible strings used for bullets in bulleted lists.
13 DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
14 
15@@ -72,6 +73,29 @@
16     return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
17 fix_ampersands = allow_lazy(fix_ampersands, unicode)
18 
19+def curried_repl(trim_url_limit, nofollow, email=False):
20+    """
21+    Curries a callback function for replacing emails or urls in text.
22+    """
23+    def repl(match):
24+        url_snippet = match.group()
25+        if url_snippet[-1] in TRAILING_PUNCTUATION:
26+            trailing_punctuation = url_snippet[-1]
27+            url_snippet = url_snippet[:-1]
28+        else:
29+            trailing_punctuation = False
30+        if url_snippet[0] in LEADING_PUNCTUATION:
31+            leading_punctuation = url_snippet[0]
32+            url_snippet = url_snippet[1:]
33+        else:
34+            leading_punctuation = False
35+        if trim_url_limit is not None and len(url_snippet) > trim_url_limit:
36+            display_url = '%s...' % url_snippet[:max(0, trim_url_limit - 3)]
37+        else:
38+            display_url = url_snippet
39+        return u'%s<a href="%s%s"%s>%s</a>%s' % (leading_punctuation or '', email and 'mailto:' or '', url_snippet, nofollow and not '@' in url_snippet and ' rel="nofollow"' or '', display_url, trailing_punctuation or '')
40+    return repl
41+
42 def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
43     """
44     Converts any URLs in text into clickable links.
45@@ -86,46 +110,18 @@
46     If nofollow is True, the URLs in link text will get a rel="nofollow"
47     attribute.
48     """
49-    if autoescape:
50-        trim_url = lambda x, limit=trim_url_limit: conditional_escape(limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x)
51+    if autoescape and not isinstance(text, SafeData):
52+        text = escape(text)
53     else:
54-        trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x
55-    safe_input = isinstance(text, SafeData)
56-    words = word_split_re.split(force_unicode(text))
57-    nofollow_attr = nofollow and ' rel="nofollow"' or ''
58-    for i, word in enumerate(words):
59-        match = punctuation_re.match(word)
60-        if match:
61-            lead, middle, trail = match.groups()
62-            if safe_input:
63-                middle = mark_safe(middle)
64-            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
65-                    len(middle) > 0 and middle[0] in string.ascii_letters + string.digits and \
66-                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
67-                middle = 'http://%s' % middle
68-            if middle.startswith('http://') or middle.startswith('https://'):
69-                url = urlquote(middle, safe='/&=:;#?+*')
70-                if autoescape and not safe_input:
71-                    url = escape(url)
72-                trimmed_url = trim_url(middle)
73-                middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr,
74-                        trimmed_url)
75-            elif '@' in middle and not middle.startswith('www.') and \
76-                      not ':' in middle and simple_email_re.match(middle):
77-                if autoescape:
78-                    middle = conditional_escape(middle)
79-                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
80-            if lead + middle + trail != word:
81-                if autoescape and not safe_input:
82-                    lead, trail = escape(lead), escape(trail)
83-                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
84-            elif autoescape and not safe_input:
85-                words[i] = escape(word)
86-        elif safe_input:
87-            words[i] = mark_safe(word)
88-        elif autoescape:
89-            words[i] = escape(word)
90-    return u''.join(words)
91+        text = mark_safe(text)
92+    # Check if there are any links in the text at all using Python's built-in
93+    # fast substring search, and short-circuit the algorithm if there are
94+    # no links.
95+    if "http://" in text or "https://" in text or "www." in text:
96+        text = url_re.sub(curried_repl(trim_url_limit, nofollow), text)
97+    if "@" in text:
98+        text = email_re.sub(curried_repl(trim_url_limit, nofollow, email=True), text)
99+    return text
100 urlize = allow_lazy(urlize, unicode)
101 
102 def clean_html(text):