Code

Ticket #5418: assert_no_broken_links.diff

File assert_no_broken_links.diff, 4.9 KB (added by absoludity, 7 years ago)

Initial thoughts for feedback.

Line 
1Index: django/test/testcases.py
2===================================================================
3--- django/test/testcases.py    (revision 6122)
4+++ django/test/testcases.py    (working copy)
5@@ -11,6 +11,49 @@
6 
7 normalize_long_ints = lambda s: re.sub(r'(?<![\w])(\d+)L(?![\w])', '\\1', s)
8 
9+# anchor_external_re matches the value of the href attribute for any
10+# external anchor link (i.e. where the value of the href attribute begins
11+# with http:// or https://).
12+anchor_external_re = re.compile(
13+    r'<a'
14+    r'.+?' # anything after the opening tag up to the href
15+    r'href=(?:\'|")' # The opening of the href attribute
16+    r'(' # group the value of the href attribute
17+        r'https?://' # http:// or https://
18+        r'(?:'
19+            r'(?:[A-Z0-9-]+\.)+[A-Z]{2,6}' # domain
20+            r'|localhost' # or localhost
21+            r'|127\.0\.0\.1' # or loopback (dots escaped to match literally)
22+        r')'
23+        r'(?::\d+)?' # optional port
24+        r'(?:/?|/\S+)' # followed by either a / or nothing or /nonspacechars.html
25+    r')' # finish matching at the end of the href value.
26+    r'(?:\'|")', # The closing quote of the href attribute
27+    re.IGNORECASE
28+)
29+
30+# anchor_internal_re matches the value of the href attribute for any
31+# internal anchor link (i.e. where the value does not begin with http://
32+# or https://).
33+anchor_internal_re = re.compile(
34+    r'<a'
35+    r'.+?' # anything after the opening tag up to the href
36+    r'href=(?:\'|")' # The opening of the href attribute
37+    r'(' # match the value of the href attribute
38+    r'(?!https?://)' # make sure link doesn't start with http:// or https://
39+    r'(?:\S+)' # Match any non-whitespace (greedy: runs to the last quote before whitespace)
40+    r')' # finish matching at the end of the href value.
41+    r'(?:\'|")', # The closing quote of the href attribute
42+    re.IGNORECASE
43+)
44+
45+try:
46+    from django.conf import settings
47+    URL_VALIDATOR_USER_AGENT = settings.URL_VALIDATOR_USER_AGENT
48+except (ImportError, EnvironmentError):
49+    # It's OK if Django settings aren't configured; fall back to a default User-Agent.
50+    URL_VALIDATOR_USER_AGENT = 'Django (http://www.djangoproject.com/)'
51+
52 def to_list(value):
53     """
54     Puts value into a list if it's not already one.
55@@ -190,3 +233,58 @@
56         self.failIf(template_name in template_names,
57             (u"Template '%s' was used unexpectedly in rendering the"
58              u" response") % template_name)
59+
60+    def assertNoBrokenLinks(self, response, internal_only=True):
61+        """
62+        Asserts that every link in the response, when followed, returns a
63+        non-broken status (200, 301, 302 or 304).
64+
65+        Internal links are fetched with response.client. External links
66+        (http:// or https://) are fetched with urllib2, and only when
67+        internal_only is False.
68+
69+        Current issues/thoughts:
70+          * How to handle absolute paths to http://localhost:8000 for eg.
71+          * The re's will also find page-internal links like #header
72+          * Should we follow 302's to verify the page redirects to a 200 result?
73+          * Do we need to include other protocols for external links
74+            (currently anchor_external_re assumes href value begins with
75+            http:// or https://, but what about ftp:// etc.?)
76+        """
77+        non_broken_status_codes = (200, 301, 302, 304)
78+
79+        # Check the internal links first:
80+        internal_links = anchor_internal_re.findall(response.content)
81+
82+        for link in internal_links:
83+            link_response = response.client.get(link)
84+            self.failUnless(
85+                link_response.status_code in non_broken_status_codes,
86+                (u"Link '%(link)s' returned a status of %(status)s") % {
87+                    'link': link,
88+                    'status': link_response.status_code
89+                }
90+            )
91+
92+        # Then check the external links
93+        if not internal_only:
94+            # Imported lazily: urllib2 is only needed for external links.
95+            import urllib2
96+
97+            external_links = anchor_external_re.findall(response.content)
98+            headers = {
99+                "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
100+                "Accept-Language": "en-us,en;q=0.5",
101+                "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
102+                "Connection": "close",
103+                "User-Agent": URL_VALIDATOR_USER_AGENT,
104+            }
105+
106+            for link in external_links:
107+                try:
108+                    req = urllib2.Request(link, None, headers)
109+                    urllib2.urlopen(req).close() # only reachability matters; release the socket
110+                except ValueError:
111+                    self.fail(u'The URL %s appears to be invalid.' % link)
112+                except Exception: # urllib2.URLError, httplib.InvalidURL, etc.
113+                    self.fail(u'This URL %s appears to be a broken link.' % link)
114\ No newline at end of file