Ticket #5418: assert_no_broken_links.diff

File assert_no_broken_links.diff, 4.9 KB (added by absoludity, 8 years ago)

Initial thoughts for feedback.

  • django/test/testcases.py

     
    1111
    1212normalize_long_ints = lambda s: re.sub(r'(?<![\w])(\d+)L(?![\w])', '\\1', s)
    1313
     14# anchor_external_re matches the value of the href attribute for any
     15# external anchor link (ie. where the value of the href attribute begins
     16# with http:// or https://.
     17anchor_external_re = re.compile(
     18    r'<a'
     19    r'.+?' # anything after the opening tag up to the href
     20    r'href=(?:\'|")' # The opening of the href attribute
     21    r'(' # group the value of the href attribute
     22        r'https?://' # http:// or https://
     23        r'(?:'
     24            r'(?:[A-Z0-9-]+\.)+[A-Z]{2,6}' # domain
     25            r'|localhost' # or localhost
     26            r'|127.0.0.1' # or loopback
     27        r')'
     28        r'(?::\d+)?' # optional port
     29        r'(?:/?|/\S+)' # followed by either a / or nothing or /nonspacechars.html
     30    r')' # finish matching at the end of the href value.
     31    r'(?:\'|")', # The closing quote of the href attribute
     32    re.IGNORECASE
     33)
     34
     35# anchor_internal_re matches the value of the href attribute for any
     36# internal anchor link (ie. where the value does not begin with http://
     37# or https://S
     38anchor_internal_re = re.compile(
     39    r'<a'
     40    r'.+?' # anything after the opening tag up to the href
     41    r'href=(?:\'|")' # The opening of the href attribute
     42    r'(' # match the value of the href attribute
     43    r'(?!https?://)' # make sure link doesn't start with http:// or https://
     44    r'(?:\S+)' # Match any non-whitespace
     45    r')' # finish matching at the end of the href value.
     46    r'(?:\'|")', # The closing quote of the href attribute
     47    re.IGNORECASE
     48)
     49
     50try:
     51    from django.conf import settings
     52    URL_VALIDATOR_USER_AGENT = settings.URL_VALIDATOR_USER_AGENT
     53except (ImportError, EnvironmentError):
     54    # It's OK if Django settings aren't configured.
     55    URL_VALIDATOR_USER_AGENT = 'Django (http://www.djangoproject.com/)'
     56
    1457def to_list(value):
    1558    """
    1659    Puts value into a list if it's not already one.
     
    190233        self.failIf(template_name in template_names,
    191234            (u"Template '%s' was used unexpectedly in rendering the"
    192235             u" response") % template_name)
     236
    def assertNoBrokenLinks(self, response, internal_only=True):
        """
        Asserts that none of the anchor links within the response are broken.

        Internal links (href values not starting with http:// or https://)
        are fetched with ``response.client.get()`` and must return one of
        the status codes 200, 301, 302 or 304.

        If ``internal_only`` is False, external links (href values starting
        with http:// or https://) are additionally opened with urllib2; any
        exception raised while opening one fails the test.

        ``response`` must provide ``content`` (the markup to scan) and
        ``client`` (used to request the internal links).

        Current issues/thoughts:
          * How to handle absolute paths to http://localhost:8000 for eg.
          * The re's will also find page-internal links like #header
          * Should we follow 302's to verify the page redirects to a 200 result?
          * Do we need to include other protocols for external links
            (currently anchor_external_re assumes href value begins with
            http:// or https://, but what about ftp:// etc.?)
        """
        non_broken_status_codes = (200, 301, 302, 304)

        # Check the internal links first:
        internal_links = anchor_internal_re.findall(response.content)

        for link in internal_links:
            link_response = response.client.get(link)
            self.failUnless(
                link_response.status_code in non_broken_status_codes,
                (u"Link '%(link)s' returned a status of %(status)s") % {
                    'link': link,
                    'status': link_response.status_code
                }
            )

        # Then check the external links
        if not internal_only:
            # Imported lazily so urllib2 is only needed when external links
            # are actually checked.
            import urllib2

            external_links = anchor_external_re.findall(response.content)
            headers = {
                "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
                "Accept-Language": "en-us,en;q=0.5",
                "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
                "Connection": "close",
                "User-Agent": URL_VALIDATOR_USER_AGENT,
            }

            for link in external_links:
                try:
                    req = urllib2.Request(link, None, headers)
                    u = urllib2.urlopen(req)
                except ValueError:
                    self.fail(u'The URL %s appears to be invalid.' % link)
                except Exception: # urllib2.URLError, httplib.InvalidURL, etc.
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt/SystemExit are not reported as a
                    # broken link.
                    self.fail(u'This URL %s appears to be a broken link.' % link)
                else:
                    # The link opened fine; release the connection.
                    u.close()
     291 No newline at end of file
Back to Top