Django

Code

Ticket #5418: assert_no_broken_links.diff

File assert_no_broken_links.diff, 4.9 kB (added by absoludity, 3 years ago)

Initial thoughts for feedback.

  • django/test/testcases.py

    old new  
    1111 
    1212normalize_long_ints = lambda s: re.sub(r'(?<![\w])(\d+)L(?![\w])', '\\1', s) 
    1313 
     14# anchor_external_re matches the value of the href attribute for any 
     15# external anchor link (ie. where the value of the href attribute begins 
     16# with http:// or https://. 
     17anchor_external_re = re.compile( 
     18    r'<a' 
     19    r'.+?' # anything after the opening tag up to the href 
     20    r'href=(?:\'|")' # The opening of the href attribute 
     21    r'(' # group the value of the href attribute 
     22        r'https?://' # http:// or https:// 
     23        r'(?:' 
     24            r'(?:[A-Z0-9-]+\.)+[A-Z]{2,6}' # domain 
     25            r'|localhost' # or localhost 
     26            r'|127.0.0.1' # or loopback 
     27        r')' 
     28        r'(?::\d+)?' # optional port 
     29        r'(?:/?|/\S+)' # followed by either a / or nothing or /nonspacechars.html 
     30    r')' # finish matching at the end of the href value. 
     31    r'(?:\'|")', # The closing quote of the href attribute 
     32    re.IGNORECASE 
     33) 
     34 
     35# anchor_internal_re matches the value of the href attribute for any 
     36# internal anchor link (ie. where the value does not begin with http:// 
     37# or https://S 
     38anchor_internal_re = re.compile( 
     39    r'<a' 
     40    r'.+?' # anything after the opening tag up to the href 
     41    r'href=(?:\'|")' # The opening of the href attribute 
     42    r'(' # match the value of the href attribute 
     43    r'(?!https?://)' # make sure link doesn't start with http:// or https:// 
     44    r'(?:\S+)' # Match any non-whitespace 
     45    r')' # finish matching at the end of the href value. 
     46    r'(?:\'|")', # The closing quote of the href attribute 
     47    re.IGNORECASE 
     48) 
     49 
     50try: 
     51    from django.conf import settings 
     52    URL_VALIDATOR_USER_AGENT = settings.URL_VALIDATOR_USER_AGENT 
     53except (ImportError, EnvironmentError): 
     54    # It's OK if Django settings aren't configured. 
     55    URL_VALIDATOR_USER_AGENT = 'Django (http://www.djangoproject.com/)' 
     56 
    1457def to_list(value): 
    1558    """ 
    1659    Puts value into a list if it's not already one. 
     
    190233        self.failIf(template_name in template_names, 
    191234            (u"Template '%s' was used unexpectedly in rendering the" 
    192235             u" response") % template_name) 
     236 
    def assertNoBrokenLinks(self, response, internal_only=True):
        """
        Asserts that every anchor link found in the response content, when
        followed, returns a non-broken status (200, 301, 302 or 304).

        Internal links (href values matched by anchor_internal_re) are
        requested through ``response.client``. Page-internal fragment links
        such as ``#header`` are skipped rather than requested.

        External links (href values matched by anchor_external_re) are only
        checked when ``internal_only`` is False. They are fetched with
        urllib2, and the assertion fails if the URL is rejected as invalid
        or the request raises an error.

        Current issues/thoughts:
          * How to handle absolute paths to http://localhost:8000 for eg.
          * Should we follow 302's to verify the page redirects to a 200 result?
          * Do we need to include other protocols for external links
            (currently anchor_external_re assumes href value begins with
            http:// or https://, but what about ftp:// etc.?)
        """
        non_broken_status_codes = (200, 301, 302, 304)

        # Check the internal links first:
        for link in anchor_internal_re.findall(response.content):
            if link.startswith('#'):
                # A bare fragment points back into the page being checked;
                # there is no separate URL to request.
                continue
            link_response = response.client.get(link)
            self.failUnless(
                link_response.status_code in non_broken_status_codes,
                (u"Link '%(link)s' returned a status of %(status)s") % {
                    'link': link,
                    'status': link_response.status_code
                }
            )

        # Then check the external links
        if not internal_only:
            # Imported here so the module is only needed when external
            # links are actually checked.
            import urllib2

            external_links = anchor_external_re.findall(response.content)
            headers = {
                "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
                "Accept-Language": "en-us,en;q=0.5",
                "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
                "Connection": "close",
                "User-Agent": URL_VALIDATOR_USER_AGENT,
            }

            for link in external_links:
                try:
                    req = urllib2.Request(link, None, headers)
                    u = urllib2.urlopen(req)
                except ValueError:
                    self.fail(u'The URL %s appears to be invalid.' % link)
                except Exception: # urllib2.URLError, httplib.InvalidURL, etc.
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt and SystemExit still propagate.
                    self.fail(u'This URL %s appears to be a broken link.' % link)
                else:
                    # Only reachability matters; release the connection.
                    u.close()