# anchor_external_re matches the value of the href attribute for any
# external anchor link (i.e. where the value of the href attribute begins
# with http:// or https://).
#
# The pattern has a single capturing group (the href value), so findall()
# returns a flat list of URL strings.
anchor_external_re = re.compile(
    r'<a'
    r'.+?'                          # anything after the opening tag up to the href
    r'href=(?:\'|")'                # the opening quote of the href attribute
    r'('                            # group the value of the href attribute
    r'https?://'                    # http:// or https://
    r'(?:'
    r'(?:[A-Z0-9-]+\.)+[A-Z]{2,6}'  # domain
    r'|localhost'                   # or localhost
    r'|127\.0\.0\.1'                # or loopback; the dots are escaped so they
                                    # only match literal dots
    r')'
    r'(?::\d+)?'                    # optional port
    r'(?:/\S*?)?'                   # optional path: nothing, "/" or "/nonspacechars.html".
                                    # Lazy, so it stops at the first quote that
                                    # actually ends the attribute.
    r')'                            # finish matching at the end of the href value
    r'(?:\'|")'                     # the closing quote of the href attribute
    r'(?=[\s>]|$)',                 # ...which must be followed by whitespace, the end
                                    # of the tag or the end of the input; this keeps
                                    # quotes inside the URL or in the link text from
                                    # being mistaken for the closing quote
    re.IGNORECASE
)
| | 34 | |
# anchor_internal_re matches the value of the href attribute for any
# internal anchor link (i.e. where the value does not begin with http://
# or https://).
#
# The pattern has a single capturing group (the href value), so findall()
# returns a flat list of strings. Note that anything that is not an
# http(s) URL is captured, including page-internal links such as #header.
anchor_internal_re = re.compile(
    r'<a'
    r'.+?'              # anything after the opening tag up to the href
    r'href=(?:\'|")'    # the opening quote of the href attribute
    r'('                # group the value of the href attribute
    r'(?!https?://)'    # make sure link doesn't start with http:// or https://
    r'\S+?'             # any non-whitespace; lazy, so it stops at the first
                        # quote that actually ends the attribute
    r')'                # finish matching at the end of the href value
    r'(?:\'|")'         # the closing quote of the href attribute
    r'(?=[\s>]|$)',     # ...which must be followed by whitespace, the end of
                        # the tag or the end of the input; this keeps quotes
                        # inside the value or in the link text from being
                        # mistaken for the closing quote
    re.IGNORECASE
)
| | 49 | |
# User agent sent when fetching external links. Start from the built-in
# default and override it from the Django settings when they are usable.
URL_VALIDATOR_USER_AGENT = 'Django (http://www.djangoproject.com/)'
try:
    from django.conf import settings
    from django.core.exceptions import ImproperlyConfigured
except ImportError:
    # Django itself cannot be imported; keep the default.
    pass
else:
    try:
        # getattr() with a default keeps the fallback when the settings
        # object does not define URL_VALIDATOR_USER_AGENT at all.
        URL_VALIDATOR_USER_AGENT = getattr(
            settings, 'URL_VALIDATOR_USER_AGENT', URL_VALIDATOR_USER_AGENT
        )
    except (ImportError, EnvironmentError, ImproperlyConfigured):
        # It's OK if Django settings aren't configured. Accessing
        # unconfigured settings raises ImproperlyConfigured in Django;
        # ImportError/EnvironmentError are kept from the original handler.
        pass
| | 56 | |
| | 236 | |
def assertNoBrokenLinks(self, response, internal_only=True):
    """
    Assert that every anchor link found in ``response`` can be followed.

    Internal links (href values that do not start with http:// or
    https://) are requested through ``response.client`` and must answer
    with one of the statuses 200, 301, 302 or 304. Redirects are accepted
    as-is; their targets are not followed. Fragment-only links such as
    ``#header`` point back into the same page and are skipped.

    When ``internal_only`` is False, external links (http:// or https://)
    are additionally opened over the network with ``urllib2``; any error
    raised while opening one fails the test.

    Known limitations:
     * Absolute URLs that point at the local server (for example
       http://localhost:8000) are treated as external links.
     * Only http:// and https:// count as external; links using other
       schemes (ftp://, mailto:, ...) are picked up by the internal
       pattern and requested through the test client.
    """
    non_broken_status_codes = (200, 301, 302, 304)

    # Check the internal links first.
    for link in anchor_internal_re.findall(response.content):
        if link.startswith('#'):
            # Fragment-only link: it targets the current page, so there is
            # no separate resource to request.
            continue
        link_response = response.client.get(link)
        self.assertTrue(
            link_response.status_code in non_broken_status_codes,
            u"Link '%(link)s' returned a status of %(status)s" % {
                'link': link,
                'status': link_response.status_code,
            }
        )

    if internal_only:
        return

    # Then check the external links. urllib2 is imported lazily because it
    # is only needed when external checking was requested.
    import urllib2

    headers = {
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "en-us,en;q=0.5",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Connection": "close",
        "User-Agent": URL_VALIDATOR_USER_AGENT,
    }

    for link in anchor_external_re.findall(response.content):
        try:
            req = urllib2.Request(link, None, headers)
            opened = urllib2.urlopen(req)
        except ValueError:
            self.fail(u'The URL %s appears to be invalid.' % link)
        except Exception:  # urllib2.URLError, httplib.InvalidURL, etc.
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not reported as broken links.
            self.fail(u'This URL %s appears to be a broken link.' % link)
        else:
            # Only reachability matters; release the connection right away.
            opened.close()
| | 291 | No newline at end of file |