# anchor_external_re matches the value of the href attribute for any
# external anchor link (ie. where the value of the href attribute begins
# with http:// or https://).
#
# The single capturing group is the whole href value, so findall() returns
# a flat list of URL strings.
anchor_external_re = re.compile(
    r'<a'
    r'.+?'  # anything after the opening tag up to the href
    r'href=(?:\'|")'  # The opening of the href attribute
    r'('  # group the value of the href attribute
    r'https?://'  # http:// or https://
    r'(?:'
    # domain; TLDs may be longer than 6 letters (e.g. .technology), and a
    # DNS label is at most 63 characters long.
    r'(?:[A-Z0-9-]+\.)+[A-Z]{2,63}'
    r'|localhost'  # or localhost
    r'|127\.0\.0\.1'  # or loopback (dots escaped so they only match a literal '.')
    r')'
    r'(?::\d+)?'  # optional port
    r'(?:/?|/\S+)'  # followed by either a / or nothing or /nonspacechars.html
    r')'  # finish matching at the end of the href value.
    r'(?:\'|")',  # The closing quote of the href attribute
    re.IGNORECASE
)
| 34 | |
# anchor_internal_re captures the href value of every anchor whose target
# is internal, i.e. whose href value does not begin with http:// or
# https://. As there is a single capturing group, findall() yields a flat
# list of href strings. Note that this also picks up page-internal
# fragments such as #header.
anchor_internal_re = re.compile(
    # "<a", then lazily skip ahead to the href attribute and its opening quote
    r'<a' r'.+?' r'href=(?:\'|")'
    # captured href value: must not start with http:// or https://, and is
    # made up of non-whitespace characters
    r'(' r'(?!https?://)' r'(?:\S+)' r')'
    # the closing quote of the href attribute
    r'(?:\'|")',
    flags=re.I
)
| 49 | |
# User-Agent header sent when checking external links. Prefer the value
# from the Django settings and fall back to a fixed default otherwise.
try:
    from django.conf import settings
    URL_VALIDATOR_USER_AGENT = settings.URL_VALIDATOR_USER_AGENT
except (ImportError, EnvironmentError, AttributeError):
    # It's OK if Django settings aren't configured, or if the settings
    # object simply doesn't define URL_VALIDATOR_USER_AGENT
    # (AttributeError): use the default rather than failing at import time.
    URL_VALIDATOR_USER_AGENT = 'Django (http://www.djangoproject.com/)'
| 56 | |
| 236 | |
def assertNoBrokenLinks(self, response, internal_only=True):
    """
    Asserts that all the links within the response, when followed, return
    a non-broken status: a valid page (200), a redirect (301 or 302) or
    not-modified (304).

    Internal links (those matched by anchor_internal_re) are requested
    with ``response.client.get()`` and their status code is checked.
    When ``internal_only`` is False, external links (those matched by
    anchor_external_re) are additionally opened with urllib2; any
    exception raised while opening one fails the test.

    The test fails on the first broken link found.

    Current issues/thoughts:
    * How to handle absolute paths to http://localhost:8000 for eg.
    * The re's will also find page-internal links like #header
    * Should we follow 302's to verify the page redirects to a 200 result?
    * Do we need to include other protocols for external links
      (currently anchor_external_re assumes href value begins with
      http:// or https://, but what about ftp:// etc.?)
    """
    non_broken_status_codes = (200, 301, 302, 304)

    # Check the internal links first:
    for link in anchor_internal_re.findall(response.content):
        link_response = response.client.get(link)
        self.assertTrue(
            link_response.status_code in non_broken_status_codes,
            (u"Link '%(link)s' returned a status of %(status)s") % {
                'link': link,
                'status': link_response.status_code
            }
        )

    if internal_only:
        return

    # Then check the external links. urllib2 is imported lazily because it
    # is only needed when external links are checked.
    import urllib2

    headers = {
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "en-us,en;q=0.5",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Connection": "close",
        "User-Agent": URL_VALIDATOR_USER_AGENT,
    }

    for link in anchor_external_re.findall(response.content):
        try:
            req = urllib2.Request(link, None, headers)
            u = urllib2.urlopen(req)
        except ValueError:
            self.fail(u'The URL %s appears to be invalid.' % link)
        except Exception:  # urllib2.URLError, httplib.InvalidURL, etc.
            # Exception rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not reported as broken links.
            self.fail(u'This URL %s appears to be a broken link.' % link)
        else:
            # Only reachability matters; release the connection.
            u.close()
| 291 | No newline at end of file |