Forráskód Böngészése

Add support for "excluded from the Wayback Machine" error

Test case: www.tomshardware.com
RichardG867 5 éve
szülő
commit
9dfd5cf95b
1 módosított fájl, 6 hozzáadás és 2 törlés
  1. waybackproxy.py (+6 −2)

+ 6 - 2
waybackproxy.py

@@ -236,6 +236,10 @@ class Handler(socketserver.BaseRequestHandler):
 			# patch the page
 			if mode == 0: # wayback
 				if b'<title>Wayback Machine</title>' in data:
+					if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data:
+						# exclusion error (robots.txt?)
+						return self.error_page(http_version, 403, 'URL excluded')
+
 					match = re.search(b'<iframe id="playback" src="((?:(?:http(?:s)?:)?//web.archive.org)?/web/[^"]+)"', data)
 					if match:
 						# media playback iframe
@@ -380,8 +384,8 @@ class Handler(socketserver.BaseRequestHandler):
 		# add code information
 		if code in (404, 508): # page not archived or redirect loop
 			errorpage += 'This page may not be archived by the Wayback Machine.'
-		elif code == 403: # not crawled due to robots.txt
-			errorpage += 'This page was not archived due to a robots.txt block.'
+		elif code == 403: # not crawled due to exclusion
+			errorpage += 'This page was not archived due to a Wayback Machine exclusion.'
 		elif code == 501: # method not implemented
 			errorpage += 'WaybackProxy only implements the GET method.'
 		elif code == 412: # outside of tolerance