2 năm trước cách đây · 7a9e4dd408
--- a/waybackproxy.py
+++ b/waybackproxy.py
@@ -200,8 +200,44 @@ class Handler(socketserver.BaseRequestHandler):
 
				 						request_url = self.shared_state.availability_cache[availability_url] = new_url
			
 
				 
			
 
				 			# Start fetching the URL.
			
 
				-			retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=5, backoff_factor=1, raise_on_redirect=False)
			
 
				-			conn = self.shared_state.http.urlopen('GET', request_url, retries=retry, preload_content=False)
			
 
				+			retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=0, backoff_factor=1)
			
 
				+			while True:
			
 
				+				conn = self.shared_state.http.urlopen('GET', request_url, redirect=False, retries=retry, preload_content=False)
			
 
				+
			
 
				+				# Check for redirects.
			
 
				+				destination = conn.get_redirect_location()
			
 
				+				if destination:
			
 
				+					conn.drain_conn()
			
 
				+					conn.release_conn()
			
 
				+
			
 
				+					# Check if the redirect goes to a different Wayback URL.
			
 
				+					match = re.search('''(?:(?:https?:)?//web.archive.org)?/web/([^/]+/)(.+)''', destination)
			
 
				+					if match:
			
 
				+						archived_dest = match.group(2)
			
 
				+
			
 
				+						# Add missing protocol, just in case.
			
 
				+						split = archived_dest.split('/')
			
 
				+						if split[0][-1:] != ':':
			
 
				+							split = ['http:', ''] + split
			
 
				+
			
 
				+						# Remove extraneous :80 from URL.
			
 
				+						if split[2][-3:] == ':80':
			
 
				+							split[2] = split[2][:-3]
			
 
				+
			
 
				+						# Check if the archived URL is different.
			
 
				+						if archived_dest != archived_url:
			
 
				+							# Add destination to availability cache and redirect the client.
			
 
				+							_print('[r]', archived_dest)
			
 
				+							new_url = '/'.join(split)
			
 
				+							self.shared_state.availability_cache[archived_dest] = 'http://web.archive.org/web/' + match.group(1) + archived_dest
			
 
				+							return self.send_redirect_page(http_version, archived_dest, conn.status)
			
 
				+
			
 
				+					# Not an archived URL or same URL, redirect ourselves.
			
 
				+					request_url = destination
			
 
				+					continue
			
 
				+
			
 
				+				# Not a redirect, move on.
			
 
				+				break
			
 
				 		except urllib3.exceptions.MaxRetryError as e:
			
 
				 			_print('[!] Fetch retries exceeded:', e.reason)
			
 
				 			return self.send_error_page(http_version, 504, 'Gateway Timeout')
			
@@ -215,12 +251,13 @@ class Handler(socketserver.BaseRequestHandler):
 
				 		if conn.status != 200:
			
 
				 			if conn.status in (403, 404): # not found
			
 
				 				if self.guess_and_send_redirect(http_version, archived_url):
			
 
				+					conn.drain_conn()
			
 
				 					conn.release_conn()
			
 
				 					return
			
 
				-			elif conn.status in (301, 302): # urllib3-generated error about an infinite redirect loop
			
 
				-				conn.release_conn()
			
 
				-				_print('[!] Infinite redirect loop')
			
 
				-				return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
			
 
				+			#elif conn.status in (301, 302): # redirect loop detection currently unused
			
 
				+			#	conn.drain_conn()
			
 
				+			#	conn.release_conn()
			
 
				+			#	return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
			
 
				 
			
 
				 			if conn.status != 412: # tolerance exceeded has its own error message above
			
 
				 				_print('[!]', conn.status, conn.reason)
			
@@ -228,6 +265,7 @@ class Handler(socketserver.BaseRequestHandler):
 
				 			# If the memento Link header is present, this is a website error
			
 
				 			# instead of a Wayback error. Pass it along if that's the case.
			
 
				 			if 'Link' not in conn.headers:
			
 
				+				conn.drain_conn()
			
 
				 				conn.release_conn()
			
 
				 				return self.send_error_page(http_version, conn.status, conn.reason)
			
 
				 
			
@@ -257,6 +295,7 @@ class Handler(socketserver.BaseRequestHandler):
 
				 			# portion of the URL away if it ends up being HTML consumed
			
 
				 			# through the QUICK_IMAGES interface.
			
 
				 			if hostname == 'web.archive.org':
			
 
				+				conn.drain_conn()
			
 
				 				conn.release_conn()
			
 
				 				archived_url = '/'.join(request_url.split('/')[5:])
			
 
				 				_print('[r] [QI]', archived_url)
			
@@ -264,10 +303,11 @@ class Handler(socketserver.BaseRequestHandler):
 
				 
			
 
				 			# Check if the date is within tolerance.
			
 
				 			if DATE_TOLERANCE != None:
			
 
				-				match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl() or '')
			
 
				+				match = re.search('''(?://web\\.archive\\.org|^)/web/([0-9]+)''', conn.geturl() or '')
			
 
				 				if match:
			
 
				 					requested_date = match.group(1)
			
 
				 					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
			
 
				+						conn.drain_conn()
			
 
				 						conn.release_conn()
			
 
				 						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
			
 
				 						if not self.guess_and_send_redirect(http_version, archived_url):
			
@@ -307,6 +347,7 @@ class Handler(socketserver.BaseRequestHandler):
 
				 							# If the memento Link header is present, this is a website error
			
 
				 							# instead of a Wayback error. Pass it along if that's the case.
			
 
				 							if 'Link' not in conn.headers:
			
 
				+								conn.drain_conn()
			
 
				 								conn.release_conn()
			
 
				 								return self.send_error_page(http_version, conn.status, conn.reason)