Browse Source

Proxy-level handling for URL redirection + connection drainage fix

RichardG867 2 năm trước cách đây
mục cha
commit
7a9e4dd408
1 tập tin đã thay đổi với 48 bổ sung7 xóa
  1. 48 7
      waybackproxy.py

+ 48 - 7
waybackproxy.py

@@ -200,8 +200,44 @@ class Handler(socketserver.BaseRequestHandler):
 						request_url = self.shared_state.availability_cache[availability_url] = new_url
 
 			# Start fetching the URL.
-			retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=5, backoff_factor=1, raise_on_redirect=False)
-			conn = self.shared_state.http.urlopen('GET', request_url, retries=retry, preload_content=False)
+			retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=0, backoff_factor=1)
+			while True:
+				conn = self.shared_state.http.urlopen('GET', request_url, redirect=False, retries=retry, preload_content=False)
+
+				# Check for redirects.
+				destination = conn.get_redirect_location()
+				if destination:
+					conn.drain_conn()
+					conn.release_conn()
+
+					# Check if the redirect goes to a different Wayback URL.
+					match = re.search('''(?:(?:https?:)?//web.archive.org)?/web/([^/]+/)(.+)''', destination)
+					if match:
+						archived_dest = match.group(2)
+
+						# Add missing protocol, just in case.
+						split = archived_dest.split('/')
+						if split[0][-1:] != ':':
+							split = ['http:', ''] + split
+
+						# Remove extraneous :80 from URL.
+						if split[2][-3:] == ':80':
+							split[2] = split[2][:-3]
+
+						# Check if the archived URL is different.
+						if archived_dest != archived_url:
+							# Add destination to availability cache and redirect the client.
+							_print('[r]', archived_dest)
+							new_url = '/'.join(split)
+							self.shared_state.availability_cache[archived_dest] = 'http://web.archive.org/web/' + match.group(1) + archived_dest
+							return self.send_redirect_page(http_version, archived_dest, conn.status)
+
+					# Not an archived URL or same URL, redirect ourselves.
+					request_url = destination
+					continue
+
+				# Not a redirect, move on.
+				break
 		except urllib3.exceptions.MaxRetryError as e:
 			_print('[!] Fetch retries exceeded:', e.reason)
 			return self.send_error_page(http_version, 504, 'Gateway Timeout')
@@ -215,12 +251,13 @@ class Handler(socketserver.BaseRequestHandler):
 		if conn.status != 200:
 			if conn.status in (403, 404): # not found
 				if self.guess_and_send_redirect(http_version, archived_url):
+					conn.drain_conn()
 					conn.release_conn()
 					return
-			elif conn.status in (301, 302): # urllib3-generated error about an infinite redirect loop
-				conn.release_conn()
-				_print('[!] Infinite redirect loop')
-				return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
+			#elif conn.status in (301, 302): # redirect loop detection currently unused
+			#	conn.drain_conn()
+			#	conn.release_conn()
+			#	return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
 
 			if conn.status != 412: # tolerance exceeded has its own error message above
 				_print('[!]', conn.status, conn.reason)
@@ -228,6 +265,7 @@ class Handler(socketserver.BaseRequestHandler):
 			# If the memento Link header is present, this is a website error
 			# instead of a Wayback error. Pass it along if that's the case.
 			if 'Link' not in conn.headers:
+				conn.drain_conn()
 				conn.release_conn()
 				return self.send_error_page(http_version, conn.status, conn.reason)
 
@@ -257,6 +295,7 @@ class Handler(socketserver.BaseRequestHandler):
 			# portion of the URL away if it ends up being HTML consumed
 			# through the QUICK_IMAGES interface.
 			if hostname == 'web.archive.org':
+				conn.drain_conn()
 				conn.release_conn()
 				archived_url = '/'.join(request_url.split('/')[5:])
 				_print('[r] [QI]', archived_url)
@@ -264,10 +303,11 @@ class Handler(socketserver.BaseRequestHandler):
 
 			# Check if the date is within tolerance.
 			if DATE_TOLERANCE != None:
-				match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl() or '')
+				match = re.search('''(?://web\\.archive\\.org|^)/web/([0-9]+)''', conn.geturl() or '')
 				if match:
 					requested_date = match.group(1)
 					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
+						conn.drain_conn()
 						conn.release_conn()
 						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
 						if not self.guess_and_send_redirect(http_version, archived_url):
@@ -307,6 +347,7 @@ class Handler(socketserver.BaseRequestHandler):
 							# If the memento Link header is present, this is a website error
 							# instead of a Wayback error. Pass it along if that's the case.
 							if 'Link' not in conn.headers:
+								conn.drain_conn()
 								conn.release_conn()
 								return self.send_error_page(http_version, conn.status, conn.reason)