3 éve · 0fd376a6f8
--- a/waybackproxy.py
+++ b/waybackproxy.py
@@ -1,5 +1,5 @@
 
				 #!/usr/bin/env python3
			
 
				-import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
			
 
				+import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, traceback, urllib.request, urllib.error, urllib.parse
			
 
				 from config import *
			
 
				 
			
 
				 class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
			
@@ -183,18 +183,23 @@ class Handler(socketserver.BaseRequestHandler):
 
				 
			
 
				 			conn = urllib.request.urlopen(request_url)
			
 
				 		except urllib.error.HTTPError as e:
			
 
				-			# an error has been found
			
 
				+			# An HTTP error has been found.
			
 
				 
			
 
				 			if e.code in (403, 404, 412): # not found or tolerance exceeded
			
 
				-				# heuristically determine the static URL for some redirect scripts
			
 
				-				match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\\.(?:[^/%]+))(?:%2F|/).+)''', archived_url, re.I)
			
 
				+				# Heuristically determine the static URL for some redirect scripts.
			
 
				+				match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:%2F|/).+)''', archived_url, re.I) # URL in path
			
 
				 				if not match:
			
 
				-					match = re.search('''(?:\\?|&)(?:[^=]+)=((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\\.(?:[^/%]+))?(?:%2F|/)[^&]+)''', archived_url, re.I)
			
 
				-				if match: # found it
			
 
				+					match = re.search('''[\\?&][^=]+=((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', archived_url, re.I) # URL in query string
			
 
				+				if match: # found URL
			
 
				+					# Decode the URL.
			
 
				 					new_url = urllib.parse.unquote_plus(match.group(1))
			
 
				-					if new_url[0] != '/' and '://' not in new_url: # add protocol if the URL is absolute but missing a protocol
			
 
				+
			
 
				+					# Add protocol if the URL is absolute but missing a protocol.
			
 
				+					if new_url[0] != '/' and '://' not in new_url:
			
 
				 						new_url = 'http://' + new_url
			
 
				-					_print('[r]', new_url)
			
 
				+
			
 
				+					# Redirect client to the URL.
			
 
				+					_print('[r] [g]', new_url)
			
 
				 					return self.send_redirect_page(http_version, new_url)
			
 
				 			elif e.code in (301, 302): # urllib-generated error about an infinite redirect loop
			
 
				 				_print('[!] Infinite redirect loop')
			
@@ -210,10 +215,11 @@ class Handler(socketserver.BaseRequestHandler):
 
				 			else:
			
 
				 				return self.send_error_page(http_version, e.code, e.reason)
			
 
				 		except socket.timeout as e:
			
 
				-			_print('Timeout')
			
 
				+			_print('[!] Timeout')
			
 
				 		except:
			
 
				-			_print('Generic exception')
			
 
				-		
			
 
				+			_print('[!] Generic exception:')
			
 
				+			traceback.print_exc()
			
 
				+
			
 
				 		# get content type
			
 
				 		content_type = conn.info().get('Content-Type')
			
 
				 		if content_type == None:
			
@@ -383,31 +389,31 @@ class Handler(socketserver.BaseRequestHandler):
 
				 
			
 
				 		response = http_version
			
 
				 
			
 
				-		# pass the error code if there is one
			
 
				+		# Pass the error code if there is one.
			
 
				 		if isinstance(conn, urllib.error.HTTPError):
			
 
				 			response += '{0} {1}'.format(conn.code, conn.reason.replace('\n', ' '))
			
 
				 		else:
			
 
				 			response += '200 OK'
			
 
				 
			
 
				-		# add content type, and the ETag for caching
			
 
				+		# Add content type, and the ETag for caching.
			
 
				 		response += '\r\nContent-Type: ' + content_type + '\r\nETag: "' + request_url.replace('"', '') + '"\r\n'
			
 
				 
			
 
				-		# add X-Archive-Orig-* headers
			
 
				+		# Add X-Archive-Orig-* headers.
			
 
				 		headers = conn.info()
			
 
				 		for header in headers:
			
 
				 			if header.find('X-Archive-Orig-') == 0:
			
 
				 				orig_header = header[15:]
			
 
				-				# blacklist certain headers which may alter the client
			
 
				+				# Blacklist certain headers which may affect client behavior.
			
 
				 				if orig_header.lower() not in ('connection', 'location', 'content-type', 'content-length', 'etag', 'authorization', 'set-cookie'):
			
 
				 					response += orig_header + ': ' + headers[header] + '\r\n'
			
 
				 
			
 
				-		# finish and send the request
			
 
				+		# Finish and send the request.
			
 
				 		response += '\r\n'
			
 
				 		self.request.sendall(response.encode('ascii', 'ignore'))
			
 
				 	
			
 
				 	def send_error_page(self, http_version, code, reason):
			
 
				 		"""Generate an error page."""
			
 
				-		
			
 
				+
			
 
				 		# make error page
			
 
				 		errorpage  = '<html><head><title>{0} {1}</title>'.format(code, reason)
			
 
				 		# IE's same-origin policy throws "Access is denied." inside frames
			
@@ -418,7 +424,7 @@ class Handler(socketserver.BaseRequestHandler):
 
				 		errorpage += '<script language="javascript1.3">if (window.screenLeft != null) { eval(\'try { var frameElement = window.frameElement; } catch (e) { document.location.href = "about:blank"; }\'); }</script>'
			
 
				 		errorpage += '<script language="javascript">if (window.self != window.top && !(window.frameElement && window.frameElement.tagName == "FRAME")) { document.location.href = "about:blank"; }</script>'
			
 
				 		errorpage += '</head><body><h1>{0}</h1><p>'.format(reason)
			
 
				-		
			
 
				+
			
 
				 		# add code information
			
 
				 		if code in (404, 508): # page not archived or redirect loop
			
 
				 			errorpage += 'This page may not be archived by the Wayback Machine.'