Forráskód Böngészése

Clean redirect heuristics regexes

RichardG867 3 éve
szülő
commit
0fd376a6f8
1 módosított fájl, 24 hozzáadás és 18 törlés
  1. 24 18
      waybackproxy.py

+ 24 - 18
waybackproxy.py

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
+import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, traceback, urllib.request, urllib.error, urllib.parse
 from config import *
 
 class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
@@ -183,18 +183,23 @@ class Handler(socketserver.BaseRequestHandler):
 
 			conn = urllib.request.urlopen(request_url)
 		except urllib.error.HTTPError as e:
-			# an error has been found
+			# An HTTP error has been found.
 
 			if e.code in (403, 404, 412): # not found or tolerance exceeded
-				# heuristically determine the static URL for some redirect scripts
-				match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\\.(?:[^/%]+))(?:%2F|/).+)''', archived_url, re.I)
+				# Heuristically determine the static URL for some redirect scripts.
+				match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:%2F|/).+)''', archived_url, re.I) # URL in path
 				if not match:
-					match = re.search('''(?:\\?|&)(?:[^=]+)=((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\\.(?:[^/%]+))?(?:%2F|/)[^&]+)''', archived_url, re.I)
-				if match: # found it
+					match = re.search('''[\\?&][^=]+=((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', archived_url, re.I) # URL in query string
+				if match: # found URL
+					# Decode the URL.
 					new_url = urllib.parse.unquote_plus(match.group(1))
-					if new_url[0] != '/' and '://' not in new_url: # add protocol if the URL is absolute but missing a protocol
+
+					# Add protocol if the URL is absolute but missing a protocol.
+					if new_url[0] != '/' and '://' not in new_url:
 						new_url = 'http://' + new_url
-					_print('[r]', new_url)
+
+					# Redirect client to the URL.
+					_print('[r] [g]', new_url)
 					return self.send_redirect_page(http_version, new_url)
 			elif e.code in (301, 302): # urllib-generated error about an infinite redirect loop
 				_print('[!] Infinite redirect loop')
@@ -210,10 +215,11 @@ class Handler(socketserver.BaseRequestHandler):
 			else:
 				return self.send_error_page(http_version, e.code, e.reason)
 		except socket.timeout as e:
-			_print('Timeout')
+			_print('[!] Timeout')
 		except:
-			_print('Generic exception')
-		
+			_print('[!] Generic exception:')
+			traceback.print_exc()
+
 		# get content type
 		content_type = conn.info().get('Content-Type')
 		if content_type == None:
@@ -383,31 +389,31 @@ class Handler(socketserver.BaseRequestHandler):
 
 		response = http_version
 
-		# pass the error code if there is one
+		# Pass the error code if there is one.
 		if isinstance(conn, urllib.error.HTTPError):
 			response += '{0} {1}'.format(conn.code, conn.reason.replace('\n', ' '))
 		else:
 			response += '200 OK'
 
-		# add content type, and the ETag for caching
+		# Add content type, and the ETag for caching.
 		response += '\r\nContent-Type: ' + content_type + '\r\nETag: "' + request_url.replace('"', '') + '"\r\n'
 
-		# add X-Archive-Orig-* headers
+		# Add X-Archive-Orig-* headers.
 		headers = conn.info()
 		for header in headers:
 			if header.find('X-Archive-Orig-') == 0:
 				orig_header = header[15:]
-				# blacklist certain headers which may alter the client
+				# Blacklist certain headers which may affect client behavior.
 				if orig_header.lower() not in ('connection', 'location', 'content-type', 'content-length', 'etag', 'authorization', 'set-cookie'):
 					response += orig_header + ': ' + headers[header] + '\r\n'
 
-		# finish and send the request
+		# Finish and send the request.
 		response += '\r\n'
 		self.request.sendall(response.encode('ascii', 'ignore'))
 	
 	def send_error_page(self, http_version, code, reason):
 		"""Generate an error page."""
-		
+
 		# make error page
 		errorpage  = '<html><head><title>{0} {1}</title>'.format(code, reason)
 		# IE's same-origin policy throws "Access is denied." inside frames
@@ -418,7 +424,7 @@ class Handler(socketserver.BaseRequestHandler):
 		errorpage += '<script language="javascript1.3">if (window.screenLeft != null) { eval(\'try { var frameElement = window.frameElement; } catch (e) { document.location.href = "about:blank"; }\'); }</script>'
 		errorpage += '<script language="javascript">if (window.self != window.top && !(window.frameElement && window.frameElement.tagName == "FRAME")) { document.location.href = "about:blank"; }</script>'
 		errorpage += '</head><body><h1>{0}</h1><p>'.format(reason)
-		
+
 		# add code information
 		if code in (404, 508): # page not archived or redirect loop
 			errorpage += 'This page may not be archived by the Wayback Machine.'