Parcourir la source

More code cleanups

RichardG867 il y a 3 ans
Parent
commit
5609a8b6da
1 fichier modifié avec 79 ajouts et 67 suppressions
  1. 79 67
      waybackproxy.py

+ 79 - 67
waybackproxy.py

@@ -2,49 +2,52 @@
 import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
 from config import *
 
-# internal LRU dictionary for preserving URLs on redirect
-date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)
-
-# internal LRU dictionary for date availability
-availability_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024) if WAYBACK_API else None
-
 class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
 	"""TCPServer with ThreadingMixIn added."""
 	pass
 
 class Handler(socketserver.BaseRequestHandler):
 	"""Main request handler."""
+
+	def __init__(self, *args, **kwargs):
+		super().__init__(*args, **kwargs)
+
+		# internal LRU dictionary for preserving URLs on redirect
+		self.date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)
+
+		# internal LRU dictionary for date availability
+		self.availability_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024) if WAYBACK_API else None
+
 	def handle(self):
 		"""Handle a request."""
-		global availability_cache
-		
+
 		# readline is pretty convenient
 		f = self.request.makefile()
 		
 		# read request line
 		reqline = line = f.readline()
-		split = line.rstrip('\r\n').split(' ')
+		split = line.rstrip().split()
 		http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
-		
+
 		if split[0] != 'GET':
 			# only GET is implemented
 			return self.error_page(http_version, 501, 'Not Implemented')
-		
+
 		# read out the headers
 		request_host = None
 		pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
 		effective_date = DATE
 		auth = None
-		while line.rstrip('\r\n') != '':
+		while line.strip() != '':
 			line = f.readline()
 			ll = line.lower()
 			if ll[:6] == 'host: ':
-				pac_host = request_host = line[6:].rstrip('\r\n')
+				pac_host = request_host = line[6:].rstrip()
 				if ':' not in pac_host: # explicitly specify port if running on port 80
 					pac_host += ':80'
 			elif ll[:21] == 'x-waybackproxy-date: ':
 				# API for a personal project of mine
-				effective_date = line[21:].rstrip('\r\n')
+				effective_date = line[21:].rstrip()
 			elif ll[:21] == 'authorization: basic ':
 				# asset date code passed as username:password
 				auth = base64.b64decode(ll[21:])
@@ -62,19 +65,21 @@ class Handler(socketserver.BaseRequestHandler):
 			archived_url = split[1]
 		request_url = archived_url
 		parsed = urllib.parse.urlparse(request_url)
-		
+
 		# make a path
 		path = parsed.path
-		if parsed.query != '': path += '?' + parsed.query
-		if path == '': path == '/'
-		
+		if parsed.query:
+			path += '?' + parsed.query
+		elif path == '':
+			path == '/'
+
 		# get the hostname for later
 		host = parsed.netloc.split(':')
 		hostname = host[0]
 
 		# get cached date for redirects, if available
 		original_date = effective_date
-		effective_date = date_cache.get(effective_date + '\x00' + archived_url, effective_date)
+		effective_date = self.date_cache.get(effective_date + '\x00' + archived_url, effective_date)
 
 		# get date from username:password, if available
 		if auth:
@@ -84,19 +89,19 @@ class Handler(socketserver.BaseRequestHandler):
 		try:
 			if path in pac_file_paths:
 				# PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled
-				pac  = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n'''
-				pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
-				pac += b'''\r\n'''
-				pac += b'''function FindProxyForURL(url, host)\r\n'''
-				pac += b'''{\r\n'''
-				if not availability_cache:
-					pac += b'''	if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
-					pac += b'''	{\r\n'''
-					pac += b'''		return "DIRECT";\r\n'''
-					pac += b'''	}\r\n'''
-				pac += b'''	return "PROXY ''' + pac_host.encode('ascii', 'ignore') + b'''";\r\n'''
-				pac += b'''}\r\n'''
-				self.request.sendall(pac)
+				pac  = http_version + ''' 200 OK\r\n'''
+				pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
+				pac += '''\r\n'''
+				pac += '''function FindProxyForURL(url, host)\r\n'''
+				pac += '''{\r\n'''
+				if not self.availability_cache:
+					pac += '''	if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
+					pac += '''	{\r\n'''
+					pac += '''		return "DIRECT";\r\n'''
+					pac += '''	}\r\n'''
+				pac += '''	return "PROXY ''' + pac_host + '''";\r\n'''
+				pac += '''}\r\n'''
+				self.request.sendall(pac.encode('ascii', 'ignore'))
 				return
 			elif hostname == 'web.archive.org':
 				if path[:5] != '/web/':
@@ -123,7 +128,7 @@ class Handler(socketserver.BaseRequestHandler):
 
 				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)				
 
-			if availability_cache is not None:
+			if self.availability_cache is not None:
 				# are we requesting from Wayback?
 				split = request_url.split('/')
 
@@ -138,7 +143,7 @@ class Handler(socketserver.BaseRequestHandler):
 
 					# check availability LRU cache
 					availability_url = '/'.join(split[5:])
-					new_url = availability_cache.get(availability_url, None)
+					new_url = self.availability_cache.get(availability_url, None)
 					if new_url:
 						# in cache => replace URL immediately
 						request_url = new_url
@@ -169,21 +174,18 @@ class Handler(socketserver.BaseRequestHandler):
 		except urllib.error.HTTPError as e:
 			# an error has been found
 
-			if e.code in (403, 404, 412):
-				# 403, 404 or tolerance exceeded => heuristically determine the static URL for some redirect scripts
+			if e.code in (403, 404, 412): # not found or tolerance exceeded
+				# heuristically determine the static URL for some redirect scripts
 				match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\\.(?:[^/%]+))(?:%2F|/).+)''', archived_url, re.I)
 				if not match:
 					match = re.search('''(?:\\?|&)(?:[^=]+)=((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\\.(?:[^/%]+))?(?:%2F|/)[^&]+)''', archived_url, re.I)
-				if match:
-					# we found it
+				if match: # found it
 					new_url = urllib.parse.unquote_plus(match.group(1))
-					# add protocol if the URL is absolute but missing a protocol
-					if new_url[0] != '/' and '://' not in new_url:
+					if new_url[0] != '/' and '://' not in new_url: # add protocol if the URL is absolute but missing a protocol
 						new_url = 'http://' + new_url
 					_print('[r]', new_url)
 					return self.redirect_page(http_version, new_url)
-			elif e.code in (301, 302):
-				# 301 or 302 => urllib-generated error about an infinite redirect loop
+			elif e.code in (301, 302): # urllib-generated error about an infinite redirect loop
 				_print('[!] Infinite redirect loop')
 				return self.error_page(http_version, 508, 'Infinite Redirect Loop')
 
@@ -196,16 +198,22 @@ class Handler(socketserver.BaseRequestHandler):
 				conn = e
 			else:
 				return self.error_page(http_version, e.code, e.reason)
-		
+
 		# get content type
 		content_type = conn.info().get('Content-Type')
-		if content_type == None: content_type = 'text/html'
-		if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
-		
+		if content_type == None:
+			content_type = 'text/html'
+		elif not CONTENT_TYPE_ENCODING:
+			idx = content_type.find(';')
+			if idx > -1:
+				content_type = content_type[:idx]
+
 		# set the mode: [0]wayback [1]oocities
-		mode = 0
-		if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1
-		
+		if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
+			mode = 1
+		else:
+			mode = 0
+
 		# Wayback will add its HTML to anything it thinks is HTML
 		guessed_content_type = conn.info().get('X-Archive-Guessed-Content-Type')
 		if not guessed_content_type:
@@ -262,7 +270,8 @@ class Handler(socketserver.BaseRequestHandler):
 								return self.error_page(http_version, e.code, e.reason)
 
 						content_type = conn.info().get('Content-Type')
-						if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
+						if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1:
+							content_type = content_type[:content_type.find(';')]
 						data = conn.read()
 
 				if b'<title></title>' in data and b'<h1><span>Internet Archive\'s Wayback Machine</span></h1>' in data:
@@ -275,16 +284,16 @@ class Handler(socketserver.BaseRequestHandler):
 						except:
 							redirect_code = 302
 						archived_url = match.group(2).decode('ascii', 'ignore')
-						date_cache[effective_date + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')
+						self.date_cache[effective_date + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')
 						print('[r]', archived_url)
 						return self.redirect_page(http_version, archived_url, redirect_code)
 
 				# pre-toolbar scripts and CSS
-				data = re.sub(b'''<script src="//archive\\.org/(?:.*)<!-- End Wayback Rewrite JS Include -->(?:\r)?\n''', b'', data, flags=re.S)
+				data = re.sub(b'''<script src="//archive\\.org/.*<!-- End Wayback Rewrite JS Include -->\\r?\\n''', b'', data, flags=re.S)
 				# toolbar
-				data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->(?:.*)<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, flags=re.S)
+				data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, flags=re.S)
 				# comments on footer
-				data = re.sub(b'''<!--(?:\r)?\n     FILE ARCHIVED (?:.*)$''', b'', data, flags=re.S)
+				data = re.sub(b'''<!--\\r?\\n     FILE ARCHIVED .*$''', b'', data, flags=re.S)
 				# fix base tag
 				data = re.sub(b'''(<base (?:[^>]*)href=(?:["\'])?)(?:(?:https?:)?//web.archive.org)?/web/(?:[^/]+)/''', b'\\1', data, flags=re.I + re.S)
 
@@ -304,24 +313,24 @@ class Handler(socketserver.BaseRequestHandler):
 					# sees an iframe pointing to an invalid URL.
 					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:]+)://',
 						QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
-					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)
+					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)/', b'', data) # non-asset
 				else:
 					# Remove asset URLs while simultaneously adding them to the
 					# LRU cache with their respective date.
 					def add_to_date_cache(match):
 						orig_url = match.group(2)
-						date_cache[effective_date + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore')
+						self.date_cache[effective_date + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore')
 						return orig_url
-					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\'#<>]+)', add_to_date_cache, data)
+					data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
 			elif mode == 1: # oocities
 				# viewport/cache-control/max-width code (header)
-				data = re.sub(b'''^(?:.*?)\n\n''', b'', data, flags=re.S)
+				data = re.sub(b'''^.*?\n\n''', b'', data, flags=re.S)
 				# archive notice and tracking code (footer)
-				data = re.sub(b'''<style> \n.zoomout { -webkit-transition: (?:.*)$''', b'', data, flags=re.S)
+				data = re.sub(b'''<style> \n.zoomout { -webkit-transition: .*$''', b'', data, flags=re.S)
 				# clearly labeled snippets from Geocities
-				data = re.sub(b'''^(?:.*)<\\!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
-				data = re.sub(b'''<\\!-- following code added by server\\. PLEASE REMOVE -->(?:.*)<\!-- preceding code added by server\. PLEASE REMOVE -->''', b'', data, flags=re.S)
-				data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->(?:.*)$''', b'', data, flags=re.S)
+				data = re.sub(b'''^.*<\\!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
+				data = re.sub(b'''<\\!-- following code added by server\\. PLEASE REMOVE -->.*<\!-- preceding code added by server\. PLEASE REMOVE -->''', b'', data, flags=re.S)
+				data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)
 
 				# fix links
 				data = re.sub(b'''//([^.]*)\\.oocities\\.com/''', b'//\\1.geocities.com/', data, flags=re.S)
@@ -434,8 +443,8 @@ class Handler(socketserver.BaseRequestHandler):
 			
 			if 'date' in parsed and DATE != parsed['date'][0]:
 				DATE = parsed['date'][0]
-				date_cache.clear()
-				availability_cache.clear()
+				self.date_cache.clear()
+				self.availability_cache.clear()
 			if 'dateTolerance' in parsed and DATE_TOLERANCE != parsed['dateTolerance'][0]:
 				DATE_TOLERANCE = parsed['dateTolerance'][0]
 			GEOCITIES_FIX = 'gcFix' in parsed
@@ -452,11 +461,14 @@ class Handler(socketserver.BaseRequestHandler):
 		settingspage += '"><p>Date tolerance: <input type="text" name="dateTolerance" size="8" value="'
 		settingspage += DATE_TOLERANCE
 		settingspage += '"> days<br><input type="checkbox" name="gcFix"'
-		if GEOCITIES_FIX: settingspage += ' checked'
+		if GEOCITIES_FIX:
+			settingspage += ' checked'
 		settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
-		if QUICK_IMAGES: settingspage += ' checked'
+		if QUICK_IMAGES:
+			settingspage += ' checked'
 		settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
-		if CONTENT_TYPE_ENCODING: settingspage += ' checked'
+		if CONTENT_TYPE_ENCODING:
+			settingspage += ' checked'
 		settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
 		self.request.send(settingspage.encode('utf8', 'ignore'))
 		self.request.close()