
Strip HTTPS URLs + further code cleanups

RichardG867, 3 years ago
Commit 09e57f66d4
1 changed file with 90 additions and 62 deletions

waybackproxy.py

@@ -95,10 +95,10 @@ class Handler(socketserver.BaseRequestHandler):
 		if auth:
 			effective_date = auth.replace(':', '')
 
-		# effectively handle the request
+		# Effectively handle the request.
 		try:
 			if path in pac_file_paths:
-				# PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled
+				# PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled.
 				pac  = http_version + ''' 200 OK\r\n'''
 				pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
 				pac += '''\r\n'''
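
For context, a PAC (proxy auto-config) file is a small JavaScript function the browser evaluates per request to decide whether to go through the proxy. The actual PAC body is outside this hunk; below is a minimal sketch of assembling such a response in the same string-building style, with a hypothetical FindProxyForURL body and proxy address.

    # Sketch only: the FindProxyForURL logic and the proxy host/port are
    # hypothetical placeholders, not the project's actual values.
    def build_pac_response(http_version, proxy_host='127.0.0.1', proxy_port=8888):
        pac  = http_version + ' 200 OK\r\n'
        pac += 'Content-Type: application/x-ns-proxy-autoconfig\r\n'
        pac += '\r\n'
        pac += ('function FindProxyForURL(url, host) {\n'
                '  // Let web.archive.org asset requests bypass the proxy.\n'
                '  if (host == "web.archive.org") return "DIRECT";\n'
                '  return "PROXY %s:%d";\n'
                '}\n') % (proxy_host, proxy_port)
        return pac
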
@@ -127,39 +127,40 @@ class Handler(socketserver.BaseRequestHandler):
 					archived_url = '/'.join(split[5:])
 					_print('[>] [QI]', archived_url)
 			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
-				# apply GEOCITIES_FIX and pass it through
+				# Apply GEOCITIES_FIX and pass it through.
 				_print('[>]', archived_url)
 
 				split = archived_url.split('/')
 				hostname = split[2] = 'www.oocities.org'
 				request_url = '/'.join(split)
 			else:
-				# get from Wayback
+				# Get from the Wayback Machine.
 				_print('[>]', archived_url)
 
 				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)				
 
+			# Check Wayback Machine Availability API where applicable, to avoid archived 404 pages and other site errors.
 			if self.shared_state.availability_cache != None:
-				# are we requesting from Wayback?
+				# Are we requesting from the Wayback Machine?
 				split = request_url.split('/')
 
-				# if so, get the closest available date from Wayback's API, to avoid archived 404 pages and other site errors
+				# If so, get the closest available date from the API.
 				if split[2] == 'web.archive.org':
-					# remove extraneous :80 from URL
+					# Remove extraneous :80 from URL.
 					if ':' in split[5]:
 						if split[7][-3:] == ':80':
 							split[7] = split[7][:-3]
 					elif split[5][-3:] == ':80':
 						split[5] = split[5][:-3]
 
-					# check availability LRU cache
+					# Check availability LRU cache.
 					availability_url = '/'.join(split[5:])
 					new_url = self.shared_state.availability_cache.get(availability_url, None)
 					if new_url:
-						# in cache => replace URL immediately
+						# In cache => replace URL immediately.
 						request_url = new_url
 					else:
-						# not in cache => contact API
+						# Not in cache => contact API.
 						try:
 							availability = json.loads(urllib.request.urlopen('https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14], timeout=10).read())
 							closest = availability.get('archived_snapshots', {}).get('closest', {})
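
The urlopen call above hits the public Wayback Machine Availability API, which returns the snapshot closest to a given timestamp as JSON. A standalone sketch of that lookup, with the diff's error handling reduced to a single catch-all:

    import json
    import urllib.parse
    import urllib.request

    def closest_snapshot(url, timestamp):
        """Return (snapshot_url, snapshot_timestamp) for the capture closest
        to `timestamp` (YYYYMMDDhhmmss), or None if unavailable."""
        api_url = ('https://archive.org/wayback/available?url='
                   + urllib.parse.quote_plus(url)
                   + '&timestamp=' + timestamp)
        try:
            availability = json.loads(urllib.request.urlopen(api_url, timeout=10).read())
        except Exception:
            return None
        closest = availability.get('archived_snapshots', {}).get('closest', {})
        if closest.get('available'):
            return closest['url'], closest['timestamp']
        return None
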
@@ -169,34 +170,30 @@ class Handler(socketserver.BaseRequestHandler):
 							new_date = None
 
 						if new_date and new_date != effective_date[:14]:
-							# returned date is different
+							# Returned date is different.
 							new_url = closest['url']
 
-							# add asset tag if one is present in the original URL
+							# Add asset tag if one is present in the original URL.
 							if len(effective_date) > 14:
 								split = new_url.split('/')
 								split[4] += effective_date[14:]
 								new_url = '/'.join(split)
 
-							# replace URL and add it to the availability cache
+							# Replace URL and add it to the availability cache.
 							request_url = self.shared_state.availability_cache[availability_url] = new_url
 
+			# Start fetching the URL.
 			conn = urllib.request.urlopen(request_url)
 		except urllib.error.HTTPError as e:
-			# An HTTP error has been found.
-
+			# An HTTP error has occurred.
 			if e.code in (403, 404, 412): # not found or tolerance exceeded
 				# Heuristically determine the static URL for some redirect scripts.
-				match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:%2F|/).+)''', archived_url, re.I) # URL in path
+				match = re.search('''[^/]/((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:%2F|/).+)''', archived_url, re.I) # URL in path
 				if not match:
-					match = re.search('''[\\?&][^=]+=((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', archived_url, re.I) # URL in query string
+					match = re.search('''[\\?&][^=]+=((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', archived_url, re.I) # URL in query string
 				if match: # found URL
-					# Decode the URL.
-					new_url = urllib.parse.unquote_plus(match.group(1))
-
-					# Add protocol if the URL is absolute but missing a protocol.
-					if new_url[0] != '/' and '://' not in new_url:
-						new_url = 'http://' + new_url
+					# Decode and sanitize the URL.
+					new_url = self.sanitize_redirect(urllib.parse.unquote_plus(match.group(1)))
 
 					# Redirect client to the URL.
 					_print('[r] [g]', new_url)
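
The two patterns above (now matching https as well as http) try to recover a destination URL embedded in a dead redirect script, first from the path, then from the query string; the result is then passed through the new sanitize_redirect. A sketch exercising the same regexes on hypothetical inputs:

    import re
    import urllib.parse

    # The same path and query-string patterns as in the hunk above.
    PATH_RE  = re.compile(r'[^/]/((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\.[^/%]+)(?:%2F|/).+)', re.I)
    QUERY_RE = re.compile(r'[\?&][^=]+=((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\.[^/%]+)?(?:%2F|/)[^&]+)', re.I)

    def extract_redirect_target(archived_url):
        match = PATH_RE.search(archived_url) or QUERY_RE.search(archived_url)
        return urllib.parse.unquote_plus(match.group(1)) if match else None

    # Hypothetical redirect script URLs:
    print(extract_redirect_target('example.com/redir/https%3A%2F%2Fwww.foo.com%2Fpage'))
    print(extract_redirect_target('example.com/go.cgi?url=http://www.bar.com/page&id=1'))
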
@@ -215,14 +212,16 @@ class Handler(socketserver.BaseRequestHandler):
 			else:
 				return self.send_error_page(http_version, e.code, e.reason)
 		except socket.timeout as e:
+			# A timeout has occurred.
 			_print('[!] Fetch timeout')
 			return self.send_error_page(http_version, 504, 'Gateway Timeout')
 		except:
+			# Some other fetch exception has occurred.
 			_print('[!] Fetch exception:')
 			traceback.print_exc()
 			return self.send_error_page(http_version, 502, 'Bad Gateway')
 
-		# get content type
+		# Get content type.
 		content_type = conn.info().get('Content-Type')
 		if content_type == None:
 			content_type = 'text/html'
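
The surrounding code defaults a missing Content-Type to text/html and strips any parameters from it. A minimal standalone equivalent, assuming the elided context line computes idx with find(';'):

    def base_content_type(header_value):
        """Return the bare media type; default to text/html when the
        upstream response has no Content-Type header."""
        if header_value is None:
            return 'text/html'
        idx = header_value.find(';')  # drop parameters such as "; charset=utf-8"
        return header_value[:idx] if idx > -1 else header_value

    assert base_content_type(None) == 'text/html'
    assert base_content_type('text/html; charset=iso-8859-1') == 'text/html'
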
@@ -231,11 +230,11 @@ class Handler(socketserver.BaseRequestHandler):
 			if idx > -1:
 				content_type = content_type[:idx]
 
-		# set the mode: [0]wayback [1]oocities
+		# Set the archive mode.
 		if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
-			mode = 1
+			mode = 1 # oocities
 		else:
-			mode = 0
+			mode = 0 # Wayback Machine
 
 		# Check content type to determine if this is HTML we need to patch.
 		# Wayback will add its HTML to anything it thinks is HTML.
@@ -243,7 +242,7 @@ class Handler(socketserver.BaseRequestHandler):
 		if not guessed_content_type:
 			guessed_content_type = content_type
 		if 'text/html' in guessed_content_type:
-			# Some dynamically generated links may end up pointing to
+			# Some dynamically-generated links may end up pointing to
 			# web.archive.org. Correct that by redirecting the Wayback
 			# portion of the URL away if it ends up being HTML consumed
 			# through the QUICK_IMAGES interface.
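
This relies on a content type guessed independently of the upstream header, since Wayback wraps anything it believes is HTML. The guess itself happens outside this hunk; a sketch assuming extension-based detection with the standard mimetypes module:

    import mimetypes

    def guess_content_type(archived_url, fallback):
        """Guess a media type from the URL's extension, falling back to the
        upstream Content-Type when the extension is unknown. (Hypothetical
        stand-in for the guess performed outside this hunk.)"""
        guessed, _encoding = mimetypes.guess_type(archived_url.split('?')[0])
        return guessed or fallback

    print(guess_content_type('www.example.com/logo.gif', 'text/html'))    # image/gif
    print(guess_content_type('www.example.com/index.html', 'image/gif'))  # text/html
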
@@ -267,19 +266,25 @@ class Handler(socketserver.BaseRequestHandler):
 			data = conn.read()
 
 			# Patch the page.
-			if mode == 0: # wayback
+			if mode == 0: # Wayback Machine
+				# Check if this is a Wayback Machine page.
 				if b'<title>Wayback Machine</title>' in data:
-					if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data: # exclusion error (robots.txt?)
+					# Check if this is an exclusion (robots.txt?) error page.
+					if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data:
 						return self.send_error_page(http_version, 403, 'URL excluded')
 
+					# Check if this is a media playback iframe page.
+					# Some websites (especially ones that use frames)
+					# inexplicably render inside a media playback iframe.
+					# In that case, a simple redirect would result in a
+					# redirect loop, so fetch and render the URL instead.
 					match = re.search(b'''<iframe id="playback" src="((?:(?:https?:)?//web.archive.org)?/web/[^"]+)"''', data)
-					if match: # media playback iframe
-						# Some websites (especially ones that use frames)
-						# inexplicably render inside a media playback iframe.
-						# In that case, a simple redirect would result in a
-						# redirect loop. Download the URL and render it instead.
+					if match:
+						# Extract the content URL.
 						request_url = match.group(1).decode('ascii', 'ignore')
 						archived_url = '/'.join(request_url.split('/')[5:])
+
+						# Start fetching the URL.
 						_print('[f]', archived_url)
 						try:
 							conn = urllib.request.urlopen(request_url)
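
A sketch of the iframe extraction step in isolation, using the same pattern on a hypothetical Wayback page body; note the split index assumes the src attribute includes the web.archive.org host:

    import re

    PLAYBACK_RE = re.compile(
        rb'<iframe id="playback" src="((?:(?:https?:)?//web.archive.org)?/web/[^"]+)"')

    data = (b'<html><body><iframe id="playback" '
            b'src="//web.archive.org/web/19961220154510/http://www.example.com/">'
            b'</iframe></body></html>')
    match = PLAYBACK_RE.search(data)
    if match:
        request_url = match.group(1).decode('ascii', 'ignore')
        # Drop the //web.archive.org/web/<date>/ prefix to recover the original URL.
        archived_url = '/'.join(request_url.split('/')[5:])
        print(request_url, '->', archived_url)
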
@@ -304,25 +309,26 @@ class Handler(socketserver.BaseRequestHandler):
 							data = conn.read()
 						else:
 							# Pass non-HTML data through.
-							self.send_response_headers(conn, http_version, content_type, request_url)
-							while True:
-								data = conn.read(1024)
-								if not data: break
-								self.request.sendall(data)
-							self.request.close()
-							return
+							return self.send_passthrough(conn, http_version, content_type, request_url)
 
+				# Check if this is a Wayback Machine redirect page.
 				if b'<title></title>' in data and b'<span class="label style-scope media-button"><!---->Wayback Machine<!----></span>' in data:
 					match = re.search(b'''<p class="impatient"><a href="(?:(?:https?:)?//web\\.archive\\.org)?/web/([^/]+)/([^"]+)">Impatient\\?</a></p>''', data)
 					if match:
-						# This is a Wayback redirect page, follow the redirect.
-						match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
+						# Sanitize the URL.
+						archived_url = self.sanitize_redirect(match.group(2).decode('ascii', 'ignore'))
+
+						# Add URL to the date LRU cache.
+						self.shared_state.date_cache[str(effective_date) + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')
+
+						# Get the original HTTP redirect code.
+						match = re.search(b'''<p class="code shift red">Got an HTTP ([0-9]+)''', data)
 						try:
-							redirect_code = int(match2.group(1))
+							redirect_code = int(match.group(1))
 						except:
 							redirect_code = 302
-						archived_url = match.group(2).decode('ascii', 'ignore')
-						self.shared_state.date_cache[str(effective_date) + '\x00' + str(archived_url)] = match.group(1).decode('ascii', 'ignore')
+
+						# Redirect client to the URL.
 						_print('[r]', archived_url)
 						return self.send_redirect_page(http_version, archived_url, redirect_code)
 
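The status-code recovery above is tolerant by design: if the "Got an HTTP …" marker is absent or unparsable, the fallback is 302. A standalone sketch with narrower exception handling:

    import re

    def original_redirect_code(data):
        """Recover the archived redirect's HTTP status code from a Wayback
        interstitial page body; default to 302 when it cannot be parsed."""
        match = re.search(rb'<p class="code shift red">Got an HTTP ([0-9]+)', data)
        try:
            return int(match.group(1))
        except (AttributeError, ValueError):
            return 302

    assert original_redirect_code(b'<p class="code shift red">Got an HTTP 301 response') == 301
    assert original_redirect_code(b'<p>no status marker here</p>') == 302
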
@@ -333,10 +339,10 @@ class Handler(socketserver.BaseRequestHandler):
 				# Remove comments on footer.
 				data = re.sub(b'''<!--\\r?\\n     FILE ARCHIVED .*$''', b'', data, flags=re.S)
 				# Fix base tag.
-				data = re.sub(b'''(<base (?:[^>]*)href=(?:["\'])?)(?:(?:https?:)?//web.archive.org)?/web/(?:[^/]+)/''', b'\\1', data, flags=re.I + re.S)
+				data = re.sub(b'''(<base\\s+[^>]*href=["']?)(?:(?:https?:)?//web.archive.org)?/web/[^/]+/(?:[^:/]+://)?''', b'\\1http://', data, flags=re.I + re.S)
 
 				# Remove extraneous :80 from links.
-				data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^:]+)://([^:]+):80/', b'\\1\\2/\\3://\\4/', data)
+				data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^/:]+)://([^/:]+):80/', b'\\1\\2/\\3://\\4/', data)
 				# Fix links.
 				if QUICK_IMAGES:
 					# QUICK_IMAGES works by intercepting asset URLs (those
@@ -349,14 +355,19 @@ class Handler(socketserver.BaseRequestHandler):
 					# username:password, which taints less but is not supported
 					# by all browsers - IE notably kills the whole page if it
 					# sees an iframe pointing to an invalid URL.
-					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:]+)://',
-						QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
-					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)/', b'', data) # non-asset
+					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:/]+://)',
+						QUICK_IMAGES == 2 and b'\\3\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3', data)
+					def strip_https(match): # convert secure non-asset URLs to regular HTTP
+						first_component = match.group(1)
+						return first_component == b'https:' and b'http:' or first_component
+					data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/[^/]+/([^/]+)', strip_https, data)
 				else:
-					# Remove asset URLs while simultaneously adding them to the
-					# LRU cache with their respective date.
+					# Remove asset URLs while simultaneously adding them to the date LRU cache
+					# with their respective date and converting secure URLs to regular HTTP.
 					def add_to_date_cache(match):
 						orig_url = match.group(2)
+						if orig_url[:8] == b'https://':
+							orig_url = b'http://' + orig_url[8:]
 						self.shared_state.date_cache[str(effective_date) + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore')
 						return orig_url
 					data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
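
strip_https above uses re.sub's callable-replacement form: the function receives each match object and returns the replacement bytes, which lets the rewrite strip the web.archive.org prefix and downgrade https: links in one pass while leaving other schemes untouched. A reduced sketch of the same idiom:

    import re

    def strip_https(match):
        # Convert secure non-asset URLs to plain HTTP; keep other schemes as-is.
        first_component = match.group(1)
        return b'http:' if first_component == b'https:' else first_component

    data = b'<a href="/web/19970101000000/https://www.example.com/">link</a>'
    data = re.sub(rb'(?:(?:https?:)?//web.archive.org)?/web/[^/]+/([^/]+)', strip_https, data)
    print(data)  # b'<a href="http://www.example.com/">link</a>'
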
@@ -371,19 +382,24 @@ class Handler(socketserver.BaseRequestHandler):
 				data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)
 
 				# Fix links.
-				data = re.sub(b'''//([^.]*)\\.oocities\\.com/''', b'//\\1.geocities.com/', data, flags=re.S)
+				data = re.sub(b'''//([^\\.]*\\.)?oocities\\.com/''', b'//\\1geocities.com/', data, flags=re.S)
 
 			# Send patched page.
 			self.send_response_headers(conn, http_version, content_type, request_url)
 			self.request.sendall(data)
+			self.request.close()
 		else:
 			# Pass non-HTML data through.
-			self.send_response_headers(conn, http_version, content_type, request_url)
-			while True:
-				data = conn.read(1024)
-				if not data: break
-				self.request.sendall(data)
-		
+			self.send_passthrough(conn, http_version, content_type, request_url)
+
+	def send_passthrough(self, conn, http_version, content_type, request_url):
+		"""Pass data through to the client unmodified (save for our headers)."""
+		self.send_response_headers(conn, http_version, content_type, request_url)
+		while True:
+			data = conn.read(1024)
+			if not data:
+				break
+			self.request.sendall(data)
 		self.request.close()
 
 	def send_response_headers(self, conn, http_version, content_type, request_url):
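
The new send_passthrough helper deduplicates two copies of the same 1 KB copy loop, and returning its result lets callers bail out in one statement. Its core is an ordinary chunked stream copy; a sketch of that loop in isolation, fed from an in-memory source:

    import io

    def copy_stream(src, sendall, bufsize=1024):
        """Copy a file-like `src` to a socket-style sendall callable in
        fixed-size chunks, as send_passthrough does for response bodies."""
        while True:
            data = src.read(bufsize)
            if not data:
                break
            sendall(data)

    chunks = []
    copy_stream(io.BytesIO(b'x' * 3000), chunks.append)
    print([len(c) for c in chunks])  # [1024, 1024, 952]
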
@@ -516,7 +532,19 @@ class Handler(socketserver.BaseRequestHandler):
 		settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
 		self.request.send(settingspage.encode('utf8', 'ignore'))
 		self.request.close()
-	
+
+	def sanitize_redirect(self, url):
+		"""Sanitize a URL for client-side redirection."""
+		if url[0] != '/' and '://' not in url:
+			# Add protocol if the URL is absolute but missing a protocol.
+			return 'http://' + url
+		elif url[:8].lower() == 'https://':
+			# Convert secure URLs to regular HTTP.
+			return 'http://' + url[8:]
+		else:
+			# No changes required.
+			return url
+
 	def signature(self):
 		"""Return the server signature."""
 		return 'WaybackProxy on {0}'.format(socket.gethostname())
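
The new sanitize_redirect method carries the commit's headline behavior into client-side redirects: protocol-less absolute URLs gain http://, https:// is downgraded, and anything else passes through unchanged. A standalone copy with example inputs:

    def sanitize_redirect(url):
        """Standalone copy of the method added in this commit."""
        if url[0] != '/' and '://' not in url:
            # Add a protocol if the URL is absolute but missing one.
            return 'http://' + url
        elif url[:8].lower() == 'https://':
            # Convert secure URLs to regular HTTP.
            return 'http://' + url[8:]
        else:
            # No changes required.
            return url

    assert sanitize_redirect('www.example.com/page') == 'http://www.example.com/page'
    assert sanitize_redirect('HTTPS://www.example.com/') == 'http://www.example.com/'
    assert sanitize_redirect('/relative/path') == '/relative/path'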