5 years ago · add5f0894a
--- a/waybackproxy.py
+++ b/waybackproxy.py
@@ -25,7 +25,7 @@ class Handler(socketserver.BaseRequestHandler):
 
				 			return self.error_page(http_version, 501, 'Not Implemented')
			
 
				 		
			
 
				 		# parse the URL
			
 
				-		request_url = split[1]
			
 
				+		request_url = archived_url = split[1]
			
 
				 		parsed = urllib.parse.urlparse(request_url)
			
 
				 		
			
 
				 		# make a path
			
@@ -83,7 +83,6 @@ class Handler(socketserver.BaseRequestHandler):
 
				 
			
 
				 					# did we get an username:password with an asset date code?
			
 
				 					if auth:
			
 
				-						archived_url = request_url
			
 
				 						request_url = 'http://web.archive.org/web/{0}/{1}'.format(auth.replace(':', ''), archived_url)
			
 
				 					else:
			
 
				 						archived_url = '/'.join(request_url.split('/')[5:])
			
@@ -99,22 +98,26 @@ class Handler(socketserver.BaseRequestHandler):
 
				 							raise e
			
 
				 			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
			
 
				 				# apply GEOCITIES_FIX and pass it through
			
 
				-				split = request_url.split('/')
			
 
				+				_print('[>] {0}'.format(archived_url))
			
 
				+
			
 
				+				split = archived_url.split('/')
			
 
				 				hostname = split[2] = 'www.oocities.org'
			
 
				 				request_url = '/'.join(split)
			
 
				 				
			
 
				-				_print('[>] {0}'.format(request_url))
			
 
				 				conn = urllib.request.urlopen(request_url)
			
 
				 			else:
			
 
				 				# get from Wayback
			
 
				-				_print('[>] {0}'.format(request_url))
			
 
				-				conn = urllib.request.urlopen('http://web.archive.org/web/{0}/{1}'.format(DATE, request_url))
			
 
				+				_print('[>] {0}'.format(archived_url))
			
 
				+
			
 
				+				request_url = 'http://web.archive.org/web/{0}/{1}'.format(DATE, archived_url)
			
 
				+
			
 
				+				conn = urllib.request.urlopen(request_url)
			
 
				 		except urllib.error.HTTPError as e:
			
 
				 			# an error has been found
			
 
				 
			
 
				 			# 403 or 404 => heuristically determine the static URL for some redirect scripts
			
 
				 			if e.code in (403, 404):
			
 
				-				match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to)?(?:url)?=(http[^&]+)''', request_url, re.IGNORECASE)
			
 
				+				match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to)?(?:url)?=(http[^&]+)''', archived_url, re.IGNORECASE)
			
 
				 				if match:
			
 
				 					# we found it
			
 
				 					new_url = urllib.parse.unquote_plus(match.group(1))
			
@@ -140,7 +143,7 @@ class Handler(socketserver.BaseRequestHandler):
 
				 			# through the QUICK_IMAGES interface.
			
 
				 			if hostname == 'web.archive.org':
			
 
				 				conn.close()
			
 
				-				return self.redirect_page(http_version, '/'.join(request_url.split('/')[5:]), 301)
			
 
				+				return self.redirect_page(http_version, '/'.join(archived_url.split('/')[5:]), 301)
			
 
				 
			
 
				 			# consume all data
			
 
				 			data = conn.read()
			
@@ -157,7 +160,8 @@ class Handler(socketserver.BaseRequestHandler):
 
				 						# In that case, a simple redirect would result in a
			
 
				 						# redirect loop. Download the URL and render it instead.
			
 
				 						request_url = match.group(1).decode('ascii', 'ignore')
			
 
				-						print('[f]', request_url)
			
 
				+						archived_url = '/'.join(request_url.split('/')[5:])
			
 
				+						print('[f]', archived_url)
			
 
				 						try:
			
 
				 							conn = urllib.request.urlopen(request_url)
			
 
				 						except urllib.error.HTTPError as e:
			
@@ -177,9 +181,9 @@ class Handler(socketserver.BaseRequestHandler):
 
				 							redirect_code = match2.group(1)
			
 
				 						else:
			
 
				 							redirect_code = 302
			
 
				-						request_url = match.group(1).decode('ascii', 'ignore')
			
 
				-						print('[r]', request_url)
			
 
				-						return self.redirect_page(http_version, request_url, redirect_code)
			
 
				+						archived_url = match.group(1).decode('ascii', 'ignore')
			
 
				+						print('[r]', archived_url)
			
 
				+						return self.redirect_page(http_version, archived_url, redirect_code)
			
 
				 
			
 
				 				# pre-toolbar scripts and CSS
			
 
				 				data = re.sub(b'<script src="//archive\.org/(?:.*)<!-- End Wayback Rewrite JS Include -->', b'', data, flags=re.S)
			
@@ -222,10 +226,10 @@ class Handler(socketserver.BaseRequestHandler):
 
				 				# fix links
			
 
				 				data = re.sub(b'//([^.]*)\.oocities\.com/', b'//\\1.geocities.com/', data, flags=re.S)
			
 
				 
			
 
				-			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type).encode('ascii', 'ignore'))
			
 
				+			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
			
 
				 			self.request.sendall(data)
			
 
				 		else: # other data
			
 
				-			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type).encode('ascii', 'ignore'))
			
 
				+			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
			
 
				 
			
 
				 			while True:
			
 
				 				data = conn.read(1024)