Răsfoiți Sursa

Add ETag header for improved client-side caching

RichardG867 5 ani în urmă
părinte
commit
add5f0894a
1 fișier modificat, cu 18 adăugiri și 14 ștergeri
  1. 18 14
      waybackproxy.py

+ 18 - 14
waybackproxy.py

@@ -25,7 +25,7 @@ class Handler(socketserver.BaseRequestHandler):
 			return self.error_page(http_version, 501, 'Not Implemented')
 		
 		# parse the URL
-		request_url = split[1]
+		request_url = archived_url = split[1]
 		parsed = urllib.parse.urlparse(request_url)
 		
 		# make a path
@@ -83,7 +83,6 @@ class Handler(socketserver.BaseRequestHandler):
 
 					# did we get an username:password with an asset date code?
 					if auth:
-						archived_url = request_url
 						request_url = 'http://web.archive.org/web/{0}/{1}'.format(auth.replace(':', ''), archived_url)
 					else:
 						archived_url = '/'.join(request_url.split('/')[5:])
@@ -99,22 +98,26 @@ class Handler(socketserver.BaseRequestHandler):
 							raise e
 			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
 				# apply GEOCITIES_FIX and pass it through
-				split = request_url.split('/')
+				_print('[>] {0}'.format(archived_url))
+
+				split = archived_url.split('/')
 				hostname = split[2] = 'www.oocities.org'
 				request_url = '/'.join(split)
 				
-				_print('[>] {0}'.format(request_url))
 				conn = urllib.request.urlopen(request_url)
 			else:
 				# get from Wayback
-				_print('[>] {0}'.format(request_url))
-				conn = urllib.request.urlopen('http://web.archive.org/web/{0}/{1}'.format(DATE, request_url))
+				_print('[>] {0}'.format(archived_url))
+
+				request_url = 'http://web.archive.org/web/{0}/{1}'.format(DATE, archived_url)
+
+				conn = urllib.request.urlopen(request_url)
 		except urllib.error.HTTPError as e:
 			# an error has been found
 
 			# 403 or 404 => heuristically determine the static URL for some redirect scripts
 			if e.code in (403, 404):
-				match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to)?(?:url)?=(http[^&]+)''', request_url, re.IGNORECASE)
+				match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to)?(?:url)?=(http[^&]+)''', archived_url, re.IGNORECASE)
 				if match:
 					# we found it
 					new_url = urllib.parse.unquote_plus(match.group(1))
@@ -140,7 +143,7 @@ class Handler(socketserver.BaseRequestHandler):
 			# through the QUICK_IMAGES interface.
 			if hostname == 'web.archive.org':
 				conn.close()
-				return self.redirect_page(http_version, '/'.join(request_url.split('/')[5:]), 301)
+				return self.redirect_page(http_version, '/'.join(archived_url.split('/')[5:]), 301)
 
 			# consume all data
 			data = conn.read()
@@ -157,7 +160,8 @@ class Handler(socketserver.BaseRequestHandler):
 						# In that case, a simple redirect would result in a
 						# redirect loop. Download the URL and render it instead.
 						request_url = match.group(1).decode('ascii', 'ignore')
-						print('[f]', request_url)
+						archived_url = '/'.join(request_url.split('/')[5:])
+						print('[f]', archived_url)
 						try:
 							conn = urllib.request.urlopen(request_url)
 						except urllib.error.HTTPError as e:
@@ -177,9 +181,9 @@ class Handler(socketserver.BaseRequestHandler):
 							redirect_code = match2.group(1)
 						else:
 							redirect_code = 302
-						request_url = match.group(1).decode('ascii', 'ignore')
-						print('[r]', request_url)
-						return self.redirect_page(http_version, request_url, redirect_code)
+						archived_url = match.group(1).decode('ascii', 'ignore')
+						print('[r]', archived_url)
+						return self.redirect_page(http_version, archived_url, redirect_code)
 
 				# pre-toolbar scripts and CSS
 				data = re.sub(b'<script src="//archive\.org/(?:.*)<!-- End Wayback Rewrite JS Include -->', b'', data, flags=re.S)
@@ -222,10 +226,10 @@ class Handler(socketserver.BaseRequestHandler):
 				# fix links
 				data = re.sub(b'//([^.]*)\.oocities\.com/', b'//\\1.geocities.com/', data, flags=re.S)
 
-			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type).encode('ascii', 'ignore'))
+			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
 			self.request.sendall(data)
 		else: # other data
-			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type).encode('ascii', 'ignore'))
+			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
 
 			while True:
 				data = conn.read(1024)