
Rebase the much better HTML patching code I had stuck in a private fork

RichardG867, 5 years ago
Commit 413a67f763
2 files changed, 123 insertions(+), 91 deletions(-)
  config.py        +3    -1
  waybackproxy.py  +120  -90

config.py (+3 -1)

@@ -9,7 +9,9 @@ GEOCITIES_FIX = True
 
 # Use the Wayback-tampered URL as a shortcut when loading images.
 # May result in faster loads, but all images will point to 
-# http://web.archive.org/... as a result.
+# http://web.archive.org/... as a result. Set this value to 2 to enable an
+# experimental mode using username:password pairs on top of the original URLs
+# instead (not supported by IE6 and some other browsers).
 QUICK_IMAGES = True
 
 # Allow the Content-Type header to contain an encoding. Some old browsers
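
A minimal sketch of how the two modes rewrite an archived asset URL, assuming the regex added to waybackproxy.py below; the sample URL and datecode are hypothetical:

    import re

    # Illustration only: how the two QUICK_IMAGES modes rewrite an archived
    # asset URL. The pattern mirrors the one added to waybackproxy.py below;
    # the sample URL is made up.
    line = b'<img src="/web/20010203120000im_/http://example.com/pic.gif">'
    pattern = rb'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:]+)://'

    # QUICK_IMAGES = 1: point the asset straight at web.archive.org
    print(re.sub(pattern, rb'http://web.archive.org/web/\1\2/\3://', line))
    # b'<img src="http://web.archive.org/web/20010203120000im_/http://example.com/pic.gif">'

    # QUICK_IMAGES = 2: keep the original host, pass the datecode as username:password
    print(re.sub(pattern, rb'\3://\1:\2@', line))
    # b'<img src="http://20010203120000:im_@example.com/pic.gif">'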

waybackproxy.py (+120 -90)

@@ -39,18 +39,23 @@ class Handler(socketserver.BaseRequestHandler):
 		
 		# read out the headers, saving the PAC file host
 		pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
+		auth = None
 		while line.rstrip('\r\n') != '':
 			line = f.readline()
-			if line[:6].lower() == 'host: ':
+			ll = line.lower()
+			if ll[:6] == 'host: ':
 				pac_host = line[6:].rstrip('\r\n')
 				if ':' not in pac_host: # who would run this on port 80 anyway?
 					pac_host += ':80'
-			elif line[:21].lower() == 'x-waybackproxy-date: ':
+			elif ll[:21] == 'x-waybackproxy-date: ':
 				# API for a personal project of mine
 				new_date = line[21:].rstrip('\r\n')
 				if DATE != new_date:
 					DATE = new_date
 					print('[-] Header requested date', DATE)
+			elif ll[:21] == 'authorization: basic ':
+				# asset datecode passed as username:password
+				auth = base64.b64decode(line[21:]).decode('ascii', 'ignore')
 		
 		try:
 			if path == '/proxy.pac':
@@ -68,15 +73,30 @@ class Handler(socketserver.BaseRequestHandler):
 				pac += b'''}\r\n'''
 				self.request.sendall(pac)
 				return
-			elif hostname == 'web.archive.org':
+			elif hostname == 'web.archive.org' or auth:
 				if path[:5] != '/web/':
 					# launch settings
 					return self.handle_settings(parsed.query)
 				else:
 					# pass-through requests to web.archive.org
 					# required for QUICK_IMAGES
-					_print('[>] [QI] {0}'.format('/'.join(request_url.split('/')[5:])))
-					conn = urllib.request.urlopen(request_url)
+
+					# did we get a username:password pair with an asset datecode?
+					if auth:
+						archived_url = request_url
+						request_url = 'http://web.archive.org/web/{0}/{1}'.format(auth.replace(':', ''), archived_url)
+					else:
+						archived_url = '/'.join(request_url.split('/')[5:])
+
+					_print('[>] [QI] {0}'.format(archived_url))
+					try:
+						conn = urllib.request.urlopen(request_url)
+					except urllib.error.HTTPError as e:
+						if e.code == 404:
+							# Try this file on another date, might be redundant
+							return self.redirect_page(http_version, archived_url)
+						else:
+							raise e
 			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
 				# apply GEOCITIES_FIX and pass it through
 				split = request_url.split('/')
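
A sketch of the resulting QUICK_IMAGES=2 round trip handled by the two hunks above, assuming a browser that sends standard Basic credentials for the rewritten URL; the URL and datecode are hypothetical:

    import base64

    # A browser fetching http://20010203120000:im_@example.com/pic.gif sends:
    header = 'Authorization: Basic ' + base64.b64encode(b'20010203120000:im_').decode('ascii')

    # The handler recovers the datecode and rebuilds the Wayback URL, as above:
    auth = base64.b64decode(header[21:]).decode('ascii', 'ignore')  # '20010203120000:im_'
    request_url = 'http://web.archive.org/web/{0}/{1}'.format(
        auth.replace(':', ''), 'http://example.com/pic.gif')
    # 'http://web.archive.org/web/20010203120000im_/http://example.com/pic.gif'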
@@ -106,97 +126,107 @@ class Handler(socketserver.BaseRequestHandler):
 		
 		# get content type
 		content_type = conn.info().get('Content-Type')
+		if content_type == None: content_type = 'text/html'
 		if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
 		
-		# send headers		
-		self.request.sendall(http_version.encode('ascii', 'ignore') + b' 200 OK\r\nContent-Type: ' + content_type.encode('ascii', 'ignore') + b'\r\n\r\n')
-		
 		# set the mode: [0]wayback [1]oocities
 		mode = 0
 		if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1
 		
-		if content_type[:9] == 'text/html' in content_type: # HTML
-			toolbar = mode == 1 # oocities header starts without warning
-			redirect_page = False
-			for line in conn:
-				line = line.rstrip(b'\r\n')
-				
-				if mode == 0:
-					if toolbar:
-						for delimiter in (b'<\!-- END WAYBACK TOOLBAR INSERT -->', b'<\!-- End Wayback Rewrite JS Include -->'):
-							if re.search(delimiter, line):
-								# toolbar is done - resume relaying on the next line
-								toolbar = False
-								line = re.sub(delimiter, b'', line)
-								break
-						if toolbar: continue
-					elif redirect_page:
-						# this is a really bad way to deal with Wayback's 302
-						# pages, but necessary with the way this proxy works
-						match = re.search(b'<p class="impatient"><a href="/web/(?:[^/]+)/([^"]+)">Impatient\\?</a></p>', line)
-						if match:
-							line  = b'<title>WaybackProxy Redirect</title><meta http-equiv="refresh" content="0;url='
-							line += match.group(1)
-							line += b'"></head><body>If you are not redirected, <a href="'
-							line += match.group(1)
-							line += b'">click here</a>.</body></html>'
-							self.request.sendall(line)
-							break
-						continue
-					
-					if b'<base ' in line.lower():
-						# fix base
-						line = re.sub(b'(?:http://web\.archive\.org)?/web/([0-9]+)/', b'', line)
-					elif line == b'\t\t<title>Internet Archive Wayback Machine</title>':
-						# redirect 302s - see the redirect_page code above
-						redirect_page = True
-						continue
-					else:
-						for delimiter in (
-							b'<\!-- BEGIN WAYBACK TOOLBAR INSERT -->',
-							b'<script src="//archive\.org/([^"]+)" type="text/javascript"></script>'
-						):
-							if re.search(delimiter, line):
-								# remove the toolbar - stop relaying from now on
-								toolbar = True
-								line = re.sub(delimiter, b'', line)
-								break
-					
-					if QUICK_IMAGES:
-						# QUICK_IMAGES works by intercepting asset URLs (those
-						# with a date code ending in im_, js_...) and letting the
-						# proxy pass them through. This may reduce load time
-						# because Wayback doesn't have to hunt down the closest
-						# copy of that asset to DATE, as those URLs have specific
-						# date codes. The only side effect is tainting the HTML
-						# with web.archive.org URLs.
-						line = re.sub(b'(?:http://web.archive.org)?/web/([0-9]+)([a-z]+_)/',
-							b'http://web.archive.org/web/\\1\\2/', line)
-						line = re.sub(b'(?:http://web.archive.org)?/web/([0-9]+)/', b'', line)
-					else:
-						line = re.sub(b'(?:http://web.archive.org)?/web/([^/]+)/', b'', line)
-				elif mode == 1:
-					# remove the geocities/oocities-added code, which is
-					# conveniently wrapped around comments
-					if toolbar:
-						if line in (
-							b'<!-- text above generated by server. PLEASE REMOVE -->',
-							b'<!-- preceding code added by server. PLEASE REMOVE -->'
-						):
-							toolbar = False
-						continue
-					elif line == b'<!-- following code added by server. PLEASE REMOVE -->' \
-					or line[:54] == b'<!-- text below generated by server. PLEASE REMOVE -->':
-						toolbar = True
-						continue
-					
-					# taint? what taint?
-					line = line.replace(b'http://oocities.com', b'http://geocities.com')
-					line = line.replace(b'http://www.oocities.com', b'http://www.geocities.com')
-				
-				self.request.sendall(line)
-				self.request.sendall(b'\r\n')
+		if 'text/html' in content_type: # HTML
+			# Some dynamically generated links may end up pointing to
+			# web.archive.org. If HTML ends up being consumed through the
+			# QUICK_IMAGES interface, correct that by redirecting back to
+			# the original URL with the Wayback portion stripped.
+			if hostname == 'web.archive.org':
+				conn.close()
+				return self.redirect_page(http_version, '/'.join(request_url.split('/')[5:]), 301)
+
+			# consume all data
+			data = conn.read()
+
+			# patch the page
+			if mode == 0: # wayback
+				if b'<title>Wayback Machine</title>' in data:
+					match = re.search(b'<iframe id="playback" src="((?:(?:http(?:s)?:)?//web.archive.org)?/web/[^"]+)"', data)
+					if match:
+						# media playback iframe
+
+						# Some websites (especially ones that use frames)
+						# inexplicably render inside a media playback iframe.
+						# In that case, a simple redirect would result in a
+						# redirect loop. Download the URL and render it instead.
+						new_url = match.group(1).decode('ascii', 'ignore')
+						print('[f]', new_url)
+						try:
+							conn = urllib.request.urlopen(new_url)
+						except urllib.error.HTTPError as e:
+							_print('[!]', e.code, e.reason)
+							return self.error_page(http_version, e.code, e.reason)
+
+						content_type = conn.info().get('Content-Type')
+						if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
+						data = conn.read()
+
+				if b'<title></title>' in data and b'<h1><span>Internet Archive\'s Wayback Machine</span></h1>' in data:
+					match = re.search(b'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/(?:[^/]+)/([^"]+)">Impatient\?</a></p>', data)
+					if match:
+						# wayback redirect page, follow it
+						match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
+						if match2:
+							redirect_code = int(match2.group(1))
+						else:
+							redirect_code = 302
+						new_url = match.group(1).decode('ascii', 'ignore')
+						print('[r]', new_url)
+						return self.redirect_page(http_version, new_url, redirect_code)
+
+				# pre-toolbar scripts and CSS
+				data = re.sub(b'<script src="//archive\.org/(?:.*)<!-- End Wayback Rewrite JS Include -->', b'', data, flags=re.S)
+				# toolbar
+				data = re.sub(b'<!-- BEGIN WAYBACK TOOLBAR INSERT -->(?:.*)<!-- END WAYBACK TOOLBAR INSERT -->', b'', data, flags=re.S)
+				# comments on footer
+				data = re.sub(b'\n<!--\n     FILE ARCHIVED (?:.*)$', b'', data, flags=re.S)
+				# fix base tag
+				data = re.sub(b'(<base (?:[^>]*)href=(?:["\'])?)(?:(?:http(?:s)?:)?//web.archive.org)?/web/(?:[^/]+)/', b'\\1', data, flags=re.I + re.S)
+
+				# remove extraneous :80 from links
+				data = re.sub(b'((?:(?:http(?:s)?:)?//web.archive.org)?/web/)([^/]+)/([^:]+)://([^:]+):80/', b'\\1\\2/\\3://\\4/', data)
+				# fix links
+				if QUICK_IMAGES:
+					# QUICK_IMAGES works by intercepting asset URLs (those
+					# with a date code ending in im_, js_...) and letting the
+					# proxy pass them through. This may reduce load time
+					# because Wayback doesn't have to hunt down the closest
+					# copy of that asset to DATE, as those URLs have specific
+					# date codes. This taints the HTML with web.archive.org
+					# URLs. QUICK_IMAGES=2 uses the original URLs with an added
+					# username:password, which taints less but is not supported
+					# by all browsers - IE6 notably kills the whole page if it
+					# sees an iframe pointing to an invalid URL.
+					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:]+)://',
+						QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
+					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)
+				else:
+					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
+			elif mode == 1: # oocities
+				# viewport/cache-control/max-width code (header)
+				data = re.sub(b'^(?:.*?)\n\n', b'', data, flags=re.S)
+				# archive notice and tracking code (footer)
+				data = re.sub(b'<style> \n.zoomout { -webkit-transition: (?:.*)$', b'', data, flags=re.S)
+				# clearly labeled snippets from Geocities
+				data = re.sub(b'^(?:.*)<\!-- text above generated by server\. PLEASE REMOVE -->', b'', data, flags=re.S)
+				data = re.sub(b'<\!-- following code added by server\. PLEASE REMOVE -->(?:.*)<\!-- preceding code added by server\. PLEASE REMOVE -->', b'', data, flags=re.S)
+				data = re.sub(b'<\!-- text below generated by server\. PLEASE REMOVE -->(?:.*)$', b'', data, flags=re.S)
+
+				# fix links
+				data = re.sub(b'//([^.]*)\.oocities\.com/', b'//\\1.geocities.com/', data, flags=re.S)
+
+			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type).encode('ascii', 'ignore'))
+			self.request.sendall(data)
 		else: # other data
+			self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type).encode('ascii', 'ignore'))
+
 			while True:
 				data = conn.read(1024)
 				if not data: break
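
A sketch of the redirect-page detection added earlier in this hunk; the HTML below is a trimmed approximation of Wayback's interstitial page, not a captured response:

    import re

    data = (b'<title></title>\n'
            b'<h1><span>Internet Archive\'s Wayback Machine</span></h1>\n'
            b'<p class="code shift red">Got an HTTP 301 response at crawl time</p>\n'
            b'<p class="impatient"><a href="//web.archive.org/web/20010203120000/'
            b'http://example.com/new/">Impatient?</a></p>\n')

    target = re.search(rb'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/(?:[^/]+)/([^"]+)">Impatient\?</a></p>', data)
    code = re.search(rb'<p class="code shift red">Got an HTTP ([0-9]+)', data)
    print(target.group(1), int(code.group(1)))  # b'http://example.com/new/' 301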
@@ -239,7 +269,7 @@ class Handler(socketserver.BaseRequestHandler):
 		redirectpage += '">click here</a>.</p></body></html>'
 
 		# send redirect page and stop
-		self.request.sendall('{0} {1} Found\r\nLocation: {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n'.format(http_version, code, target, len(redirectpage), redirectpage).encode('utf8', 'ignore'))
+		self.request.sendall('{0} {1} Found\r\nLocation: {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, target, len(redirectpage), redirectpage).encode('utf8', 'ignore'))
 		self.request.close()
 	
 	def handle_settings(self, query):