%!s(int64=10) %!d(string=hai) anos · 95a2cd5b2a
--- a/README.md
+++ b/README.md
@@ -1,2 +1,13 @@
 
				 # WaybackProxy
			
 
				-HTTP proxy for tunneling requests through the Internet Archive Wayback Machine
			
 
				+
			
 
				+WaybackProxy is a HTTP proxy that sends all requests through the [Internet Archive Wayback Machine](http://web.archive.org) and [OoCities](http://www.oocities.org), returning the original antique-browser-friendly markup.
			
 
				+
			
 
				+![1999 Google viewed on Internet Explorer 4.0 on Windows 95](http://i.imgur.com/tXsLc6O.png)
			
 
				+
			
 
				+## Setup
			
 
				+
			
 
				+1. Edit `settings.py`
			
 
				+2. Start `waybackproxy.py`
			
 
				+3. Set your antique browser to use a HTTP proxy at the IP and port WaybackProxy is listening on
			
 
				+4. Try it out. You can edit most settings that are in `settings.py` by going to http://web.archive.org while on the proxy!
			
 
				+5. Press Ctrl+C to stop
			
--- a/settings.py
+++ b/settings.py
@@ -0,0 +1,18 @@
 
				+# Listen port for the HTTP proxy
			
 
				+LISTEN_PORT = 8888
			
 
				+
			
 
				+# Date to get pages from Wayback (YYYY, YYYYMM or YYYYMMDD)
			
 
				+DATE = '1998'
			
 
				+
			
 
				+# Send Geocities requests to oocities.org
			
 
				+GEOCITIES_FIX = True
			
 
				+
			
 
				+# Use the Wayback-tampered URL as a shortcut when loading images.
			
 
				+# May result in faster loads, but all images will point to 
			
 
				+# http://web.archive.org/... as a result.
			
 
				+QUICK_IMAGES = True
			
 
				+
			
 
				+# Allow the Content-Type header to contain en encoding. Some old browsers
			
 
				+# (Mosaic?) don't understand that and fail to load anything - set this to
			
 
				+# False if you're using one of them.
			
 
				+CONTENT_TYPE_ENCODING = True
			
--- a/waybackproxy.py
+++ b/waybackproxy.py
@@ -0,0 +1,228 @@
 
				+#!/usr/bin/env python
			
 
				+import re, socket, SocketServer, urllib2, urlparse
			
 
				+from settings import *
			
 
				+
			
 
				+class ThreadingTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
			
 
				+	"""TCPServer with ThreadingMixIn added."""
			
 
				+	pass
			
 
				+
			
 
				+class Handler(SocketServer.BaseRequestHandler):
			
 
				+	"""Main request handler."""
			
 
				+	def handle(self):
			
 
				+		"""Handle a request."""
			
 
				+		
			
 
				+		# readline is pretty convenient
			
 
				+		f = self.request.makefile()
			
 
				+		
			
 
				+		# read request line
			
 
				+		reqline = line = f.readline()
			
 
				+		split = line.rstrip('\r\n').split(' ')
			
 
				+		http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
			
 
				+		
			
 
				+		if split[0] != 'GET':
			
 
				+			# only GET is implemented
			
 
				+			return self.error_page(http_version, 501, 'Not Implemented')
			
 
				+		
			
 
				+		# parse the URL
			
 
				+		request_url = split[1]
			
 
				+		parsed = urlparse.urlparse(request_url)
			
 
				+		
			
 
				+		# make a path
			
 
				+		path = parsed.path
			
 
				+		if parsed.query != '': path += '?' + parsed.query
			
 
				+		if path == '': path == '/'
			
 
				+		
			
 
				+		# get the hostname for later
			
 
				+		host = parsed.netloc.split(':')
			
 
				+		hostname = host[0]
			
 
				+		
			
 
				+		# read out the headers
			
 
				+		while line.rstrip('\r\n') != '':
			
 
				+			line = f.readline()
			
 
				+		
			
 
				+		try:
			
 
				+			if hostname == 'web.archive.org':
			
 
				+				if path[:5] != '/web/':
			
 
				+					# launch settings
			
 
				+					return self.handle_settings(parsed.query)
			
 
				+				else:
			
 
				+					# pass-through requests to web.archive.org
			
 
				+					# required for QUICK_IMAGES
			
 
				+					print '[>] [QI] {0}'.format('/'.join(request_url.split('/')[5:]))
			
 
				+					conn = urllib2.urlopen(request_url)
			
 
				+			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
			
 
				+				# apply GEOCITIES_FIX and pass it through
			
 
				+				split = request_url.split('/')
			
 
				+				hostname = split[2] = 'www.oocities.org'
			
 
				+				request_url = '/'.join(split)
			
 
				+				
			
 
				+				print '[>] {0}'.format(request_url)
			
 
				+				conn = urllib2.urlopen(request_url)
			
 
				+			else:
			
 
				+				# get from Wayback
			
 
				+				print '[>] {0}'.format(request_url)
			
 
				+				conn = urllib2.urlopen('http://web.archive.org/web/{0}/{1}'.format(DATE, request_url))
			
 
				+		except urllib2.HTTPError as e:
			
 
				+			# an error has been found
			
 
				+			print '[!] {0} {1}'.format(e.code, e.reason)
			
 
				+			return self.error_page(http_version, e.code, e.reason)
			
 
				+		
			
 
				+		# get content type
			
 
				+		content_type = conn.info().getheader('Content-Type')
			
 
				+		if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
			
 
				+		
			
 
				+		# send headers		
			
 
				+		self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type))
			
 
				+		
			
 
				+		# set the mode: [0]wayback [1]oocities
			
 
				+		mode = 0
			
 
				+		if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1
			
 
				+		
			
 
				+		if content_type[:9] == 'text/html' in content_type: # HTML
			
 
				+			toolbar = mode == 1 # oocities header starts without warning
			
 
				+			after_header = False
			
 
				+			redirect_page = False
			
 
				+			for line in conn:
			
 
				+				line = line.rstrip('\r\n')
			
 
				+				
			
 
				+				if mode == 0:
			
 
				+					if toolbar:
			
 
				+						if line == '<!-- END WAYBACK TOOLBAR INSERT -->':
			
 
				+							# toolbar is done - resume relaying on the next line
			
 
				+							toolbar = False
			
 
				+							after_header = True
			
 
				+						continue
			
 
				+					elif redirect_page:
			
 
				+						# this is a really bad way to deal with Wayback's 302
			
 
				+						# pages, but necessary with the way this proxy works
			
 
				+						match = re.search('<p class="impatient"><a href="/web/(?:[^/]+)/([^"]+)">Impatient\\?</a></p>', line)
			
 
				+						if match:
			
 
				+							line = '<title>WaybackProxy Redirect</title><meta http-equiv="refresh" content="0;url='
			
 
				+							line += match.group(1)
			
 
				+							line += '"></head><body>If you are not redirected, <a href="'
			
 
				+							line += match.group(1)
			
 
				+							line += '">click here</a>.</body></html>'
			
 
				+							self.request.sendall(line)
			
 
				+							break
			
 
				+						continue
			
 
				+					
			
 
				+					if not after_header:
			
 
				+						ll = line.lower()
			
 
				+						if line == '<script type="text/javascript" src="/static/js/analytics.js"></script>' or line == '<link type="text/css" rel="stylesheet" href="/static/css/banner-styles.css"/>' or line[:69] == '<script type="text/javascript">archive_analytics.values.server_name="':
			
 
				+							# remove the CSS and tracking scripts added to <head>
			
 
				+							continue
			
 
				+						elif ll[:6] == '<base ':
			
 
				+							# fix base
			
 
				+							line = re.sub('/web/([0-9]+)/', '', line)
			
 
				+					if line == '<!-- BEGIN WAYBACK TOOLBAR INSERT -->':
			
 
				+						# remove the toolbar - stop relaying from now on
			
 
				+						toolbar = True
			
 
				+						continue
			
 
				+					elif line == '\t\t<title>Internet Archive Wayback Machine</title>':
			
 
				+						# redirect 302s - see the redirect_page code above
			
 
				+						redirect_page = True
			
 
				+						continue
			
 
				+					
			
 
				+					if QUICK_IMAGES:
			
 
				+						# QUICK_IMAGES works by intercepting asset URLs (those
			
 
				+						# with a date code ending in im_, js_...) and letting the
			
 
				+						# proxy pass them through. This may reduce load time
			
 
				+						# because Wayback doesn't have to hunt down the closest
			
 
				+						# copy of that asset to DATE, as those URLs have specific
			
 
				+						# date codes. The only side effect is tainting the HTML
			
 
				+						# with web.archive.org URLs.
			
 
				+						line = re.sub('/web/([0-9]+)([a-z]+_)/',
			
 
				+							'http://web.archive.org/web/\\1\\2/', line)
			
 
				+						line = re.sub('/web/([0-9]+)/', '', line)
			
 
				+					else:
			
 
				+						line = re.sub('/web/([^/]+)/', '', line)
			
 
				+				elif mode == 1:
			
 
				+					# remove the geocities/oocities-added code, which is
			
 
				+					# conveniently wrapped around comments
			
 
				+					if toolbar:
			
 
				+						if line in ['<!-- text above generated by server. PLEASE REMOVE -->', '<!-- preceding code added by server. PLEASE REMOVE -->']:
			
 
				+							toolbar = False
			
 
				+						continue
			
 
				+					elif line == '<!-- following code added by server. PLEASE REMOVE -->' or line[:54] == '<!-- text below generated by server. PLEASE REMOVE -->':
			
 
				+						toolbar = True
			
 
				+						continue
			
 
				+					
			
 
				+					# taint? what taint?
			
 
				+					line = line.replace('http://oocities.com', 'http://geocities.com')
			
 
				+					line = line.replace('http://www.oocities.com', 'http://www.geocities.com')
			
 
				+				
			
 
				+				self.request.sendall(line)
			
 
				+				self.request.sendall('\r\n')
			
 
				+		else: # other data
			
 
				+			while True:
			
 
				+				data = conn.read(1024)
			
 
				+				if data == '': break
			
 
				+				self.request.sendall(data)
			
 
				+		
			
 
				+		self.request.close()
			
 
				+	
			
 
				+	def error_page(self, http_version, code, reason):
			
 
				+		"""Generate an error page."""
			
 
				+		
			
 
				+		# make error page
			
 
				+		errorpage = '<html><head><title>{0} {1}</title></head><body><h1>{1}</h1><p>'.format(code, reason)
			
 
				+		
			
 
				+		# add code information
			
 
				+		if code == 404: # page not archived
			
 
				+			errorpage += 'This page may not be archived by the Wayback Machine.'
			
 
				+		elif code == 403: # not crawled due to robots.txt
			
 
				+			errorpage += 'This page was not archived due to a robots.txt block.'
			
 
				+		elif code == 501: # method not implemented
			
 
				+			errorpage += 'WaybackProxy only implements the GET method.'
			
 
				+		else: # another error
			
 
				+			errorpage += 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'
			
 
				+		
			
 
				+		errorpage += '</p><hr><i>{0}</i></body></html>'.format(self.signature())
			
 
				+		
			
 
				+		# send error page and stop
			
 
				+		self.request.sendall('{0} {1} {2}\r\nContent-Length: {3}\r\n\r\n'.format(http_version, code, reason, len(errorpage)))
			
 
				+		self.request.sendall(errorpage)
			
 
				+		self.request.close()
			
 
				+	
			
 
				+	def handle_settings(self, query):
			
 
				+		"""Generate the settings page."""
			
 
				+	
			
 
				+		global DATE, GEOCITIES_FIX, QUICK_IMAGES, CONTENT_TYPE_ENCODING
			
 
				+		
			
 
				+		if query != '': # handle any parameters that may have been sent
			
 
				+			parsed = urlparse.parse_qs(query)
			
 
				+			
			
 
				+			if 'date' in parsed: DATE = parsed['date'][0]
			
 
				+			GEOCITIES_FIX = 'gcFix' in parsed
			
 
				+			QUICK_IMAGES = 'quickImages' in parsed
			
 
				+			CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed
			
 
				+		
			
 
				+		# send the page and stop
			
 
				+		self.request.sendall('HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n')
			
 
				+		self.request.sendall('<html><head><title>WaybackProxy Settings</title></head><body><p><b>')
			
 
				+		self.request.sendall(self.signature())
			
 
				+		self.request.sendall('</b></p><form method="get" action="/"><p>Date to get pages from: <input type="text" name="date" size="8" value="')
			
 
				+		self.request.sendall(DATE)
			
 
				+		self.request.sendall('"><br><input type="checkbox" name="gcFix"')
			
 
				+		if GEOCITIES_FIX: self.request.sendall(' checked')
			
 
				+		self.request.sendall('> Geocities Fix<br><input type="checkbox" name="quickImages"')
			
 
				+		if QUICK_IMAGES: self.request.sendall(' checked')
			
 
				+		self.request.sendall('> Quick images<br><input type="checkbox" name="ctEncoding"')
			
 
				+		if CONTENT_TYPE_ENCODING: self.request.sendall(' checked')
			
 
				+		self.request.sendall('> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>')
			
 
				+		self.request.close()
			
 
				+	
			
 
				+	def signature(self):
			
 
				+		"""Return the server signature."""
			
 
				+		return 'WaybackProxy on {0}'.format(socket.gethostname())
			
 
				+
			
 
				+def main():
			
 
				+	"""Starts the server."""
			
 
				+	server = ThreadingTCPServer(('', 8888), Handler)
			
 
				+	try:
			
 
				+		server.serve_forever()
			
 
				+	except KeyboardInterrupt: # Ctrl+C to stop
			
 
				+		pass
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+	main()