@@ -1,5 +1,10 @@
 #!/usr/bin/env python3
-import base64, datetime, json, lrudict, re, socket, socketserver, string, sys, threading, traceback, urllib.request, urllib.error, urllib.parse
+import base64, datetime, json, lrudict, re, socket, socketserver, string, sys, threading, time, traceback, urllib.parse
+try:
+	import urllib3
+except ImportError:
+	print('WaybackProxy now requires urllib3 to be installed. Follow setup step 3 on the readme to fix this.')
+	sys.exit(1)
 from config_handler import *
 
 class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
@@ -10,6 +15,10 @@ class SharedState:
 	"""Class for storing shared state across instances of Handler."""
 
 	def __init__(self):
+		# Create urllib3 connection pool.
+		self.http = urllib3.PoolManager(maxsize=4, block=True)
+		urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
 		# Create internal LRU dictionary for preserving URLs on redirect.
 		self.date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)
@@ -146,7 +155,7 @@ class Handler(socketserver.BaseRequestHandler):
 		# Get from the Wayback Machine.
 		_print('[>]', archived_url)
 
-		request_url = 'http://web.archive.org/web/{0}if_/{1}'.format(effective_date, archived_url)
+		request_url = 'https://web.archive.org/web/{0}if_/{1}'.format(effective_date, archived_url)
 
 		# Check Wayback Machine Availability API where applicable, to avoid archived 404 pages and other site errors.
 		if self.shared_state.availability_cache != None:
@@ -171,7 +180,8 @@ class Handler(socketserver.BaseRequestHandler):
 			else:
 				# Not in cache => contact API.
 				try:
-					availability = json.loads(urllib.request.urlopen('https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14], timeout=10).read())
+					availability_endpoint = 'https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14]
+					availability = json.loads(self.shared_state.http.request('GET', availability_endpoint, timeout=10, retries=1).data)
 					closest = availability.get('archived_snapshots', {}).get('closest', {})
 					new_date = closest.get('timestamp', None)
 				except:
@@ -194,37 +204,39 @@ class Handler(socketserver.BaseRequestHandler):
 				request_url = self.shared_state.availability_cache[availability_url] = new_url
 
 		# Start fetching the URL.
 		try:
-			conn = urllib.request.urlopen(request_url)
-		except urllib.error.HTTPError as e:
-			# An HTTP error has occurred.
-			if e.code in (403, 404): # not found
+			retry = urllib3.util.retry.Retry(total=10, connect=5, read=5, redirect=5, backoff_factor=0.5)
+			conn = self.shared_state.http.urlopen('GET', request_url, retries=retry, preload_content=False)
+		except urllib3.exceptions.MaxRetryError as e:
+			_print('[!] Fetch retries exceeded:', e.reason)
+			return self.send_error_page(http_version, 504, 'Gateway Timeout')
+		except:
+			# Some other fetch exception has occurred.
+			_print('[!] Fetch exception:')
+			traceback.print_exc()
+			return self.send_error_page(http_version, 502, 'Bad Gateway')
+
+		# Check for HTTP errors.
+		if conn.status != 200:
+			if conn.status in (403, 404): # not found
 				if self.guess_and_send_redirect(http_version, archived_url):
+					conn.release_conn()
 					return
-			elif e.code in (301, 302): # urllib-generated error about an infinite redirect loop
+			elif conn.status in (301, 302): # urllib-generated error about an infinite redirect loop
+				conn.release_conn()
 				_print('[!] Infinite redirect loop')
 				return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
 
-			if e.code != 412: # tolerance exceeded has its own error message above
-				_print('[!]', e.code, e.reason)
+			if conn.status != 412: # tolerance exceeded has its own error message above
+				_print('[!]', conn.status, conn.reason)
 
 			# If the memento Link header is present, this is a website error
 			# instead of a Wayback error. Pass it along if that's the case.
-			if 'Link' in e.headers:
-				conn = e
-			else:
-				return self.send_error_page(http_version, e.code, e.reason)
-		except socket.timeout as e:
-			# A timeout has occurred.
-			_print('[!] Fetch timeout')
-			return self.send_error_page(http_version, 504, 'Gateway Timeout')
-		except:
-			# Some other fetch exception has occurred.
-			_print('[!] Fetch exception:')
-			traceback.print_exc()
-			return self.send_error_page(http_version, 502, 'Bad Gateway')
+			if 'Link' not in conn.headers:
+				conn.release_conn()
+				return self.send_error_page(http_version, conn.status, conn.reason)
 
 		# Get content type.
-		content_type = conn.info().get('Content-Type')
+		content_type = conn.headers.get('Content-Type')
 		if content_type == None:
 			content_type = 'text/html'
 		elif not CONTENT_TYPE_ENCODING:
@@ -240,7 +252,7 @@ class Handler(socketserver.BaseRequestHandler):
 
 		# Check content type to determine if this is HTML we need to patch.
 		# Wayback will add its HTML to anything it thinks is HTML.
-		guessed_content_type = conn.info().get('X-Archive-Guessed-Content-Type')
+		guessed_content_type = conn.headers.get('X-Archive-Guessed-Content-Type')
 		if not guessed_content_type:
 			guessed_content_type = content_type
 		if 'text/html' in guessed_content_type:
@@ -249,25 +261,26 @@ class Handler(socketserver.BaseRequestHandler):
 			# portion of the URL away if it ends up being HTML consumed
 			# through the QUICK_IMAGES interface.
 			if hostname == 'web.archive.org':
-				conn.close()
+				conn.release_conn()
 				archived_url = '/'.join(request_url.split('/')[5:])
 				_print('[r] [QI]', archived_url)
 				return self.send_redirect_page(http_version, archived_url, 301)
 
 			# Check if the date is within tolerance.
 			if DATE_TOLERANCE != None:
-				match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl())
+				match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl() or '')
 				if match:
 					requested_date = match.group(1)
 					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
+						conn.release_conn()
 						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
-						conn.close()
 						if not self.guess_and_send_redirect(http_version, archived_url):
							self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
 						return
 
 		# Consume all data.
 		data = conn.read()
+		conn.release_conn()
 
 		# Patch the page.
 		if mode == 0: # Wayback Machine
@@ -290,20 +303,19 @@ class Handler(socketserver.BaseRequestHandler):
 
 		# Start fetching the URL.
 		_print('[f]', archived_url)
-		try:
-			conn = urllib.request.urlopen(request_url)
-		except urllib.error.HTTPError as e:
-			_print('[!]', e.code, e.reason)
+		conn = self.shared_state.http.urlopen('GET', request_url, retries=retry, preload_content=False)
+
+		if conn.status != 200:
+			_print('[!]', conn.status, conn.reason)
 
 			# If the memento Link header is present, this is a website error
 			# instead of a Wayback error. Pass it along if that's the case.
-			if 'Link' in e.headers:
-				conn = e
-			else:
-				return self.send_error_page(http_version, e.code, e.reason)
+			if 'Link' not in conn.headers:
+				conn.release_conn()
+				return self.send_error_page(http_version, conn.status, conn.reason)
 
 		# Identify content type so we don't modify non-HTML content.
-		content_type = conn.info().get('Content-Type')
+		content_type = conn.headers.get('Content-Type')
 		if not CONTENT_TYPE_ENCODING:
 			idx = content_type.find(';')
 			if idx > -1:
@@ -311,6 +323,7 @@ class Handler(socketserver.BaseRequestHandler):
 		if 'text/html' in content_type:
 			# Consume all data and proceed with patching the page.
 			data = conn.read()
+			conn.release_conn()
 		else:
 			# Pass non-HTML data through.
 			return self.send_passthrough(conn, http_version, content_type, request_url)
@@ -399,22 +412,16 @@ class Handler(socketserver.BaseRequestHandler):
 	def send_passthrough(self, conn, http_version, content_type, request_url):
 		"""Pass data through to the client unmodified (save for our headers)."""
 		self.send_response_headers(conn, http_version, content_type, request_url, content_length=True)
-		while True:
-			data = conn.read(1024)
-			if not data:
-				break
+		for data in conn.stream(1024):
 			self.request.sendall(data)
+		conn.release_conn()
 		self.request.close()
 
 	def send_response_headers(self, conn, http_version, content_type, request_url, content_length=False):
 		"""Generate and send the response headers."""
 
 		# Pass the HTTP version, and error code if there is one.
-		response = http_version
-		if isinstance(conn, urllib.error.HTTPError):
-			response += ' {0} {1}'.format(conn.code, conn.reason.replace('\n', ' '))
-		else:
-			response += ' 200 OK'
+		response = '{0} {1} {2}'.format(http_version, conn.status, conn.reason.replace('\n', ' '))
 
 		# Add Content-Type, Content-Length and the caching ETag.
 		response += '\r\nContent-Type: ' + content_type
@@ -422,6 +429,7 @@ class Handler(socketserver.BaseRequestHandler):
 			response += '\r\nContent-Length: ' + str(content_length)
 			content_length = False # don't pass the original length through
 		response += '\r\nETag: "' + request_url.replace('"', '') + '"'
+		response += '\r\nConnection: close' # helps with IE6 trying to use proxy keep alive and holding half-open connections
 
 		# Pass X-Archive-Orig-* (and Content-Length if requested) headers through.
 		for header in conn.headers:
@@ -457,7 +465,7 @@ class Handler(socketserver.BaseRequestHandler):
 			description = 'WaybackProxy\'s transparent mode requires an HTTP/1.1 compliant client.'
 		else: # another error
 			description = 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'
-		
+
 		# Read error page file.
 		try:
 			with open('error.html', 'r', encoding='utf8', errors='ignore') as f: