Bladeren bron

Transparent proxying support

RichardG867 5 jaren geleden
bovenliggende
commit
7aea217f29
2 gewijzigde bestanden met toevoegingen van 34 en 20 verwijderingen
  1. 3 2
      README.md
  2. 31 18
      waybackproxy.py

+ 3 - 2
README.md

@@ -1,6 +1,6 @@
 # WaybackProxy
 
-WaybackProxy is a HTTP proxy that sends all requests through the [Internet Archive Wayback Machine](http://web.archive.org) and [OoCities](http://www.oocities.org), returning the original antique-browser-friendly markup.
+WaybackProxy is an HTTP proxy that sends all requests through the [Internet Archive Wayback Machine](http://web.archive.org) and [OoCities](http://www.oocities.org), returning the original retro-browser-friendly markup.
 
 ![1999 Google viewed on Internet Explorer 4.0 on Windows 95](http://i.imgur.com/tXsLc6O.png)
 
@@ -8,9 +8,10 @@ WaybackProxy is a HTTP proxy that sends all requests through the [Internet Archi
 
 1. Edit `config.py` to your liking
 2. Start `waybackproxy.py`
-3. Set up your antique browser:
+3. Set up your retro browser:
 	* If your browser supports proxy auto-configuration, set the auto-configuration URL to `http://ip:port/proxy.pac` where `ip` is the IP of the system running WaybackProxy and `port` is the proxy's port (8888 by default).
 	* If proxy auto-configuration is not supported or fails to work, set the browser to use an HTTP proxy at that IP and port instead.
+	* Transparent proxying is also supported for advanced users. No configuration on WaybackProxy's end is required. Client machines must be pointed at a dummy DNS server that resolves every hostname to the proxy's IP so they can find the proxy; `dnsmasq -A "/#/ip"` (with `ip` replaced by that IP) is a good choice.
 4. Try it out! You can edit most settings that are in `config.py` by browsing to http://web.archive.org while on the proxy, although you must edit `config.py` to make them permanent.
 5. Press Ctrl+C to stop
 

+ 31 - 18
waybackproxy.py

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import base64, datetime, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
 from config import *
 
@@ -27,20 +27,8 @@ class Handler(socketserver.BaseRequestHandler):
 			# only GET is implemented
 			return self.error_page(http_version, 501, 'Not Implemented')
 		
-		# parse the URL
-		request_url = archived_url = split[1]
-		parsed = urllib.parse.urlparse(request_url)
-		
-		# make a path
-		path = parsed.path
-		if parsed.query != '': path += '?' + parsed.query
-		if path == '': path == '/'
-		
-		# get the hostname for later
-		host = parsed.netloc.split(':')
-		hostname = host[0]
-		
-		# read out the headers, saving the PAC file host
+		# read out the headers
+		request_host = None
 		pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
 		effective_date = DATE
 		auth = None
@@ -48,7 +36,7 @@ class Handler(socketserver.BaseRequestHandler):
 			line = f.readline()
 			ll = line.lower()
 			if ll[:6] == 'host: ':
-				pac_host = line[6:].rstrip('\r\n')
+				pac_host = request_host = line[6:].rstrip('\r\n')
 				if ':' not in pac_host: # who would run this on port 80 anyway?
 					pac_host += ':80'
 			elif ll[:21] == 'x-waybackproxy-date: ':
@@ -57,9 +45,32 @@ class Handler(socketserver.BaseRequestHandler):
 			elif ll[:21] == 'authorization: basic ':
 				# asset date code passed as username:password
 				auth = base64.b64decode(ll[21:])
-		original_date = effective_date
+
+		# parse the URL
+		pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
+		if split[1][0] == '/' and split[1] not in pac_file_paths:
+			# just a path (not corresponding to a PAC file) => transparent proxy
+			# Host header and therefore HTTP/1.1 are required
+			if not request_host:
+				return self.error_page(http_version, 400, 'Host header missing')
+			archived_url = 'http://' + request_host + split[1]
+		else:
+			# full URL => explicit proxy
+			archived_url = split[1]
+		request_url = archived_url
+		parsed = urllib.parse.urlparse(request_url)
+		
+		# make a path
+		path = parsed.path
+		if parsed.query != '': path += '?' + parsed.query
+		if path == '': path == '/'
+		
+		# get the hostname for later
+		host = parsed.netloc.split(':')
+		hostname = host[0]
 
 		# get cached date for redirects, if available
+		original_date = effective_date
 		effective_date = date_cache.get(effective_date + '\x00' + archived_url, effective_date)
 
 		# get date from username:password, if available
@@ -67,7 +78,7 @@ class Handler(socketserver.BaseRequestHandler):
 			effective_date = auth.replace(':', '')
 
 		try:
-			if path in ('/proxy.pac', '/wpad.dat', '/wpad.da'):
+			if path in pac_file_paths:
 				# PAC file to bypass QUICK_IMAGES requests
 				pac  = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n'''
 				pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
@@ -332,6 +343,8 @@ class Handler(socketserver.BaseRequestHandler):
 			errorpage += 'WaybackProxy only implements the GET method.'
 		elif code == 412: # outside of tolerance
 			errorpage += 'The earliest snapshot for this page is outside of the configured tolerance interval.'
+		elif code == 400 and reason == 'Host header missing': # no host header in transparent mode
+			errorpage += 'WaybackProxy\'s transparent mode requires an HTTP/1.1 compliant client.'
 		else: # another error
 			errorpage += 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'