#!/usr/bin/env python
import base64, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
from config import *


class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCPServer with ThreadingMixIn added.

    The class adds no behaviour of its own: it only combines the two
    socketserver base classes so that each accepted connection is handled
    in a separate thread instead of blocking the accept loop.
    """


# NOTE(review): the start of the Handler class below was collapsed onto the
# shebang line together with the code above. Its definition continues on the
# following (still collapsed) lines and is cut off in this view, so it is kept
# here verbatim as a comment rather than reconstructed from a partial body.
# class Handler(socketserver.BaseRequestHandler): """Main request handler.""" def handle(self): """Handle a request.""" global DATE # readline is pretty convenient f = self.request.makefile() # read request line reqline = line = f.readline() split = line.rstrip('\r\n').split(' ') http_version = len(split) > 2 and split[2] or 'HTTP/0.9' if split[0] != 'GET': # only GET is implemented return self.error_page(http_version, 501, 'Not Implemented') # parse the URL request_url = split[1] parsed = urllib.parse.urlparse(request_url) # make a path path = parsed.path if parsed.query != '': path += '?' + parsed.query if path == '': path == '/' # get the hostname for later host = parsed.netloc.split(':') hostname = host[0] # read out the headers, saving the PAC file host pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work while line.rstrip('\r\n') != '': line = f.readline() if line[:6].lower() == 'host: ': pac_host = line[6:].rstrip('\r\n') if ':' not in pac_host: # who would run this on port 80 anyway?
pac_host += ':80' elif line[:21].lower() == 'x-waybackproxy-date: ': # API for a personal project of mine new_date = line[21:].rstrip('\r\n') if DATE != new_date: DATE = new_date print('[-] Header requested date', DATE) try: if path == '/proxy.pac': # PAC file to bypass QUICK_IMAGES requests pac = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n''' pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n''' pac += b'''\r\n''' pac += b'''function FindProxyForURL(url, host)\r\n''' pac += b'''{\r\n''' pac += b''' if (shExpMatch(url, "http://web.archive.org/web/*"))\r\n''' pac += b''' {\r\n''' pac += b''' return "DIRECT";\r\n''' pac += b''' }\r\n''' pac += b''' return "PROXY ''' + pac_host.encode('ascii', 'ignore') + b'''";\r\n''' pac += b'''}\r\n''' self.request.sendall(pac) return elif hostname == 'web.archive.org': if path[:5] != '/web/': # launch settings return self.handle_settings(parsed.query) else: # pass-through requests to web.archive.org # required for QUICK_IMAGES _print('[>] [QI] {0}'.format('/'.join(request_url.split('/')[5:]))) conn = urllib.request.urlopen(request_url) elif GEOCITIES_FIX and hostname == 'www.geocities.com': # apply GEOCITIES_FIX and pass it through split = request_url.split('/') hostname = split[2] = 'www.oocities.org' request_url = '/'.join(split) _print('[>] {0}'.format(request_url)) conn = urllib.request.urlopen(request_url) else: # get from Wayback _print('[>] {0}'.format(request_url)) conn = urllib.request.urlopen('http://web.archive.org/web/{0}/{1}'.format(DATE, request_url)) except urllib.error.HTTPError as e: # an error has been found # 403 or 404 => heuristically determine the static URL for some redirect scripts if e.code in (403, 404): match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to)(?:url)?=(http[^&]+)''', request_url, re.IGNORECASE) if match: # we found it new_url = urllib.parse.unquote_plus(match.group(1)) _print('[r]', new_url) return self.redirect_page(http_version, new_url) _print('[!] 
{0} {1}'.format(e.code, e.reason)) return self.error_page(http_version, e.code, e.reason) # get content type content_type = conn.info().get('Content-Type') if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')] # send headers self.request.sendall(http_version.encode('ascii', 'ignore') + b' 200 OK\r\nContent-Type: ' + content_type.encode('ascii', 'ignore') + b'\r\n\r\n') # set the mode: [0]wayback [1]oocities mode = 0 if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1 if content_type[:9] == 'text/html' in content_type: # HTML toolbar = mode == 1 # oocities header starts without warning redirect_page = False for line in conn: line = line.rstrip(b'\r\n') if mode == 0: if toolbar: for delimiter in (b'<\!-- END WAYBACK TOOLBAR INSERT -->', b'<\!-- End Wayback Rewrite JS Include -->'): if re.search(delimiter, line): # toolbar is done - resume relaying on the next line toolbar = False line = re.sub(delimiter, b'', line) break if toolbar: continue elif redirect_page: # this is a really bad way to deal with Wayback's 302 # pages, but necessary with the way this proxy works match = re.search(b'
', line) if match: line = b'