#!/usr/bin/env python import base64, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse from config import * class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer): """TCPServer with ThreadingMixIn added.""" pass class Handler(socketserver.BaseRequestHandler): """Main request handler.""" def handle(self): """Handle a request.""" global DATE # readline is pretty convenient f = self.request.makefile() # read request line reqline = line = f.readline() split = line.rstrip('\r\n').split(' ') http_version = len(split) > 2 and split[2] or 'HTTP/0.9' if split[0] != 'GET': # only GET is implemented return self.error_page(http_version, 501, 'Not Implemented') # parse the URL request_url = archived_url = split[1] parsed = urllib.parse.urlparse(request_url) # make a path path = parsed.path if parsed.query != '': path += '?' + parsed.query if path == '': path == '/' # get the hostname for later host = parsed.netloc.split(':') hostname = host[0] # read out the headers, saving the PAC file host pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work auth = None while line.rstrip('\r\n') != '': line = f.readline() ll = line.lower() if ll[:6] == 'host: ': pac_host = line[6:].rstrip('\r\n') if ':' not in pac_host: # who would run this on port 80 anyway? pac_host += ':80' elif ll[:21] == 'x-waybackproxy-date: ': # API for a personal project of mine new_date = line[21:].rstrip('\r\n') if DATE != new_date: DATE = new_date print('[-] Header requested date', DATE) elif ll[:21] == 'authorization: basic ': # asset date code passed as username:password auth = base64.b64decode(ll[21:]) try: if path == '/proxy.pac': # PAC file to bypass QUICK_IMAGES requests pac = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n''' pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n''' pac += b'''\r\n''' pac += b'''function FindProxyForURL(url, host)\r\n''' pac += b'''{\r\n''' pac += b''' if (shExpMatch(url, "http://web.archive.org/web/*"))\r\n''' pac += b''' {\r\n''' pac += b''' return "DIRECT";\r\n''' pac += b''' }\r\n''' pac += b''' return "PROXY ''' + pac_host.encode('ascii', 'ignore') + b'''";\r\n''' pac += b'''}\r\n''' self.request.sendall(pac) return elif hostname == 'web.archive.org' or auth: if path[:5] != '/web/': # launch settings return self.handle_settings(parsed.query) else: # pass-through requests to web.archive.org # required for QUICK_IMAGES # did we get an username:password with an asset date code? if auth: request_url = 'http://web.archive.org/web/{0}/{1}'.format(auth.replace(':', ''), archived_url) else: archived_url = '/'.join(request_url.split('/')[5:]) _print('[>] [QI] {0}'.format(archived_url)) try: conn = urllib.request.urlopen(request_url) except urllib.error.HTTPError as e: if e.code == 404: # Try this file on another date, might be redundant return self.redirect_page(http_version, archived_url) else: raise e elif GEOCITIES_FIX and hostname == 'www.geocities.com': # apply GEOCITIES_FIX and pass it through _print('[>] {0}'.format(archived_url)) split = archived_url.split('/') hostname = split[2] = 'www.oocities.org' request_url = '/'.join(split) conn = urllib.request.urlopen(request_url) else: # get from Wayback _print('[>] {0}'.format(archived_url)) request_url = 'http://web.archive.org/web/{0}/{1}'.format(DATE, archived_url) conn = urllib.request.urlopen(request_url) except urllib.error.HTTPError as e: # an error has been found # 403 or 404 => heuristically determine the static URL for some redirect scripts if e.code in (403, 404): match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to)?(?:url)?=(http[^&]+)''', archived_url, re.IGNORECASE) if match: # we found it new_url = urllib.parse.unquote_plus(match.group(1)) _print('[r]', new_url) return self.redirect_page(http_version, new_url) _print('[!] {0} {1}'.format(e.code, e.reason)) return self.error_page(http_version, e.code, e.reason) # get content type content_type = conn.info().get('Content-Type') if content_type == None: content_type = 'text/html' if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')] # set the mode: [0]wayback [1]oocities mode = 0 if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1 if 'text/html' in content_type: # HTML # Some dynamically generated links may end up pointing to # web.archive.org. Correct that by redirecting the Wayback # portion of the URL away if it ends up being HTML consumed # through the QUICK_IMAGES interface. if hostname == 'web.archive.org': conn.close() return self.redirect_page(http_version, '/'.join(archived_url.split('/')[5:]), 301) # consume all data data = conn.read() # patch the page if mode == 0: # wayback if b'Wayback Machine' in data: match = re.search(b'