#!/usr/bin/env python3
import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
from config import *

# internal LRU dictionary for preserving URLs on redirect
date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)
# internal LRU dictionary for date availability
availability_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024) if WAYBACK_API else None


class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCPServer with ThreadingMixIn added."""
    pass


class Handler(socketserver.BaseRequestHandler):
    """Main request handler."""

    def handle(self):
        """Handle a request."""
        global availability_cache

        # readline is pretty convenient
        f = self.request.makefile()

        # read request line
        reqline = line = f.readline()
        split = line.rstrip('\r\n').split(' ')
        http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
        if split[0] != 'GET':
            # only GET is implemented
            return self.error_page(http_version, 501, 'Not Implemented')

        # read out the headers
        request_host = None
        pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
        effective_date = DATE
        auth = None
        while line.rstrip('\r\n') != '':
            line = f.readline()
            ll = line.lower()
            if ll[:6] == 'host: ':
                pac_host = request_host = line[6:].rstrip('\r\n')
                if ':' not in pac_host:
                    # explicitly specify port if running on port 80
                    pac_host += ':80'
            elif ll[:21] == 'x-waybackproxy-date: ':
                # API for a personal project of mine
                effective_date = line[21:].rstrip('\r\n')
            elif ll[:21] == 'authorization: basic ':
                # asset date code passed as username:password
                # (decode the original-case header value, since base64 is case
                # sensitive, and convert to str for the .replace() call below)
                auth = base64.b64decode(line[21:].rstrip('\r\n')).decode('ascii', 'ignore')

        # parse the URL
        pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
        if split[1][0] == '/' and split[1] not in pac_file_paths:
            # just a path (not corresponding to a PAC file) => transparent proxy
            # Host header and therefore HTTP/1.1 are required
            if not request_host:
                return self.error_page(http_version, 400, 'Host header missing')
            archived_url = 'http://' + request_host + split[1]
        else:
            # full URL => explicit proxy
            archived_url = split[1]
        request_url = archived_url
        parsed = urllib.parse.urlparse(request_url)
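        # Illustrative examples of the two request shapes handled above
        # (example.com is a placeholder, not part of the original code):
        #   explicit proxy:    GET http://example.com/index.html HTTP/1.0
        #   transparent proxy: GET /index.html HTTP/1.1  (with "Host: example.com")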
        # make a path
        path = parsed.path
        if parsed.query != '':
            path += '?' + parsed.query
        if path == '':
            path = '/'

        # get the hostname for later
        host = parsed.netloc.split(':')
        hostname = host[0]

        # get cached date for redirects, if available
        original_date = effective_date
        effective_date = date_cache.get(str(effective_date) + '\x00' + archived_url, str(effective_date))

        # get date from username:password, if available
        if auth:
            effective_date = auth.replace(':', '')

        # effectively handle the request
        try:
            if path in pac_file_paths:
                # PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled
                pac = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n'''
                pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
                pac += b'''\r\n'''
                pac += b'''function FindProxyForURL(url, host)\r\n'''
                pac += b'''{\r\n'''
                if not availability_cache:
                    pac += b'''    if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
                    pac += b'''    {\r\n'''
                    pac += b'''        return "DIRECT";\r\n'''
                    pac += b'''    }\r\n'''
                pac += b'''    return "PROXY ''' + pac_host.encode('ascii', 'ignore') + b'''";\r\n'''
                pac += b'''}\r\n'''
                self.request.sendall(pac)
                return
            elif hostname == 'web.archive.org':
                if path[:5] != '/web/':
                    # launch settings if enabled
                    if SETTINGS_PAGE:
                        return self.handle_settings(parsed.query)
                    else:
                        return self.error_page(http_version, 404, 'Not Found')
                else:
                    # pass requests through to web.archive.org
                    # required for QUICK_IMAGES
                    archived_url = '/'.join(request_url.split('/')[5:])
                    _print('[>] [QI] {0}'.format(archived_url))
            elif GEOCITIES_FIX and hostname == 'www.geocities.com':
                # apply GEOCITIES_FIX and pass it through
                _print('[>] {0}'.format(archived_url))
                split = archived_url.split('/')
                hostname = split[2] = 'www.oocities.org'
                request_url = '/'.join(split)
            else:
                # get from Wayback
                _print('[>] {0}'.format(archived_url))
                request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)
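            # Illustrative note (not an original comment): when WAYBACK_API is
            # enabled, the block below queries the Wayback availability endpoint,
            #   https://archive.org/wayback/available?url=<url>&timestamp=<YYYYMMDDhhmmss>
            # which responds with JSON shaped roughly like:
            #   {"archived_snapshots": {"closest": {"available": true, "status": "200",
            #     "timestamp": "19970101000000",
            #     "url": "http://web.archive.org/web/19970101000000/http://example.com/"}}}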
            if availability_cache is not None:
                # are we requesting from Wayback?
                split = request_url.split('/')
                if split[2] == 'web.archive.org':
                    # if so, get the closest available date from Wayback's API,
                    # to avoid archived 404 pages and other site errors

                    # remove extraneous :80 from URL
                    if ':' in split[5]:
                        if split[7][-3:] == ':80':
                            split[7] = split[7][:-3]
                    elif split[5][-3:] == ':80':
                        split[5] = split[5][:-3]

                    # check availability LRU cache
                    availability_url = '/'.join(split[5:])
                    new_url = availability_cache.get(availability_url, None)
                    if new_url:
                        # in cache => replace URL immediately
                        request_url = new_url
                    else:
                        # not in cache => contact API
                        try:
                            availability = json.loads(urllib.request.urlopen('https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14], timeout=10).read())
                            closest = availability.get('archived_snapshots', {}).get('closest', {})
                            new_date = closest.get('timestamp', None)
                        except:
                            _print('[!] Failed to fetch Wayback availability data')
                            new_date = None

                        if new_date and new_date != effective_date[:14]:
                            # returned date is different
                            new_url = closest['url']

                            # add asset tag if one is present in the original URL
                            if len(effective_date) > 14:
                                split = new_url.split('/')
                                split[4] += effective_date[14:]
                                new_url = '/'.join(split)

                            # replace URL and add it to the availability cache
                            request_url = availability_cache[availability_url] = new_url

            conn = urllib.request.urlopen(request_url)
        except urllib.error.HTTPError as e:
            # an error has been found
            if e.code in (403, 404, 412):
                # 403, 404 or tolerance exceeded => heuristically determine
                # the static URL for some redirect scripts
                match = re.search(r'''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\.(?:[^/%]+))(?:%2F|/).+)''', archived_url, re.I)
                if not match:
                    match = re.search(r'''(?:\?|&)(?:[^=]+)=((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\.(?:[^/%]+))?(?:%2F|/)[^&]+)''', archived_url, re.I)
                if match:
                    # we found it
                    new_url = urllib.parse.unquote_plus(match.group(1))
                    # add protocol if the URL is absolute but missing a protocol
                    if new_url[0] != '/' and '://' not in new_url:
                        new_url = 'http://' + new_url
                    _print('[r]', new_url)
                    return self.redirect_page(http_version, new_url)
            elif e.code in (301, 302):
                # 301 or 302 => urllib-generated error about an infinite redirect loop
                _print('[!] Infinite redirect loop')
                return self.error_page(http_version, 508, 'Infinite Redirect Loop')
            if e.code != 412:
                # tolerance exceeded has its own error message above
                _print('[!] {0} {1}'.format(e.code, e.reason))
            # If the memento Link header is present, this is a website error
            # instead of a Wayback error. Pass it along if that's the case.
            if 'Link' in e.headers:
                conn = e
            else:
                return self.error_page(http_version, e.code, e.reason)
        except socket.timeout:
            # return here, otherwise conn would be unbound below
            _print('Timeout')
            return self.error_page(http_version, 504, 'Gateway Timeout')
        except Exception as e:
            _print('Generic exception:', e)
            return self.error_page(http_version, 500, 'Internal Server Error')

        # get content type
        content_type = conn.info().get('Content-Type')
        if content_type is None:
            content_type = 'text/html'
        if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1:
            content_type = content_type[:content_type.find(';')]

        # set the mode: [0]wayback [1]oocities
        mode = 0
        if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']:
            mode = 1

        # Wayback will add its HTML to anything it thinks is HTML
        guessed_content_type = conn.info().get('X-Archive-Guessed-Content-Type')
        if not guessed_content_type:
            guessed_content_type = content_type

        if 'text/html' in guessed_content_type:
            # Some dynamically generated links may end up pointing to
            # web.archive.org. Correct that by redirecting the Wayback
            # portion of the URL away if it ends up being HTML consumed
            # through the QUICK_IMAGES interface.
            if hostname == 'web.archive.org':
                conn.close()
                archived_url = '/'.join(request_url.split('/')[5:])
                _print('[r] [QI]', archived_url)
                return self.redirect_page(http_version, archived_url, 301)

            # check if the date is within tolerance
            if DATE_TOLERANCE is not None:
                match = re.search(r'''//web\.archive\.org/web/([0-9]+)''', conn.geturl())
                if match:
                    requested_date = match.group(1)
                    if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
                        _print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
                        conn.close()
                        return self.error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
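            # Illustrative example of the tolerance check above: with
            # DATE_TOLERANCE = 365 and original_date = 19970101, a snapshot
            # dated 19980615 lies more than 365 days after the requested date,
            # so it is rejected with a 412 error.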
        # consume all data
        data = conn.read()

        # patch the page
        if mode == 0: # wayback
            if b'This URL has been excluded from the Wayback Machine.' in data:
                # exclusion error (robots.txt?)
                return self.error_page(http_version, 403, 'URL excluded')

            match = re.search(b'