#!/usr/bin/env python3
import base64, datetime, json, lrudict, re, socket, socketserver, string, sys, threading, traceback, urllib.request, urllib.error, urllib.parse

from config import *


class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
	"""TCPServer with ThreadingMixIn added."""
	pass


class SharedState:
	"""Class for storing shared state across instances of Handler."""

	def __init__(self):
		# Create internal LRU dictionary for preserving URLs on redirect.
		self.date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)

		# Create internal LRU dictionary for date availability.
		self.availability_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024) if WAYBACK_API else None


shared_state = SharedState()


class Handler(socketserver.BaseRequestHandler):
	"""Main request handler."""

	def setup(self, *args, **kwargs):
		"""Set up this instance of Handler."""
		super().setup(*args, **kwargs)

		# Store a local pointer to SharedState.
		self.shared_state = shared_state

	def handle(self):
		"""Handle a request."""
		# readline is pretty convenient
		f = self.request.makefile()

		# read request line
		reqline = line = f.readline()
		split = line.rstrip().split(' ')
		http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
		if len(split) < 2 or split[0] != 'GET': # only GET is implemented
			return self.send_error_page(http_version, 501, 'Not Implemented')

		# read out the headers
		request_host = None
		pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
		effective_date = DATE
		auth = None
		while line.strip() != '':
			line = f.readline()
			ll = line.lower()
			if ll[:6] == 'host: ':
				pac_host = request_host = line[6:].rstrip()
				if ':' not in pac_host: # explicitly specify port if running on port 80
					pac_host += ':80'
			elif ll[:21] == 'x-waybackproxy-date: ':
				# API for a personal project of mine
				effective_date = line[21:].rstrip()
			elif ll[:21] == 'authorization: basic ':
				# asset date code passed as username:password
				# (decode the original line, not the lowercased copy, as base64 is case sensitive)
				auth = base64.b64decode(line[21:]).decode('ascii', 'ignore')

		# parse the URL
		pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
		if split[1][0] == '/' and split[1] not in pac_file_paths:
			# just a path (not corresponding to a PAC file) => transparent proxy
			# Host header and therefore HTTP/1.1 are required
			if not request_host:
				return self.send_error_page(http_version, 400, 'Host header missing')
			archived_url = 'http://' + request_host + split[1]
		else:
			# full URL => explicit proxy
			archived_url = split[1]
		request_url = archived_url
		parsed = urllib.parse.urlparse(request_url)

		# make a path
		path = parsed.path
		if parsed.query:
			path += '?' + parsed.query
		elif path == '':
			path = '/'

		# get the hostname for later
		host = parsed.netloc.split(':')
		hostname = host[0]

		# get cached date for redirects, if available
		original_date = effective_date
		effective_date = self.shared_state.date_cache.get(str(effective_date) + '\x00' + str(archived_url), effective_date)

		# get date from username:password, if available
		if auth:
			effective_date = auth.replace(':', '')

		# Effectively handle the request.
		try:
			if path in pac_file_paths:
				# PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled.
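				# For illustration: assuming a client sent "Host: 10.0.0.2:8888"
				# (hypothetical address) and WAYBACK_API is disabled, the PAC script
				# built below comes out roughly as:
				#
				#   function FindProxyForURL(url, host)
				#   {
				#       if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))
				#       {
				#           return "DIRECT";
				#       }
				#       return "PROXY 10.0.0.2:8888";
				#   }
				#
				# so plain web.archive.org snapshot URLs (such as the requests
				# QUICK_IMAGES generates) go DIRECT, while if_-framed documents and
				# every other URL keep going through this proxy.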
				pac = http_version + ''' 200 OK\r\n'''
				pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
				pac += '''\r\n'''
				pac += '''function FindProxyForURL(url, host)\r\n'''
				pac += '''{\r\n'''
				if self.shared_state.availability_cache == None:
					pac += '''	if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
					pac += '''	{\r\n'''
					pac += '''		return "DIRECT";\r\n'''
					pac += '''	}\r\n'''
				pac += '''	return "PROXY ''' + pac_host + '''";\r\n'''
				pac += '''}\r\n'''
				self.request.sendall(pac.encode('ascii', 'ignore'))
				return
			elif hostname == 'web.archive.org':
				if path[:5] != '/web/':
					# Launch settings if enabled.
					if SETTINGS_PAGE:
						return self.handle_settings(parsed.query)
					else:
						return self.send_error_page(http_version, 404, 'Not Found')
				else:
					# Pass requests through to web.archive.org. Required for QUICK_IMAGES.
					split = request_url.split('/')
					effective_date = split[4]
					archived_url = '/'.join(split[5:])
					_print('[>] [QI]', archived_url)
			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
				# Apply GEOCITIES_FIX and pass it through.
				_print('[>]', archived_url)
				split = archived_url.split('/')
				hostname = split[2] = 'www.oocities.org'
				request_url = '/'.join(split)
			else:
				# Get from the Wayback Machine.
				_print('[>]', archived_url)
				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)

			# Check Wayback Machine Availability API where applicable, to avoid archived 404 pages and other site errors.
			if self.shared_state.availability_cache != None:
				# Are we requesting from the Wayback Machine?
				split = request_url.split('/')
				# If so, get the closest available date from the API.
				if split[2] == 'web.archive.org':
					# Remove extraneous :80 from URL.
					if ':' in split[5]:
						if split[7][-3:] == ':80':
							split[7] = split[7][:-3]
					elif split[5][-3:] == ':80':
						split[5] = split[5][:-3]

					# Check availability LRU cache.
					availability_url = '/'.join(split[5:])
					new_url = self.shared_state.availability_cache.get(availability_url, None)
					if new_url:
						# In cache => replace URL immediately.
						request_url = new_url
					else:
						# Not in cache => contact API.
						try:
							availability = json.loads(urllib.request.urlopen('https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14], timeout=10).read())
							closest = availability.get('archived_snapshots', {}).get('closest', {})
							new_date = closest.get('timestamp', None)
						except:
							_print('[!] Failed to fetch Wayback availability data')
							new_date = None

						if new_date and new_date != effective_date[:14]:
							# Returned date is different.
							new_url = closest['url']

							# Add asset tag if one is present in the original URL.
							if len(effective_date) > 14:
								split = new_url.split('/')
								split[4] += effective_date[14:]
								new_url = '/'.join(split)

							# Replace URL and add it to the availability cache.
							request_url = self.shared_state.availability_cache[availability_url] = new_url

			# Start fetching the URL.
			conn = urllib.request.urlopen(request_url)
		except urllib.error.HTTPError as e:
			# An HTTP error has occurred.
			if e.code in (403, 404, 412): # not found or tolerance exceeded
				# Heuristically determine the static URL for some redirect scripts.
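				# For example, a (hypothetical) dead redirect script such as
				#   http://counter.example.com/redir.cgi?id=1&url=http%3A%2F%2Fwww.example.com%2F
				# is often not archived itself, but its real destination can usually be
				# recovered from the query string (or, failing that, from the path) and
				# the client redirected there instead of receiving an error page.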
				parsed = urllib.parse.urlparse(archived_url)
				match = re.search('''(?:^|&)[^=]+=((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', parsed.query, re.I) # URL in query string
				if not match:
					match = re.search('''((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:%2F|/).+)''', parsed.path, re.I) # URL in path
				if match: # found URL
					# Decode and sanitize the URL.
					new_url = self.sanitize_redirect(urllib.parse.unquote_plus(match.group(1)))

					# Redirect client to the URL.
					_print('[r] [g]', new_url)
					return self.send_redirect_page(http_version, new_url)
			elif e.code in (301, 302): # urllib-generated error about an infinite redirect loop
				_print('[!] Infinite redirect loop')
				return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')

			if e.code != 412: # tolerance exceeded has its own error message above
				_print('[!]', e.code, e.reason)

			# If the memento Link header is present, this is a website error
			# instead of a Wayback error. Pass it along if that's the case.
			if 'Link' in e.headers:
				conn = e
			else:
				return self.send_error_page(http_version, e.code, e.reason)
		except socket.timeout as e:
			# A timeout has occurred.
			_print('[!] Fetch timeout')
			return self.send_error_page(http_version, 504, 'Gateway Timeout')
		except:
			# Some other fetch exception has occurred.
			_print('[!] Fetch exception:')
			traceback.print_exc()
			return self.send_error_page(http_version, 502, 'Bad Gateway')

		# Get content type.
		content_type = conn.info().get('Content-Type')
		if content_type == None:
			content_type = 'text/html'
		elif not CONTENT_TYPE_ENCODING:
			idx = content_type.find(';')
			if idx > -1:
				content_type = content_type[:idx]

		# Set the archive mode.
		if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
			mode = 1 # oocities
		else:
			mode = 0 # Wayback Machine

		# Check content type to determine if this is HTML we need to patch.
		# Wayback will add its HTML to anything it thinks is HTML.
		guessed_content_type = conn.info().get('X-Archive-Guessed-Content-Type')
		if not guessed_content_type:
			guessed_content_type = content_type
		if 'text/html' in guessed_content_type:
			# Some dynamically-generated links may end up pointing to
			# web.archive.org. Correct that by redirecting the Wayback
			# portion of the URL away if it ends up being HTML consumed
			# through the QUICK_IMAGES interface.
			if hostname == 'web.archive.org':
				conn.close()
				archived_url = '/'.join(request_url.split('/')[5:])
				_print('[r] [QI]', archived_url)
				return self.send_redirect_page(http_version, archived_url, 301)

			# Check if the date is within tolerance.
			if DATE_TOLERANCE is not None:
				match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl())
				if match:
					requested_date = match.group(1)
					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
						conn.close()
						return self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')

			# Consume all data.
			data = conn.read()

			# Patch the page.
			if mode == 0: # Wayback Machine
				# Check if this is a Wayback Machine page.
				if b'This URL has been excluded from the Wayback Machine.' in data:
					return self.send_error_page(http_version, 403, 'URL excluded')

				# Check if this is a media playback iframe page.
				# Some websites (especially ones that use frames)
				# inexplicably render inside a media playback iframe.
				# In that case, a simple redirect would result in a
				# redirect loop, so fetch and render the URL instead.
				match = re.search(b'''