#!/usr/bin/env python3
import base64, collections, datetime, json, re, socket, socketserver, string, sys, threading, time, traceback, urllib.parse
try:
	import urllib3
except ImportError:
	print('WaybackProxy now requires urllib3 to be installed. Follow setup step 3 on the readme to fix this.')
	sys.exit(1)
from config_handler import *

class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
	"""TCPServer with ThreadingMixIn added."""
	pass

# http://code.activestate.com/recipes/580644-lru-dictionary/
class LRUDict(collections.OrderedDict):
	'''A dict that can discard least-recently-used items, either by maximum
	capacity or by time to live.
	An item's ttl is refreshed (aka the item is considered "used") by direct
	access via [] or get() only, not via iterating over the whole collection
	with items() for example.
	Expired entries only get purged after insertions or changes. Either call
	purge() manually or check an item's ttl with ttl() if that's unacceptable.
	'''
	def __init__(self, *args, maxduration=None, maxsize=128, **kwargs):
		'''Same arguments as OrderedDict with these 2 additions:
		maxduration: number of seconds entries are kept. 0 or None means no timelimit.
		maxsize: maximum number of entries being kept.'''
		super().__init__(*args, **kwargs)
		self.maxduration = maxduration
		self.maxsize = maxsize
		self.purge()

	def purge(self):
		'''Removes expired or overflowing entries.'''
		if self.maxsize:
			# pop until maximum capacity is reached
			overflowing = max(0, len(self) - self.maxsize)
			for _ in range(overflowing):
				self.popitem(last=False)
		if self.maxduration:
			# expiration limit
			limit = time.time() - self.maxduration
			# as long as there are still items in the dictionary
			while self:
				# look at the oldest (front)
				_, lru = next(iter(super().values()))
				# if it is within the timelimit, we're fine
				if lru > limit:
					break
				# otherwise continue to pop the front
				self.popitem(last=False)

	def __getitem__(self, key):
		# retrieve item
		value = super().__getitem__(key)[0]
		# update lru time
		super().__setitem__(key, (value, time.time()))
		self.move_to_end(key)
		return value

	def get(self, key, default=None):
		try:
			return self[key]
		except KeyError:
			return default

	def ttl(self, key):
		'''Returns the number of seconds this item will live.
		The item might still be deleted if maxsize is reached.
		The time to live can be negative, as for expired items
		that have not been purged yet.'''
		if self.maxduration:
			lru = super().__getitem__(key)[1]
			return self.maxduration - (time.time() - lru)

	def __setitem__(self, key, value):
		super().__setitem__(key, (value, time.time()))
		self.purge()

	def items(self):
		# remove ttl from values
		return ((k, v) for k, (v, _) in super().items())

	def values(self):
		# remove ttl from values
		return (v for v, _ in super().values())

class SharedState:
	"""Class for storing shared state across instances of Handler."""

	def __init__(self):
		# Create urllib3 connection pool.
		self.http = urllib3.PoolManager(maxsize=4, block=True)
		urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

		# Create internal LRU dictionary for preserving URLs on redirect.
		self.date_cache = LRUDict(maxduration=86400, maxsize=1024)

		# Create internal LRU dictionary for date availability.
		self.availability_cache = LRUDict(maxduration=86400, maxsize=1024)
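
		# Illustrative sketch of how these caches are used further down (the URL and
		# dates here are hypothetical examples, not real entries):
		#   date_cache['19970411\x00http://example.com/'] -> '19970411120000'
		#     (remembers which snapshot date to keep serving for a URL after a redirect)
		#   availability_cache['http://example.com/'] -> 'https://web.archive.org/web/19970411120000if_/http://example.com/'
		#     (maps an original URL to the closest known snapshot URL)
		# Entries expire after 24 hours (maxduration=86400) or once 1024 entries are held (maxsize).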

		# Read domain whitelist file.
		try:
			with open('whitelist.txt', 'r') as f:
				self.whitelist = f.read().splitlines()
		except:
			self.whitelist = []

shared_state = SharedState()

class Handler(socketserver.BaseRequestHandler):
	"""Main request handler."""

	def setup(self, *args, **kwargs):
		"""Set up this instance of Handler."""
		super().setup(*args, **kwargs)

		# Store a local pointer to SharedState.
		self.shared_state = shared_state

	def handle(self):
		"""Handle a request."""
		# readline is pretty convenient
		f = self.request.makefile()

		# read request line
		reqline = line = f.readline()
		split = line.rstrip().split(' ')
		http_version = len(split) > 2 and split[2].upper() or 'HTTP/0.9'
		if len(split) < 2 or split[0].upper() != 'GET':
			# only GET is implemented
			return self.send_error_page(http_version, 501, 'Not Implemented', extra=split[0])

		# read out the headers
		request_host = None
		pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
		effective_date = DATE
		auth = None
		while line.strip() != '':
			line = f.readline()
			ll = line.lower()
			if ll[:6] == 'host: ':
				pac_host = request_host = line[6:].rstrip()
				if ':' not in pac_host:
					# explicitly specify port if running on port 80
					pac_host += ':80'
			elif ll[:21] == 'x-waybackproxy-date: ':
				# API for a personal project of mine
				effective_date = line[21:].rstrip()
			elif ll[:21] == 'authorization: basic ':
				# asset date code passed as username:password
				auth = base64.b64decode(ll[21:]).decode('ascii', 'ignore') # b64decode returns bytes; decode so the date can be used as text below

		# parse the URL
		pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
		if split[1][0] == '/' and split[1] not in pac_file_paths:
			# just a path (not corresponding to a PAC file) => transparent proxy
			# Host header and therefore HTTP/1.1 are required
			if not request_host:
				return self.send_error_page(http_version, 400, 'Host header missing')
			archived_url = 'http://' + request_host + split[1]
		else:
			# full URL => explicit proxy
			archived_url = split[1]
		request_url = archived_url
		parsed = urllib.parse.urlparse(request_url)

		# make a path
		path = parsed.path
		if parsed.query:
			path += '?' + parsed.query
		elif path == '':
			path = '/'

		# get the hostname for later
		host = parsed.netloc.split(':')
		hostname = host[0]

		# get cached date for redirects, if available
		original_date = effective_date
		effective_date = self.shared_state.date_cache.get(str(effective_date) + '\x00' + str(archived_url), effective_date)

		# get date from username:password, if available
		if auth:
			effective_date = auth.replace(':', '')

		# Effectively handle the request.
		try:
			if path in pac_file_paths:
				# PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled.
				pac = http_version + ''' 200 OK\r\n'''
				pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
				pac += '''\r\n'''
				pac += '''function FindProxyForURL(url, host)\r\n'''
				pac += '''{\r\n'''
				if not WAYBACK_API:
					pac += '''	if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
					pac += '''	{\r\n'''
					pac += '''		return "DIRECT";\r\n'''
					pac += '''	}\r\n'''
				pac += '''	return "PROXY ''' + pac_host + '''";\r\n'''
				pac += '''}\r\n'''
				self.request.sendall(pac.encode('ascii', 'ignore'))
				return
			elif hostname in self.shared_state.whitelist:
				_print('[>] [byp]', archived_url)
			elif hostname == 'web.archive.org':
				if path[:5] != '/web/':
					# Launch settings if enabled.
					if SETTINGS_PAGE:
						return self.handle_settings(parsed.query)
					else:
						return self.send_error_page(http_version, 404, 'Not Found')
				else:
					# Pass requests through to web.archive.org. Required for QUICK_IMAGES.
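					# The pass-through URL is a full Wayback snapshot URL, roughly of the form
					# (hypothetical example): http://web.archive.org/web/19970411223838if_/http://example.com/pic.gif
					# so split('/') yields the snapshot date (plus asset flag) at index 4,
					# and joining everything from index 5 onwards recovers the original URL.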
					split = request_url.split('/')
					effective_date = split[4]
					archived_url = '/'.join(split[5:])
					_print('[>] [QI]', archived_url)
			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
				# Apply GEOCITIES_FIX and pass it through.
				_print('[>]', archived_url)
				split = archived_url.split('/')
				hostname = split[2] = 'www.oocities.org'
				request_url = '/'.join(split)
			else:
				# Get from the Wayback Machine.
				_print('[>]', archived_url)
				request_url = 'https://web.archive.org/web/{0}if_/{1}'.format(effective_date, archived_url)

			# Check Wayback Machine Availability API where applicable, to avoid archived 404 pages and other site errors.
			split = request_url.split('/')
			if split[2] == 'web.archive.org':
				# Remove extraneous :80 from URL.
				if ':' in split[5]:
					if split[7][-3:] == ':80':
						split[7] = split[7][:-3]
				elif split[5][-3:] == ':80':
					split[5] = split[5][:-3]

				# Check availability LRU cache.
				availability_url = '/'.join(split[5:])
				new_url = self.shared_state.availability_cache.get(availability_url, None)
				if new_url:
					# In cache => replace URL immediately.
					request_url = new_url
				elif WAYBACK_API:
					# Not in cache => contact API.
					try:
						availability_endpoint = 'https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14]
						availability = json.loads(self.shared_state.http.request('GET', availability_endpoint, timeout=10, retries=1).data)
						closest = availability.get('archived_snapshots', {}).get('closest', {})
						new_date = closest.get('timestamp', None)
					except:
						_print('[!] Failed to fetch Wayback availability data')
						new_date = None
					if new_date and new_date != effective_date[:14]:
						# Returned date is different.
						new_url = closest['url']

						# Add asset tag to the date.
						split = new_url.split('/')
						if len(effective_date) > 14:
							split[4] += effective_date[14:]
						else:
							split[4] += 'if_'
						new_url = '/'.join(split)

						# Replace URL and add it to the availability cache.
						request_url = self.shared_state.availability_cache[availability_url] = new_url

			# Start fetching the URL.
			retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=0, backoff_factor=1)
			while True:
				conn = self.shared_state.http.urlopen('GET', request_url, redirect=False, retries=retry, preload_content=False)

				# Check for redirects.
				destination = conn.get_redirect_location()
				if destination:
					self.drain_conn(conn)
					conn.release_conn()

					# Check if the redirect goes to a different Wayback URL.
					match = re.search('''(?:(?:https?:)?//web.archive.org)?/web/([^/]+/)(.+)''', destination)
					if match:
						archived_dest = self.sanitize_redirect(match.group(2))

						# Check if the archived URL is different.
						if archived_dest != archived_url:
							# Remove extraneous :80 from URL.
							archived_dest = re.sub('''^([^/]*//[^/]+):80''', '\\1', archived_dest)

							# Add destination to availability cache and redirect the client.
							_print('[r]', archived_dest)
							self.shared_state.availability_cache[archived_dest] = 'http://web.archive.org/web/' + match.group(1) + archived_dest
							return self.send_redirect_page(http_version, archived_dest, conn.status)

					# Not an archived URL or same URL, redirect ourselves.
					request_url = destination
					continue

				# Wayback will add its JavaScript to anything it thinks is JavaScript.
				# If this is detected, redirect ourselves through the raw asset interface.
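				# Illustrative example (hypothetical URL): a snapshot fetched as
				#   https://web.archive.org/web/19970411223838if_/http://example.com/menu.js
				# is retried as
				#   https://web.archive.org/web/19970411223838im_/http://example.com/menu.js
				# since the im_ suffix asks Wayback for the asset without its injected code.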
				content_type = conn.headers.get('Content-Type')
				guessed_content_type = conn.headers.get('X-Archive-Guessed-Content-Type')
				if not guessed_content_type:
					guessed_content_type = content_type
				if 'javascript' in guessed_content_type:
					match = re.match('''(https?://web\\.archive\\.org/web/[0-9]+)([^/]*)(.+)''', request_url)
					if match and match.group(2) != 'im_':
						self.drain_conn(conn)
						conn.release_conn()
						request_url = match.group(1) + 'im_' + match.group(3)
						continue

				# This request can proceed.
				break
		except urllib3.exceptions.MaxRetryError as e:
			_print('[!] Fetch retries exceeded:', e.reason)
			return self.send_error_page(http_version, 504, 'Gateway Timeout')
		except:
			# Some other fetch exception has occurred.
			_print('[!] Fetch exception:')
			traceback.print_exc()
			return self.send_error_page(http_version, 502, 'Bad Gateway')

		# Check for HTTP errors.
		if conn.status != 200:
			if conn.status in (403, 404): # not found
				if self.guess_and_send_redirect(http_version, archived_url):
					self.drain_conn(conn)
					conn.release_conn()
					return
			#elif conn.status in (301, 302): # redirect loop detection currently unused
			#	self.drain_conn(conn)
			#	conn.release_conn()
			#	return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')

			if conn.status != 412: # tolerance exceeded has its own error message above
				_print('[!]', conn.status, conn.reason)

			# If the memento Link header is present, this is a website error
			# instead of a Wayback error. Pass it along if that's the case.
			if 'Link' not in conn.headers:
				self.drain_conn(conn)
				conn.release_conn()
				return self.send_error_page(http_version, conn.status, conn.reason)

		# Adjust content type.
		if content_type == None:
			content_type = 'text/html'
		elif not CONTENT_TYPE_ENCODING:
			idx = content_type.find(';')
			if idx > -1:
				content_type = content_type[:idx]

		# Set the archive mode.
		if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
			mode = 1 # oocities
		else:
			mode = 0 # Wayback Machine

		# Check content type to determine if this is HTML we need to patch.
		# Wayback will add its HTML to anything it thinks is HTML.
		if 'text/html' in guessed_content_type:
			# Some dynamically-generated links may end up pointing to
			# web.archive.org. Correct that by redirecting the Wayback
			# portion of the URL away if it ends up being HTML consumed
			# through the QUICK_IMAGES interface.
			if hostname == 'web.archive.org':
				self.drain_conn(conn)
				conn.release_conn()
				archived_url = '/'.join(request_url.split('/')[5:])
				_print('[r] [QI]', archived_url)
				return self.send_redirect_page(http_version, archived_url, 301)

			# Check if the date is within tolerance.
			if DATE_TOLERANCE != None:
				match = re.search('''(?://web\\.archive\\.org|^)/web/([0-9]+)''', conn.geturl() or '')
				if match:
					requested_date = match.group(1)
					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
						self.drain_conn(conn)
						conn.release_conn()
						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
						if not self.guess_and_send_redirect(http_version, archived_url):
							self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
						return

			# Consume all data.
			data = conn.read()
			conn.release_conn()
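			# `data` now holds the complete response body; `mode` (set above from the
			# hostname) selects which patches are applied to it below.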

			# Patch the page.
			if mode == 0: # Wayback Machine
				# Check if this is a Wayback Machine page.
				if b'This URL has been excluded from the Wayback Machine.' in data:
					return self.send_error_page(http_version, 403, 'URL excluded')

				# Check if this is a media playback iframe page.
				# Some websites (especially ones that use frames)
				# inexplicably render inside a media playback iframe.
				# In that case, a simple redirect would result in a
				# redirect loop, so fetch and render the URL instead.
				match = re.search(b'''