#!/usr/bin/env python3

import base64, collections, datetime, json, re, socket, socketserver, string, sys, threading, time, traceback, urllib.parse

try:
    import urllib3
except ImportError:
    print('WaybackProxy now requires urllib3 to be installed. Follow setup step 3 in the README to fix this.')
    sys.exit(1)

from config_handler import *

class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCPServer with ThreadingMixIn added."""
    pass
# http://code.activestate.com/recipes/580644-lru-dictionary/
class LRUDict(collections.OrderedDict):
    '''A dict that can discard least-recently-used items, either by maximum capacity
    or by time to live.
    An item's ttl is refreshed (aka the item is considered "used") by direct access
    via [] or get() only, not via iterating over the whole collection with items(),
    for example.
    Expired entries only get purged after insertions or changes. Either call purge()
    manually or check an item's ttl with ttl() if that's unacceptable.
    '''

    def __init__(self, *args, maxduration=None, maxsize=128, **kwargs):
        '''Same arguments as OrderedDict with these 2 additions:
        maxduration: number of seconds entries are kept. 0 or None means no time limit.
        maxsize: maximum number of entries being kept.'''
        super().__init__(*args, **kwargs)
        self.maxduration = maxduration
        self.maxsize = maxsize
        self.purge()
    def purge(self):
        '''Removes expired or overflowing entries.'''
        if self.maxsize:
            # pop until maximum capacity is reached
            overflowing = max(0, len(self) - self.maxsize)
            for _ in range(overflowing):
                self.popitem(last=False)
        if self.maxduration:
            # expiration limit
            limit = time.time() - self.maxduration
            # as long as there are still items in the dictionary
            while self:
                # look at the oldest (front) item
                _, lru = next(iter(super().values()))
                # if it is within the time limit, we're fine
                if lru > limit:
                    break
                # otherwise continue to pop the front
                self.popitem(last=False)
    def __getitem__(self, key):
        # retrieve item
        value = super().__getitem__(key)[0]
        # update lru time
        super().__setitem__(key, (value, time.time()))
        self.move_to_end(key)
        return value

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def ttl(self, key):
        '''Returns the number of seconds this item will live.
        The item might still be deleted if maxsize is reached.
        The time to live can be negative, as for expired items
        that have not been purged yet.'''
        if self.maxduration:
            lru = super().__getitem__(key)[1]
            return self.maxduration - (time.time() - lru)

    def __setitem__(self, key, value):
        super().__setitem__(key, (value, time.time()))
        self.purge()

    def items(self):
        # remove ttl from values
        return ((k, v) for k, (v, _) in super().items())

    def values(self):
        # remove ttl from values
        return (v for v, _ in super().values())
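
# A minimal usage sketch for LRUDict (illustrative comments only, not executed;
# the names and values below are hypothetical):
#   cache = LRUDict(maxduration=60, maxsize=2)
#   cache['a'] = 1
#   cache['b'] = 2
#   cache['c'] = 3        # exceeds maxsize=2, so 'a' (least recently used) is dropped
#   cache.get('b')        # returns 2 and refreshes 'b''s TTL
#   cache.ttl('b')        # seconds left before 'b' expires, just under 60 here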

class SharedState:
    """Class for storing shared state across instances of Handler."""

    def __init__(self):
        # Create urllib3 connection pool.
        self.http = urllib3.PoolManager(maxsize=4, block=True)
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        # Create internal LRU dictionary for preserving URLs on redirect.
        self.date_cache = LRUDict(maxduration=86400, maxsize=1024)

        # Create internal LRU dictionary for date availability.
        self.availability_cache = LRUDict(maxduration=86400, maxsize=1024)

        # Read domain whitelist file.
        try:
            with open('whitelist.txt', 'r') as f:
                self.whitelist = f.read().splitlines()
        except:
            self.whitelist = []

shared_state = SharedState()
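
# Note: whitelist.txt (optional) is read as one hostname per line; requests whose
# hostname appears there bypass the Wayback Machine and are fetched live instead.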

class Handler(socketserver.BaseRequestHandler):
    """Main request handler."""

    def setup(self, *args, **kwargs):
        """Set up this instance of Handler."""
        super().setup(*args, **kwargs)

        # Store a local pointer to SharedState.
        self.shared_state = shared_state

    def handle(self):
        """Handle a request."""
        # readline is pretty convenient
        f = self.request.makefile()

        # read request line
        reqline = line = f.readline()
        split = line.rstrip().split(' ')
        http_version = split[2].upper() if len(split) > 2 else 'HTTP/0.9'
        if len(split) < 2 or split[0].upper() != 'GET':
            # only GET is implemented
            return self.send_error_page(http_version, 501, 'Not Implemented', extra=split[0])

        # read out the headers
        request_host = None
        pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
        effective_date = DATE
        auth = None
        while line.strip() != '':
            line = f.readline()
            ll = line.lower()
            if ll[:6] == 'host: ':
                pac_host = request_host = line[6:].rstrip()
                if ':' not in pac_host: # explicitly specify port if running on port 80
                    pac_host += ':80'
            elif ll[:21] == 'x-waybackproxy-date: ':
                # API for a personal project of mine
                effective_date = line[21:].rstrip()
            elif ll[:21] == 'authorization: basic ':
                # asset date code passed as username:password
                # (decode the original-case line: base64 is case sensitive, and a str is needed below)
                auth = base64.b64decode(line[21:]).decode('ascii', 'ignore')

        # parse the URL
        pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
        if split[1][0] == '/' and split[1] not in pac_file_paths:
            # just a path (not corresponding to a PAC file) => transparent proxy
            # Host header and therefore HTTP/1.1 are required
            if not request_host:
                return self.send_error_page(http_version, 400, 'Host header missing')
            archived_url = 'http://' + request_host + split[1]
        else:
            # full URL => explicit proxy
            archived_url = split[1]
        request_url = archived_url
        parsed = urllib.parse.urlparse(request_url)

        # make a path
        path = parsed.path
        if parsed.query:
            path += '?' + parsed.query
        elif path == '':
            path = '/'

        # get the hostname for later
        host = parsed.netloc.split(':')
        hostname = host[0]

        # get cached date for redirects, if available
        original_date = effective_date
        effective_date = self.shared_state.date_cache.get(str(effective_date) + '\x00' + str(archived_url), effective_date)

        # get date from username:password, if available
        if auth:
            effective_date = auth.replace(':', '')

        # Effectively handle the request.
        try:
            if path in pac_file_paths:
                # PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled.
                pac = http_version + ''' 200 OK\r\n'''
                pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
                pac += '''\r\n'''
                pac += '''function FindProxyForURL(url, host)\r\n'''
                pac += '''{\r\n'''
                if not WAYBACK_API:
                    pac += ''' if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
                    pac += ''' {\r\n'''
                    pac += '''  return "DIRECT";\r\n'''
                    pac += ''' }\r\n'''
                pac += ''' return "PROXY ''' + pac_host + '''";\r\n'''
                pac += '''}\r\n'''
                self.request.sendall(pac.encode('ascii', 'ignore'))
                return
            elif hostname in self.shared_state.whitelist:
                _print('[>] [byp]', archived_url)
            elif hostname == 'web.archive.org':
                if path[:5] != '/web/':
                    # Launch settings if enabled.
                    if SETTINGS_PAGE:
                        return self.handle_settings(parsed.query)
                    else:
                        return self.send_error_page(http_version, 404, 'Not Found')
                else:
                    # Pass requests through to web.archive.org. Required for QUICK_IMAGES.
                    split = request_url.split('/')
                    effective_date = split[4]
                    archived_url = '/'.join(split[5:])
                    _print('[>] [QI]', archived_url)
            elif GEOCITIES_FIX and hostname == 'www.geocities.com':
                # Apply GEOCITIES_FIX and pass it through.
                _print('[>]', archived_url)
                split = archived_url.split('/')
                hostname = split[2] = 'www.oocities.org'
                request_url = '/'.join(split)
            else:
                # Get from the Wayback Machine.
                _print('[>]', archived_url)
                request_url = 'https://web.archive.org/web/{0}if_/{1}'.format(effective_date, archived_url)

            # Check Wayback Machine Availability API where applicable, to avoid archived 404 pages and other site errors.
            split = request_url.split('/')
            if split[2] == 'web.archive.org':
                # Remove extraneous :80 from URL.
                if ':' in split[5]:
                    if split[7][-3:] == ':80':
                        split[7] = split[7][:-3]
                elif split[5][-3:] == ':80':
                    split[5] = split[5][:-3]

                # Check availability LRU cache.
                availability_url = '/'.join(split[5:])
                new_url = self.shared_state.availability_cache.get(availability_url, None)
                if new_url:
                    # In cache => replace URL immediately.
                    request_url = new_url
                elif WAYBACK_API:
                    # Not in cache => contact API.
                    try:
                        availability_endpoint = 'https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14]
                        availability = json.loads(self.shared_state.http.request('GET', availability_endpoint, timeout=10, retries=1).data)
                        closest = availability.get('archived_snapshots', {}).get('closest', {})
                        new_date = closest.get('timestamp', None)
                    except:
                        _print('[!] Failed to fetch Wayback availability data')
                        new_date = None
                    if new_date and new_date != effective_date[:14]:
                        # Returned date is different.
                        new_url = closest['url']

                        # Add asset tag to the date.
                        split = new_url.split('/')
                        if len(effective_date) > 14:
                            split[4] += effective_date[14:]
                        else:
                            split[4] += 'if_'
                        new_url = '/'.join(split)

                        # Replace URL and add it to the availability cache.
                        request_url = self.shared_state.availability_cache[availability_url] = new_url
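
            # For reference, the Availability API responds with JSON shaped like this
            # (sketch based on the public https://archive.org/wayback/available endpoint;
            # the example values are hypothetical):
            #   {"archived_snapshots": {"closest": {"available": true, "status": "200",
            #    "timestamp": "20010203040506",
            #    "url": "http://web.archive.org/web/20010203040506/http://example.com/"}}}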

            # Start fetching the URL.
            retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=0, backoff_factor=1)
            while True:
                conn = self.shared_state.http.urlopen('GET', request_url, redirect=False, retries=retry, preload_content=False)

                # Check for redirects.
                destination = conn.get_redirect_location()
                if destination:
                    self.drain_conn(conn)
                    conn.release_conn()

                    # Check if the redirect goes to a different Wayback URL.
                    match = re.search('''(?:(?:https?:)?//web.archive.org)?/web/([^/]+/)(.+)''', destination)
                    if match:
                        archived_dest = self.sanitize_redirect(match.group(2))
                        # Check if the archived URL is different.
                        if archived_dest != archived_url:
                            # Remove extraneous :80 from URL.
                            archived_dest = re.sub('''^([^/]*//[^/]+):80''', '\\1', archived_dest)

                            # Add destination to availability cache and redirect the client.
                            _print('[r]', archived_dest)
                            self.shared_state.availability_cache[archived_dest] = 'http://web.archive.org/web/' + match.group(1) + archived_dest
                            return self.send_redirect_page(http_version, archived_dest, conn.status)

                    # Not an archived URL or same URL, redirect ourselves.
                    request_url = destination
                    continue

                # Wayback will add its JavaScript to anything it thinks is JavaScript.
                # If this is detected, redirect ourselves through the raw asset interface.
                content_type = conn.headers.get('Content-Type')
                guessed_content_type = conn.headers.get('X-Archive-Guessed-Content-Type') or content_type or ''
                if 'javascript' in guessed_content_type:
                    match = re.match('''(https?://web\\.archive\\.org/web/[0-9]+)([^/]*)(.+)''', request_url)
                    if match and match.group(2) != 'im_':
                        self.drain_conn(conn)
                        conn.release_conn()
                        request_url = match.group(1) + 'im_' + match.group(3)
                        continue

                # This request can proceed.
                break
        except urllib3.exceptions.MaxRetryError as e:
            _print('[!] Fetch retries exceeded:', e.reason)
            return self.send_error_page(http_version, 504, 'Gateway Timeout')
        except:
            # Some other fetch exception has occurred.
            _print('[!] Fetch exception:')
            traceback.print_exc()
            return self.send_error_page(http_version, 502, 'Bad Gateway')

        # Check for HTTP errors.
        if conn.status != 200:
            if conn.status in (403, 404): # not found
                if self.guess_and_send_redirect(http_version, archived_url):
                    self.drain_conn(conn)
                    conn.release_conn()
                    return
            #elif conn.status in (301, 302): # redirect loop detection currently unused
            #    self.drain_conn(conn)
            #    conn.release_conn()
            #    return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
            if conn.status != 412: # tolerance exceeded has its own error message above
                _print('[!]', conn.status, conn.reason)

            # If the memento Link header is present, this is a website error
            # instead of a Wayback error. Pass it along if that's the case.
            if 'Link' not in conn.headers:
                self.drain_conn(conn)
                conn.release_conn()
                return self.send_error_page(http_version, conn.status, conn.reason)

        # Adjust content type.
        if content_type is None:
            content_type = 'text/html'
        elif not CONTENT_TYPE_ENCODING:
            idx = content_type.find(';')
            if idx > -1:
                content_type = content_type[:idx]

        # Set the archive mode.
        if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
            mode = 1 # oocities
        else:
            mode = 0 # Wayback Machine

        # Check content type to determine if this is HTML we need to patch.
        # Wayback will add its HTML to anything it thinks is HTML.
        if 'text/html' in guessed_content_type:
            # Some dynamically-generated links may end up pointing to
            # web.archive.org. Correct that by redirecting the Wayback
            # portion of the URL away if it ends up being HTML consumed
            # through the QUICK_IMAGES interface.
            if hostname == 'web.archive.org':
                self.drain_conn(conn)
                conn.release_conn()
                archived_url = '/'.join(request_url.split('/')[5:])
                _print('[r] [QI]', archived_url)
                return self.send_redirect_page(http_version, archived_url, 301)

            # Check if the date is within tolerance.
            if DATE_TOLERANCE is not None:
                match = re.search('''(?://web\\.archive\\.org|^)/web/([0-9]+)''', conn.geturl() or '')
                if match:
                    requested_date = match.group(1)
                    if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
                        self.drain_conn(conn)
                        conn.release_conn()
                        _print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
                        if not self.guess_and_send_redirect(http_version, archived_url):
                            self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
                        return

            # Consume all data.
            data = conn.read()
            conn.release_conn()

            # Patch the page.
            if mode == 0: # Wayback Machine
                # Check if this is a Wayback Machine page.
                if b'<title>Wayback Machine</title>' in data:
                    # Check if this is an exclusion (robots.txt?) error page.
                    if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data:
                        return self.send_error_page(http_version, 403, 'URL excluded')

                    # Check if this is a media playback iframe page.
                    # Some websites (especially ones that use frames)
                    # inexplicably render inside a media playback iframe.
                    # In that case, a simple redirect would result in a
                    # redirect loop, so fetch and render the URL instead.
                    match = re.search(b'''<iframe id="playback" src="((?:(?:https?:)?//web.archive.org)?/web/[^"]+)"''', data)
                    if match:
                        # Extract the content URL.
                        request_url = match.group(1).decode('ascii', 'ignore')
                        archived_url = '/'.join(request_url.split('/')[5:])

                        # Start fetching the URL.
                        _print('[f]', archived_url)
                        conn = self.shared_state.http.urlopen('GET', request_url, retries=retry, preload_content=False)
                        if conn.status != 200:
                            _print('[!]', conn.status, conn.reason)
                            # If the memento Link header is present, this is a website error
                            # instead of a Wayback error. Pass it along if that's the case.
                            if 'Link' not in conn.headers:
                                self.drain_conn(conn)
                                conn.release_conn()
                                return self.send_error_page(http_version, conn.status, conn.reason)

                        # Identify content type so we don't modify non-HTML content.
                        content_type = conn.headers.get('Content-Type') or 'text/html'
                        if not CONTENT_TYPE_ENCODING:
                            idx = content_type.find(';')
                            if idx > -1:
                                content_type = content_type[:idx]
                        if 'text/html' in content_type:
                            # Consume all data and proceed with patching the page.
                            data = conn.read()
                            conn.release_conn()
                        else:
                            # Pass non-HTML data through.
                            return self.send_passthrough(conn, http_version, content_type, request_url)

                # Check if this is a Wayback Machine redirect page.
                if b'<title></title>' in data and b'<span class="label style-scope media-button"><!---->Wayback Machine<!----></span>' in data:
                    match = re.search(b'''<p class="impatient"><a href="(?:(?:https?:)?//web\\.archive\\.org)?/web/([^/]+)/([^"]+)">Impatient\\?</a></p>''', data)
                    if match:
                        # Sanitize the URL.
                        archived_url = self.sanitize_redirect(match.group(2).decode('ascii', 'ignore'))

                        # Add URL to the date LRU cache.
                        self.shared_state.date_cache[str(effective_date) + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')

                        # Get the original HTTP redirect code.
                        match = re.search(b'''<p class="code shift red">Got an HTTP ([0-9]+)''', data)
                        try:
                            redirect_code = int(match.group(1))
                        except:
                            redirect_code = 302

                        # Redirect client to the URL.
                        _print('[r]', archived_url)
                        return self.send_redirect_page(http_version, archived_url, redirect_code)

                # Remove pre-toolbar scripts and CSS.
                data = re.sub(b'''(?:<!-- is_embed=True -->\\r?\\n?)?<script (?:type="text/javascript" )?src="[^"]*/_static/js/.*<!-- End Wayback Rewrite JS Include -->\\r?\\n''', b'', data, count=1, flags=re.S)

                # Remove toolbar. The if_ asset tag serves no toolbar, but we remove it just in case.
                data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, count=1, flags=re.S)

                # Remove comments on footer.
                data = re.sub(b'''<!--\\r?\\n FILE ARCHIVED .*$''', b'', data, flags=re.S)

                # Fix base tag.
                data = re.sub(b'''(<base\\s+[^>]*href=["']?)(?:(?:https?:)?//web.archive.org)?/web/[^/]+/(?:[^:/]+://)?''', b'\\1http://', data, flags=re.I + re.S)

                # Remove extraneous :80 from links.
                data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^/:]+)://([^/:]+):80/', b'\\1\\2/\\3://\\4/', data)

                # Fix links.
                if QUICK_IMAGES:
                    # QUICK_IMAGES works by intercepting asset URLs (those
                    # with a date code ending in im_, js_...) and letting the
                    # proxy pass them through. This may reduce load time
                    # because Wayback doesn't have to hunt down the closest
                    # copy of that asset to DATE, as those URLs have specific
                    # date codes. This taints the HTML with web.archive.org
                    # URLs. QUICK_IMAGES=2 uses the original URLs with an added
                    # username:password, which taints less but is not supported
                    # by all browsers - IE notably kills the whole page if it
                    # sees an iframe pointing to an invalid URL.
                    def filter_asset(match):
                        if match.group(2) in (None, b'if_', b'fw_'): # non-asset URL
                            return b'http://' if match.group(3) == b'https://' else match.group(3) # convert secure non-asset URLs to regular HTTP
                        asset_type = match.group(2)
                        if asset_type == b'js_': # cut down on the JavaScript detector's second request
                            asset_type = b'im_'
                        if QUICK_IMAGES == 2:
                            return b'http://' + match.group(1) + b':' + asset_type + b'@'
                        else:
                            return b'http://web.archive.org/web/' + match.group(1) + asset_type + b'/' + match.group(3)
                    data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)?/([^:/]+:(?://)?)', filter_asset, data)
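                    # Illustrative example (hypothetical URL): with QUICK_IMAGES=1, an asset link like
                    #   /web/20010203040506im_/http://example.com/pic.gif
                    # is rewritten to
                    #   http://web.archive.org/web/20010203040506im_/http://example.com/pic.gif
                    # while with QUICK_IMAGES=2 it becomes
                    #   http://20010203040506:im_@example.com/pic.gif
                    # where the date and asset tag travel as username:password (see the
                    # Authorization header handling in handle()).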
                else:
                    # Remove asset URLs while simultaneously adding them to the date LRU cache
                    # with their respective date and converting secure URLs to regular HTTP.
                    def add_to_date_cache(match):
                        orig_url = match.group(2)
                        if orig_url[:8] == b'https://':
                            orig_url = b'http://' + orig_url[8:]
                        self.shared_state.date_cache[str(effective_date) + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore').replace('js_', 'im_')
                        return orig_url
                    data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
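                    # Illustrative example (hypothetical URL): /web/20010203040506im_/https://example.com/pic.gif
                    # is rewritten to plain http://example.com/pic.gif, and the date cache gains the entry
                    #   '<DATE>\x00http://example.com/pic.gif' -> '20010203040506im_'
                    # so a later request for that URL is served from the same snapshot date.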
            elif mode == 1: # oocities
                # Remove viewport/cache-control/max-width code from the header.
                data = re.sub(b'''^.*?\n\n''', b'', data, flags=re.S)

                # Remove archive notice and tracking code from the footer.
                data = re.sub(b'''<style> \n.zoomout { -webkit-transition: .*$''', b'', data, flags=re.S)

                # Remove clearly labeled snippets from Geocities.
                data = re.sub(b'''^.*<\\!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<\\!-- following code added by server\\. PLEASE REMOVE -->.*<\\!-- preceding code added by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)

                # Fix links.
                data = re.sub(b'''//([^\\.]*\\.)?oocities\\.com/''', b'//\\1geocities.com/', data, flags=re.S)

            # Send patched page.
            self.send_response_headers(conn, http_version, content_type, request_url, content_length=len(data))
            self.request.sendall(data)
            self.request.close()
        else:
            # Pass non-HTML data through.
            self.send_passthrough(conn, http_version, content_type, request_url)

    def send_passthrough(self, conn, http_version, content_type, request_url):
        """Pass data through to the client unmodified (save for our headers)."""
        self.send_response_headers(conn, http_version, content_type, request_url, content_length=True)
        for data in conn.stream(1024):
            self.request.sendall(data)
        conn.release_conn()
        self.request.close()

    def send_response_headers(self, conn, http_version, content_type, request_url, content_length=False):
        """Generate and send the response headers."""
        # Pass the HTTP version, and error code if there is one.
        response = '{0} {1} {2}'.format(http_version, conn.status, conn.reason.replace('\n', ' '))

        # Add Content-Type, Content-Length and the caching ETag.
        response += '\r\nContent-Type: ' + content_type
        if type(content_length) == int: # note: True (pass length through) is not an int here
            response += '\r\nContent-Length: ' + str(content_length)
            content_length = False # don't pass the original length through
        response += '\r\nETag: "' + request_url.replace('"', '') + '"'
        response += '\r\nConnection: close' # helps with IE6 trying to use proxy keep alive and holding half-open connections

        # Pass X-Archive-Orig-* (and Content-Length if requested) headers through.
        for header in conn.headers:
            if header.find('X-Archive-Orig-') == 0:
                orig_header = header[15:]
                # Skip headers which may affect client behavior.
                if orig_header.lower() not in ('connection', 'location', 'content-type', 'content-length', 'etag', 'authorization', 'set-cookie'):
                    response += '\r\n' + orig_header + ': ' + conn.headers[header]
            elif content_length and header.lower() == 'content-length':
                response += '\r\n' + header + ': ' + conn.headers[header]

        # Finish and send the response head.
        response += '\r\n\r\n'
        self.request.sendall(response.encode('utf8', 'ignore'))
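
    # A response head produced by send_response_headers looks roughly like this
    # (hypothetical values):
    #   HTTP/1.0 200 OK
    #   Content-Type: text/html
    #   Content-Length: 1234
    #   ETag: "https://web.archive.org/web/20010203040506if_/http://example.com/"
    #   Connection: close
    #   Server: Apache/1.3.9     <- passed through from X-Archive-Orig-Server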

    def send_error_page(self, http_version, code, reason, extra=''):
        """Generate an error page."""
        # Get a description for this error code.
        if code in (404, 508): # page not archived or redirect loop
            description = 'This page may not be archived by the Wayback Machine.'
        elif code == 403: # not crawled due to exclusion
            description = 'This page was not archived due to a Wayback Machine exclusion.'
        elif code == 501: # method not implemented
            description = 'WaybackProxy only implements the GET method. Your browser sent a request with the {0} method.'.format(extra.upper())
        elif code == 502: # exception
            description = 'This page could not be fetched due to an unknown error.'
        elif code == 504: # timeout
            description = 'This page could not be fetched due to a Wayback Machine server error.'
        elif code == 412: # outside of tolerance
            description = 'The earliest snapshot for this page is outside of the configured tolerance interval.'
        elif code == 400 and reason == 'Host header missing': # no host header in transparent mode
            description = 'WaybackProxy\'s transparent mode requires an HTTP/1.1-compliant client.'
        else: # another error
            description = 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'

        # Read error page file.
        try:
            with open('error.html', 'r', encoding='utf8', errors='ignore') as f:
                error_page = f.read()
        except:
            # Just send the code and reason as a backup.
            error_page = '${code} ${reason}'

        # Format error page template.
        signature = self.signature()
        error_page = string.Template(error_page).substitute(**locals()).encode('utf8', 'ignore')
        error_page_len = len(error_page)

        # Send formatted error page and stop.
        self.request.sendall(
            '{http_version} {code} {reason}\r\n'
            'Content-Type: text/html\r\n'
            'Content-Length: {error_page_len}\r\n'
            '\r\n'
            .format(**locals()).encode('utf8', 'ignore')
        )
        self.request.sendall(error_page)
        self.request.close()
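
    # error.html is a string.Template document; judging by the substitution above and
    # the fallback template, it can reference placeholders such as ${code}, ${reason},
    # ${description} and ${signature} (any local of send_error_page is available).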

    def send_redirect_page(self, http_version, target, code=302):
        """Generate a redirect page."""
        # Make redirect page.
        redirect_page = '<html><head><title>Redirect</title><meta http-equiv="refresh" content="0;url=${target}"></head><body><p>If you are not redirected, <a href="${target}">click here</a>.</p></body></html>'
        redirect_page = string.Template(redirect_page).substitute(**locals()).encode('utf8', 'ignore')
        redirect_page_len = len(redirect_page)

        # Send redirect page and stop.
        self.request.sendall(
            '{http_version} {code} Found\r\n'
            'Location: {target}\r\n'
            'Content-Type: text/html\r\n'
            'Content-Length: {redirect_page_len}\r\n'
            '\r\n'
            .format(**locals()).encode('utf8', 'ignore')
        )
        self.request.sendall(redirect_page)
        self.request.close()

    def guess_and_send_redirect(self, http_version, guess_url):
        """Heuristically determine the static URL for some redirect scripts."""
        parsed = urllib.parse.urlparse(guess_url)
        match = re.search('''(?:^|&)[^=]+=((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', parsed.query, re.I) # URL in query parameters
        if not match:
            full_path = parsed.path
            if parsed.query:
                full_path += '?' + parsed.query
            match = re.search('''((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:(?:%2F|/).+|$))''', full_path, re.I) # URL in path or full query
        if match: # found URL
            # Decode and sanitize the URL.
            new_url = self.sanitize_redirect(urllib.parse.unquote_plus(match.group(1)))

            # Redirect client to the URL.
            _print('[r] [g]', new_url)
            self.send_redirect_page(http_version, new_url)
            return True
        return False
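
    # Illustrative example (hypothetical URL): given a dead redirect script like
    #   http://example.com/redirect.cgi?target=http%3A%2F%2Fwww.example.org%2Fpage
    # the query-parameter regex extracts http%3A%2F%2Fwww.example.org%2Fpage, which is
    # decoded and sanitized to http://www.example.org/page before the client is redirected.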

    def handle_settings(self, query):
        """Generate the settings page."""
        global DATE, DATE_TOLERANCE, GEOCITIES_FIX, QUICK_IMAGES, WAYBACK_API, CONTENT_TYPE_ENCODING, SILENT, SETTINGS_PAGE

        if query != '': # handle any parameters that may have been sent
            parsed = urllib.parse.parse_qs(query)
            if 'date' in parsed and 'dateTolerance' in parsed:
                if DATE != parsed['date'][0]:
                    DATE = parsed['date'][0]
                    self.shared_state.date_cache.clear()
                    self.shared_state.availability_cache.clear()
                if DATE_TOLERANCE != parsed['dateTolerance'][0]:
                    DATE_TOLERANCE = parsed['dateTolerance'][0]
                GEOCITIES_FIX = 'gcFix' in parsed
                QUICK_IMAGES = 'quickImages' in parsed
                CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed

        # send the page and stop
        settingspage = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
        settingspage += '<html><head><title>WaybackProxy Settings</title></head><body><p><b>'
        settingspage += self.signature()
        settingspage += '</b></p><form method="get" action="/">'
        settingspage += '<p>Date to get pages from: <input type="text" name="date" size="8" value="'
        settingspage += str(DATE)
        settingspage += '"><p>Date tolerance: <input type="text" name="dateTolerance" size="8" value="'
        settingspage += str(DATE_TOLERANCE)
        settingspage += '"> days<br><input type="checkbox" name="gcFix"'
        if GEOCITIES_FIX:
            settingspage += ' checked'
        settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
        if QUICK_IMAGES:
            settingspage += ' checked'
        settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
        if CONTENT_TYPE_ENCODING:
            settingspage += ' checked'
        settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
        self.request.sendall(settingspage.encode('utf8', 'ignore'))
        self.request.close()

    def sanitize_redirect(self, url):
        """Sanitize a URL for client-side redirection."""
        if url[0] != '/' and '://' not in url:
            # Add protocol if the URL is absolute but missing a protocol.
            return 'http://' + url
        elif url[:8].lower() == 'https://':
            # Convert secure URLs to regular HTTP.
            return 'http://' + url[8:]
        else:
            # No changes required.
            return url
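
    # Examples of sanitize_redirect's behavior (hypothetical URLs):
    #   'www.example.com/page'      -> 'http://www.example.com/page' (protocol added)
    #   'https://example.com/page'  -> 'http://example.com/page' (downgraded to HTTP)
    #   '/relative/path'            -> '/relative/path' (unchanged)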

    def signature(self):
        """Return the server signature."""
        return 'WaybackProxy on {0}'.format(socket.gethostname())

    def wayback_to_datetime(self, date):
        """Convert a Wayback format date string to a datetime.datetime object."""
        date = str(date)
        fmt = '%Y%m%d%H%M%S'
        fmt_len = 14
        while fmt:
            try:
                return datetime.datetime.strptime(date[:fmt_len], fmt)
            except:
                fmt = fmt[:-2]
                fmt_len -= 2
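
    # wayback_to_datetime accepts progressively shorter Wayback date codes, e.g.:
    #   '20010203040506' -> datetime(2001, 2, 3, 4, 5, 6)
    #   '200102'         -> datetime(2001, 2, 1)
    #   '2001'           -> datetime(2001, 1, 1)
    # and returns None when no prefix parses at all.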

    def drain_conn(self, conn):
        """Drain a urllib3 connection, using its dedicated drain_conn() method where available and a full read() otherwise."""
        getattr(conn, 'drain_conn', conn.read)()

print_lock = threading.Lock()
def _print(*args, **kwargs):
    """Logging function."""
    if SILENT:
        return
    with print_lock:
        print(*args, **kwargs, flush=True)

def main():
    """Starts the server."""
    server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
    _print('[-] Now listening on port', LISTEN_PORT)
    _print('[-] Date set to', DATE)
    try:
        server.serve_forever()
    except KeyboardInterrupt: # Ctrl+C to stop
        pass

if __name__ == '__main__':
    main()