#!/usr/bin/env python3

import base64, datetime, json, lrudict, re, socket, socketserver, string, sys, threading, time, traceback, urllib.parse
try:
    import urllib3
except ImportError:
    print('WaybackProxy now requires urllib3 to be installed. Follow setup step 3 on the readme to fix this.')
    sys.exit(1)

from config_handler import *

class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCPServer with ThreadingMixIn added."""
    pass

class SharedState:
    """Class for storing shared state across instances of Handler."""

    def __init__(self):
        # Create urllib3 connection pool.
        self.http = urllib3.PoolManager(maxsize=4, block=True)
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        # Create internal LRU dictionary for preserving URLs on redirect.
        self.date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)

        # Create internal LRU dictionary for date availability.
        self.availability_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)

        # Read domain whitelist file.
        try:
            with open('whitelist.txt', 'r') as f:
                self.whitelist = f.read().splitlines()
        except:
            self.whitelist = []

shared_state = SharedState()

class Handler(socketserver.BaseRequestHandler):
    """Main request handler."""

    def setup(self, *args, **kwargs):
        """Set up this instance of Handler."""
        super().setup(*args, **kwargs)

        # Store a local pointer to SharedState.
        self.shared_state = shared_state

    def handle(self):
        """Handle a request."""
        # readline is pretty convenient
        f = self.request.makefile()

        # read request line
        reqline = line = f.readline()
        split = line.rstrip().split(' ')
        http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
        if len(split) < 2 or split[0] != 'GET':
            # only GET is implemented
            return self.send_error_page(http_version, 501, 'Not Implemented')

        # read out the headers
        request_host = None
        pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
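        # The default pac_host breaks out of the JavaScript string literal in
        # the PAC file generated below, so the browser substitutes its own idea
        # of the proxy host through location.host at lookup time.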
        effective_date = DATE
        auth = None
        while line.strip() != '':
            line = f.readline()
            ll = line.lower()
            if ll[:6] == 'host: ':
                pac_host = request_host = line[6:].rstrip()
                if ':' not in pac_host: # explicitly specify port if running on port 80
                    pac_host += ':80'
            elif ll[:21] == 'x-waybackproxy-date: ':
                # API for a personal project of mine
                effective_date = line[21:].rstrip()
            elif ll[:21] == 'authorization: basic ':
                # asset date code passed as username:password
                # (decode from the original line, not the lowercased copy, as base64 is case sensitive)
                auth = base64.b64decode(line[21:]).decode('ascii', 'ignore')

        # parse the URL
        pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
        if split[1][0] == '/' and split[1] not in pac_file_paths:
            # just a path (not corresponding to a PAC file) => transparent proxy
            # Host header and therefore HTTP/1.1 are required
            if not request_host:
                return self.send_error_page(http_version, 400, 'Host header missing')
            archived_url = 'http://' + request_host + split[1]
        else:
            # full URL => explicit proxy
            archived_url = split[1]
        request_url = archived_url
        parsed = urllib.parse.urlparse(request_url)

        # make a path
        path = parsed.path
        if parsed.query:
            path += '?' + parsed.query
        elif path == '':
            path = '/'

        # get the hostname for later
        host = parsed.netloc.split(':')
        hostname = host[0]

        # get cached date for redirects, if available
        original_date = effective_date
        effective_date = self.shared_state.date_cache.get(str(effective_date) + '\x00' + str(archived_url), effective_date)
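        # (Cache keys are the configured date and the URL joined by a NUL byte;
        # entries are added during page patching whenever the exact snapshot
        # date of an asset or redirect target becomes known.)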

        # get date from username:password, if available
        if auth:
            effective_date = auth.replace(':', '')

        # Effectively handle the request.
        try:
            if path in pac_file_paths:
                # PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled.
                pac = http_version + ''' 200 OK\r\n'''
                pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
                pac += '''\r\n'''
                pac += '''function FindProxyForURL(url, host)\r\n'''
                pac += '''{\r\n'''
                if not WAYBACK_API:
                    pac += ''' if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
                    pac += ''' {\r\n'''
                    pac += ''' return "DIRECT";\r\n'''
                    pac += ''' }\r\n'''
                pac += ''' return "PROXY ''' + pac_host + '''";\r\n'''
                pac += '''}\r\n'''
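                # When WAYBACK_API is off, the PAC above sends web.archive.org
                # /web/ URLs without a 14-digit "if_" date tag DIRECT (the
                # browser fetches them from the archive itself); everything
                # else, including if_-tagged asset URLs, goes through the proxy.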
                self.request.sendall(pac.encode('ascii', 'ignore'))
                return
            elif hostname in self.shared_state.whitelist:
                _print('[>] [byp]', archived_url)
            elif hostname == 'web.archive.org':
                if path[:5] != '/web/':
                    # Launch settings if enabled.
                    if SETTINGS_PAGE:
                        return self.handle_settings(parsed.query)
                    else:
                        return self.send_error_page(http_version, 404, 'Not Found')
                else:
                    # Pass requests through to web.archive.org. Required for QUICK_IMAGES.
                    split = request_url.split('/')
                    effective_date = split[4]
                    archived_url = '/'.join(split[5:])
                    _print('[>] [QI]', archived_url)
            elif GEOCITIES_FIX and hostname == 'www.geocities.com':
                # Apply GEOCITIES_FIX and pass it through.
                _print('[>]', archived_url)
                split = archived_url.split('/')
                hostname = split[2] = 'www.oocities.org'
                request_url = '/'.join(split)
            else:
                # Get from the Wayback Machine.
                _print('[>]', archived_url)
                request_url = 'https://web.archive.org/web/{0}if_/{1}'.format(effective_date, archived_url)

            # Check Wayback Machine Availability API where applicable, to avoid archived 404 pages and other site errors.
            split = request_url.split('/')
            if split[2] == 'web.archive.org':
                # Remove extraneous :80 from URL.
                if ':' in split[5]:
                    if split[7][-3:] == ':80':
                        split[7] = split[7][:-3]
                elif split[5][-3:] == ':80':
                    split[5] = split[5][:-3]
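                # (In "https://web.archive.org/web/<date>if_/http://host:80/path",
                # split[5] is the archived URL's scheme and split[7] its host;
                # the elif covers archived URLs written without a scheme.)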

                # Check availability LRU cache.
                availability_url = '/'.join(split[5:])
                new_url = self.shared_state.availability_cache.get(availability_url, None)
                if new_url:
                    # In cache => replace URL immediately.
                    request_url = new_url
                elif WAYBACK_API:
                    # Not in cache => contact API.
                    try:
                        availability_endpoint = 'https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14]
                        availability = json.loads(self.shared_state.http.request('GET', availability_endpoint, timeout=10, retries=1).data)
                        closest = availability.get('archived_snapshots', {}).get('closest', {})
                        new_date = closest.get('timestamp', None)
                    except:
                        _print('[!] Failed to fetch Wayback availability data')
                        new_date = None
                    if new_date and new_date != effective_date[:14]:
                        # Returned date is different.
                        new_url = closest['url']

                        # Add asset tag to the date.
                        split = new_url.split('/')
                        if len(effective_date) > 14:
                            split[4] += effective_date[14:]
                        else:
                            split[4] += 'if_'
                        new_url = '/'.join(split)

                        # Replace URL and add it to the availability cache.
                        request_url = self.shared_state.availability_cache[availability_url] = new_url
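                        # Illustrative example (hypothetical URL): with DATE 19970101,
                        # .../web/19970101if_/http://example.com/ may come back as
                        # .../web/19970103120000if_/http://example.com/ if the API
                        # reports 19970103120000 as the closest snapshot.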

            # Start fetching the URL.
            retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=0, backoff_factor=1)
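            # redirect=0 keeps urllib3 from following redirects on its own: the
            # loop below inspects each one, bouncing redirects to a different
            # archived URL back to the client and following the rest in-proxy.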
            while True:
                conn = self.shared_state.http.urlopen('GET', request_url, redirect=False, retries=retry, preload_content=False)

                # Check for redirects.
                destination = conn.get_redirect_location()
                if destination:
                    conn.drain_conn()
                    conn.release_conn()

                    # Check if the redirect goes to a different Wayback URL.
                    match = re.search('''(?:(?:https?:)?//web.archive.org)?/web/([^/]+/)(.+)''', destination)
                    if match:
                        archived_dest = match.group(2)

                        # Add missing protocol, just in case.
                        split = archived_dest.split('/')
                        if split[0][-1:] != ':':
                            split = ['http:', ''] + split

                        # Remove extraneous :80 from URL.
                        if split[2][-3:] == ':80':
                            split[2] = split[2][:-3]

                        # Check if the archived URL is different.
                        if archived_dest != archived_url:
                            # Add destination to availability cache and redirect the
                            # client to the cleaned-up URL, so that the client's
                            # next request finds its snapshot date in the cache.
                            _print('[r]', archived_dest)
                            new_url = '/'.join(split)
                            self.shared_state.availability_cache[new_url] = 'http://web.archive.org/web/' + match.group(1) + archived_dest
                            return self.send_redirect_page(http_version, new_url, conn.status)

                    # Not an archived URL or same URL, redirect ourselves.
                    request_url = destination
                    continue

                # Not a redirect, move on.
                break
        except urllib3.exceptions.MaxRetryError as e:
            _print('[!] Fetch retries exceeded:', e.reason)
            return self.send_error_page(http_version, 504, 'Gateway Timeout')
        except:
            # Some other fetch exception has occurred.
            _print('[!] Fetch exception:')
            traceback.print_exc()
            return self.send_error_page(http_version, 502, 'Bad Gateway')

        # Check for HTTP errors.
        if conn.status != 200:
            if conn.status in (403, 404): # not found
                if self.guess_and_send_redirect(http_version, archived_url):
                    conn.drain_conn()
                    conn.release_conn()
                    return
            #elif conn.status in (301, 302): # redirect loop detection currently unused
            #    conn.drain_conn()
            #    conn.release_conn()
            #    return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
            if conn.status != 412: # tolerance exceeded has its own error message above
                _print('[!]', conn.status, conn.reason)

            # If the memento Link header is present, this is a website error
            # instead of a Wayback error. Pass it along if that's the case.
            if 'Link' not in conn.headers:
                conn.drain_conn()
                conn.release_conn()
                return self.send_error_page(http_version, conn.status, conn.reason)

        # Get content type.
        content_type = conn.headers.get('Content-Type')
        if content_type == None:
            content_type = 'text/html'
        elif not CONTENT_TYPE_ENCODING:
            idx = content_type.find(';')
            if idx > -1:
                content_type = content_type[:idx]

        # Set the archive mode.
        if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
            mode = 1 # oocities
        else:
            mode = 0 # Wayback Machine

        # Check content type to determine if this is HTML we need to patch.
        # Wayback will add its HTML to anything it thinks is HTML.
        guessed_content_type = conn.headers.get('X-Archive-Guessed-Content-Type')
        if not guessed_content_type:
            guessed_content_type = content_type
        if 'text/html' in guessed_content_type:
            # Some dynamically-generated links may end up pointing to
            # web.archive.org. Correct that by redirecting the Wayback
            # portion of the URL away if it ends up being HTML consumed
            # through the QUICK_IMAGES interface.
            if hostname == 'web.archive.org':
                conn.drain_conn()
                conn.release_conn()
                archived_url = '/'.join(request_url.split('/')[5:])
                _print('[r] [QI]', archived_url)
                return self.send_redirect_page(http_version, archived_url, 301)

            # Check if the date is within tolerance.
            if DATE_TOLERANCE != None:
                match = re.search('''(?://web\\.archive\\.org|^)/web/([0-9]+)''', conn.geturl() or '')
                if match:
                    requested_date = match.group(1)
                    if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
                        conn.drain_conn()
                        conn.release_conn()
                        _print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
                        if not self.guess_and_send_redirect(http_version, archived_url):
                            self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
                        return

            # Consume all data.
            data = conn.read()
            conn.release_conn()

            # Patch the page.
            if mode == 0: # Wayback Machine
                # Check if this is a Wayback Machine page.
                if b'<title>Wayback Machine</title>' in data:
                    # Check if this is an exclusion (robots.txt?) error page.
                    if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data:
                        return self.send_error_page(http_version, 403, 'URL excluded')

                    # Check if this is a media playback iframe page.
                    # Some websites (especially ones that use frames)
                    # inexplicably render inside a media playback iframe.
                    # In that case, a simple redirect would result in a
                    # redirect loop, so fetch and render the URL instead.
                    match = re.search(b'''<iframe id="playback" src="((?:(?:https?:)?//web.archive.org)?/web/[^"]+)"''', data)
                    if match:
                        # Extract the content URL.
                        request_url = match.group(1).decode('ascii', 'ignore')
                        archived_url = '/'.join(request_url.split('/')[5:])

                        # Start fetching the URL.
                        _print('[f]', archived_url)
                        conn = self.shared_state.http.urlopen('GET', request_url, retries=retry, preload_content=False)
                        if conn.status != 200:
                            _print('[!]', conn.status, conn.reason)

                            # If the memento Link header is present, this is a website error
                            # instead of a Wayback error. Pass it along if that's the case.
                            if 'Link' not in conn.headers:
                                conn.drain_conn()
                                conn.release_conn()
                                return self.send_error_page(http_version, conn.status, conn.reason)

                        # Identify content type so we don't modify non-HTML content.
                        content_type = conn.headers.get('Content-Type')
                        if not CONTENT_TYPE_ENCODING:
                            idx = content_type.find(';')
                            if idx > -1:
                                content_type = content_type[:idx]
                        if 'text/html' in content_type:
                            # Consume all data and proceed with patching the page.
                            data = conn.read()
                            conn.release_conn()
                        else:
                            # Pass non-HTML data through.
                            return self.send_passthrough(conn, http_version, content_type, request_url)

                # Check if this is a Wayback Machine redirect page.
                if b'<title></title>' in data and b'<span class="label style-scope media-button"><!---->Wayback Machine<!----></span>' in data:
                    match = re.search(b'''<p class="impatient"><a href="(?:(?:https?:)?//web\\.archive\\.org)?/web/([^/]+)/([^"]+)">Impatient\\?</a></p>''', data)
                    if match:
                        # Sanitize the URL.
                        archived_url = self.sanitize_redirect(match.group(2).decode('ascii', 'ignore'))

                        # Add URL to the date LRU cache.
                        self.shared_state.date_cache[str(effective_date) + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')

                        # Get the original HTTP redirect code.
                        match = re.search(b'''<p class="code shift red">Got an HTTP ([0-9]+)''', data)
                        try:
                            redirect_code = int(match.group(1))
                        except:
                            redirect_code = 302

                        # Redirect client to the URL.
                        _print('[r]', archived_url)
                        return self.send_redirect_page(http_version, archived_url, redirect_code)

                # Remove pre-toolbar scripts and CSS.
                data = re.sub(b'''<script (?:type="text/javascript" )?src="(?:https?:)?//(?:web-static\\.)?archive\\.org/_static/js/.*<!-- End Wayback Rewrite JS Include -->\\r?\\n''', b'', data, count=1, flags=re.S)

                # Remove toolbar. The if_ asset tag serves no toolbar, but we remove it just in case.
                data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, count=1, flags=re.S)

                # Remove comments on footer.
                data = re.sub(b'''<!--\\r?\\n FILE ARCHIVED .*$''', b'', data, flags=re.S)

                # Fix base tag.
                data = re.sub(b'''(<base\\s+[^>]*href=["']?)(?:(?:https?:)?//web.archive.org)?/web/[^/]+/(?:[^:/]+://)?''', b'\\1http://', data, flags=re.I + re.S)

                # Remove extraneous :80 from links.
                data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^/:]+)://([^/:]+):80/', b'\\1\\2/\\3://\\4/', data)

                # Fix links.
                if QUICK_IMAGES:
                    # QUICK_IMAGES works by intercepting asset URLs (those
                    # with a date code ending in im_, js_...) and letting the
                    # proxy pass them through. This may reduce load time
                    # because Wayback doesn't have to hunt down the closest
                    # copy of that asset to DATE, as those URLs have specific
                    # date codes. This taints the HTML with web.archive.org
                    # URLs. QUICK_IMAGES=2 uses the original URLs with an added
                    # username:password, which taints less but is not supported
                    # by all browsers - IE notably kills the whole page if it
                    # sees an iframe pointing to an invalid URL.
                    def filter_asset(match):
                        if match.group(2) in (None, b'if_', b'fw_'): # non-asset URL
                            return match.group(3) == b'https://' and b'http://' or match.group(3) # convert secure non-asset URLs to regular HTTP
                        asset_type = match.group(2)
                        if asset_type == b'js_': # stop JavaScript code injection
                            asset_type = b'im_'
                        if QUICK_IMAGES == 2:
                            return b'http://' + match.group(1) + b':' + asset_type + b'@'
                        else:
                            return b'http://web.archive.org/web/' + match.group(1) + asset_type + b'/' + match.group(3)
                    data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)?/([^:/]+:(?://)?)', filter_asset, data)
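                    # Group 1 is the date code, group 2 the asset tag, group 3
                    # the scheme. Illustrative example (hypothetical URL):
                    # "/web/19970101im_/http://example.com/a.gif" becomes
                    # "http://web.archive.org/web/19970101im_/http://example.com/a.gif",
                    # or "http://19970101:im_@example.com/a.gif" with QUICK_IMAGES=2.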
                else:
                    # Remove asset URLs while simultaneously adding them to the date LRU cache
                    # with their respective date and converting secure URLs to regular HTTP.
                    def add_to_date_cache(match):
                        orig_url = match.group(2)
                        if orig_url[:8] == b'https://':
                            orig_url = b'http://' + orig_url[8:]
                        self.shared_state.date_cache[str(effective_date) + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore').replace('js_', 'im_')
                        return orig_url
                    data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
            elif mode == 1: # oocities
                # Remove viewport/cache-control/max-width code from the header.
                data = re.sub(b'''^.*?\n\n''', b'', data, flags=re.S)

                # Remove archive notice and tracking code from the footer.
                data = re.sub(b'''<style> \n.zoomout { -webkit-transition: .*$''', b'', data, flags=re.S)

                # Remove clearly labeled snippets from Geocities.
                data = re.sub(b'''^.*<\\!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<\\!-- following code added by server\\. PLEASE REMOVE -->.*<\\!-- preceding code added by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)

                # Fix links.
                data = re.sub(b'''//([^\\.]*\\.)?oocities\\.com/''', b'//\\1geocities.com/', data, flags=re.S)

            # Send patched page.
            self.send_response_headers(conn, http_version, content_type, request_url, content_length=len(data))
            self.request.sendall(data)
            self.request.close()
        else:
            # Pass non-HTML data through.
            self.send_passthrough(conn, http_version, content_type, request_url)

    def send_passthrough(self, conn, http_version, content_type, request_url):
        """Pass data through to the client unmodified (save for our headers)."""
        self.send_response_headers(conn, http_version, content_type, request_url, content_length=True)
        for data in conn.stream(1024):
            self.request.sendall(data)
        conn.release_conn()
        self.request.close()

    def send_response_headers(self, conn, http_version, content_type, request_url, content_length=False):
        """Generate and send the response headers."""
        # Pass the HTTP version, and error code if there is one.
        response = '{0} {1} {2}'.format(http_version, conn.status, conn.reason.replace('\n', ' '))

        # Add Content-Type, Content-Length and the caching ETag.
        response += '\r\nContent-Type: ' + content_type
        if type(content_length) == int:
            response += '\r\nContent-Length: ' + str(content_length)
            content_length = False # don't pass the original length through
        response += '\r\nETag: "' + request_url.replace('"', '') + '"'
        response += '\r\nConnection: close' # helps with IE6 trying to use proxy keep alive and holding half-open connections

        # Pass X-Archive-Orig-* (and Content-Length if requested) headers through.
        for header in conn.headers:
            if header.find('X-Archive-Orig-') == 0:
                orig_header = header[15:]
                # Skip headers which may affect client behavior.
                if orig_header.lower() not in ('connection', 'location', 'content-type', 'content-length', 'etag', 'authorization', 'set-cookie'):
                    response += '\r\n' + orig_header + ': ' + conn.headers[header]
            elif content_length and header.lower() == 'content-length':
                response += '\r\n' + header + ': ' + conn.headers[header]

        # Finish and send the response.
        response += '\r\n\r\n'
        self.request.sendall(response.encode('utf8', 'ignore'))

    def send_error_page(self, http_version, code, reason):
        """Generate an error page."""
        # Get a description for this error code.
        if code in (404, 508): # page not archived or redirect loop
            description = 'This page may not be archived by the Wayback Machine.'
        elif code == 403: # not crawled due to exclusion
            description = 'This page was not archived due to a Wayback Machine exclusion.'
        elif code == 501: # method not implemented
            description = 'WaybackProxy only implements the GET method.'
        elif code == 502: # exception
            description = 'This page could not be fetched due to an unknown error.'
        elif code == 504: # timeout
            description = 'This page could not be fetched due to a Wayback Machine server error.'
        elif code == 412: # outside of tolerance
            description = 'The earliest snapshot for this page is outside of the configured tolerance interval.'
        elif code == 400 and reason == 'Host header missing': # no host header in transparent mode
            description = 'WaybackProxy\'s transparent mode requires an HTTP/1.1 compliant client.'
        else: # another error
            description = 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'

        # Read error page file.
        try:
            with open('error.html', 'r', encoding='utf8', errors='ignore') as f:
                error_page = f.read()
        except:
            # Just send the code and reason as a backup.
            error_page = '${code} ${reason}'

        # Format error page template.
        signature = self.signature()
        error_page = string.Template(error_page).substitute(**locals())
        error_page_len = len(error_page)

        # Send formatted error page and stop.
        self.request.sendall(
            '{http_version} {code} {reason}\r\n'
            'Content-Type: text/html\r\n'
            'Content-Length: {error_page_len}\r\n'
            '\r\n'
            '{error_page}'
            .format(**locals()).encode('utf8', 'ignore')
        )
        self.request.close()

    def send_redirect_page(self, http_version, target, code=302):
        """Generate a redirect page."""
        # make redirect page
        redirectpage = '<html><head><title>Redirect</title><meta http-equiv="refresh" content="0;url='
        redirectpage += target
        redirectpage += '"></head><body><p>If you are not redirected, <a href="'
        redirectpage += target
        redirectpage += '">click here</a>.</p></body></html>'

        # send redirect page and stop
        self.request.sendall('{0} {1} Found\r\nLocation: {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, target, len(redirectpage), redirectpage).encode('utf8', 'ignore'))
        self.request.close()

    def guess_and_send_redirect(self, http_version, guess_url):
        # Heuristically determine the static URL for some redirect scripts.
        parsed = urllib.parse.urlparse(guess_url)
        match = re.search('''(?:^|&)[^=]+=((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', parsed.query, re.I) # URL in query parameters
        if not match:
            full_path = parsed.path
            if parsed.query:
                full_path += '?' + parsed.query
            match = re.search('''((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:(?:%2F|/).+|$))''', full_path, re.I) # URL in path or full query
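        # Illustrative example (hypothetical URL): a dead
        # "http://example.com/go?url=http%3A%2F%2Ftarget.com%2Fpage" yields
        # "http://target.com/page" from the first pattern; the second catches
        # URLs embedded in the path rather than in a query parameter.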
        if match: # found URL
            # Decode and sanitize the URL.
            new_url = self.sanitize_redirect(urllib.parse.unquote_plus(match.group(1)))

            # Redirect client to the URL.
            _print('[r] [g]', new_url)
            self.send_redirect_page(http_version, new_url)
            return True
        return False

    def handle_settings(self, query):
        """Generate the settings page."""
        global DATE, DATE_TOLERANCE, GEOCITIES_FIX, QUICK_IMAGES, WAYBACK_API, CONTENT_TYPE_ENCODING, SILENT, SETTINGS_PAGE

        if query != '': # handle any parameters that may have been sent
            parsed = urllib.parse.parse_qs(query)
            if 'date' in parsed and 'dateTolerance' in parsed:
                if DATE != parsed['date'][0]:
                    DATE = parsed['date'][0]
                    self.shared_state.date_cache.clear()
                    self.shared_state.availability_cache.clear()
                if DATE_TOLERANCE != parsed['dateTolerance'][0]:
                    DATE_TOLERANCE = parsed['dateTolerance'][0]
                GEOCITIES_FIX = 'gcFix' in parsed
                QUICK_IMAGES = 'quickImages' in parsed
                CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed

        # send the page and stop
        settingspage = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
        settingspage += '<html><head><title>WaybackProxy Settings</title></head><body><p><b>'
        settingspage += self.signature()
        settingspage += '</b></p><form method="get" action="/">'
        settingspage += '<p>Date to get pages from: <input type="text" name="date" size="8" value="'
        settingspage += str(DATE)
        settingspage += '"><p>Date tolerance: <input type="text" name="dateTolerance" size="8" value="'
        settingspage += str(DATE_TOLERANCE)
        settingspage += '"> days<br><input type="checkbox" name="gcFix"'
        if GEOCITIES_FIX:
            settingspage += ' checked'
        settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
        if QUICK_IMAGES:
            settingspage += ' checked'
        settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
        if CONTENT_TYPE_ENCODING:
            settingspage += ' checked'
        settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
        self.request.sendall(settingspage.encode('utf8', 'ignore')) # sendall, so the whole page is delivered
        self.request.close()

    def sanitize_redirect(self, url):
        """Sanitize an URL for client-side redirection."""
        if url[0] != '/' and '://' not in url:
            # Add protocol if the URL is absolute but missing a protocol.
            return 'http://' + url
        elif url[:8].lower() == 'https://':
            # Convert secure URLs to regular HTTP.
            return 'http://' + url[8:]
        else:
            # No changes required.
            return url

    def signature(self):
        """Return the server signature."""
        return 'WaybackProxy on {0}'.format(socket.gethostname())

    def wayback_to_datetime(self, date):
        """Convert a Wayback format date string to a datetime.datetime object."""
        date = str(date)
        fmt = '%Y%m%d%H%M%S'
        fmt_len = 14
        while fmt:
            try:
                return datetime.datetime.strptime(date[:fmt_len], fmt)
            except:
                fmt = fmt[:-2]
                fmt_len -= 2
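        # All formats failed => implicitly return None. The progressive
        # truncation above lets partial dates such as '199701' parse as %Y%m.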

print_lock = threading.Lock()

def _print(*args, **kwargs):
    """Logging function."""
    if SILENT:
        return
    with print_lock:
        print(*args, **kwargs, flush=True)

def main():
    """Starts the server."""
    server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
    _print('[-] Now listening on port', LISTEN_PORT)
    _print('[-] Date set to', DATE)
    try:
        server.serve_forever()
    except KeyboardInterrupt: # Ctrl+C to stop
        pass

if __name__ == '__main__':
    main()