#!/usr/bin/env python
# waybackproxy.py
import base64
import html
import http.client
import re
import socket
import socketserver
import sys
import threading
import urllib.error
import urllib.parse
import urllib.request

from config import *
  4. class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
  5. """TCPServer with ThreadingMixIn added."""
  6. pass
  7. class Handler(socketserver.BaseRequestHandler):
  8. """Main request handler."""
  9. def handle(self):
  10. """Handle a request."""
  11. global DATE
  12. # readline is pretty convenient
  13. f = self.request.makefile()
  14. # read request line
  15. reqline = line = f.readline()
  16. split = line.rstrip('\r\n').split(' ')
  17. http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
  18. if split[0] != 'GET':
  19. # only GET is implemented
  20. return self.error_page(http_version, 501, 'Not Implemented')
  21. # parse the URL
  22. request_url = archived_url = split[1]
  23. parsed = urllib.parse.urlparse(request_url)
  24. # make a path
  25. path = parsed.path
  26. if parsed.query != '': path += '?' + parsed.query
  27. if path == '': path == '/'
  28. # get the hostname for later
  29. host = parsed.netloc.split(':')
  30. hostname = host[0]
  31. # read out the headers, saving the PAC file host
  32. pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
  33. auth = None
  34. while line.rstrip('\r\n') != '':
  35. line = f.readline()
  36. ll = line.lower()
  37. if ll[:6] == 'host: ':
  38. pac_host = line[6:].rstrip('\r\n')
  39. if ':' not in pac_host: # who would run this on port 80 anyway?
  40. pac_host += ':80'
  41. elif ll[:21] == 'x-waybackproxy-date: ':
  42. # API for a personal project of mine
  43. new_date = line[21:].rstrip('\r\n')
  44. if DATE != new_date:
  45. DATE = new_date
  46. print('[-] Header requested date', DATE)
  47. elif ll[:21] == 'authorization: basic ':
  48. # asset date code passed as username:password
  49. auth = base64.b64decode(ll[21:])
  50. try:
  51. if path == '/proxy.pac':
  52. # PAC file to bypass QUICK_IMAGES requests
  53. pac = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n'''
  54. pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
  55. pac += b'''\r\n'''
  56. pac += b'''function FindProxyForURL(url, host)\r\n'''
  57. pac += b'''{\r\n'''
  58. pac += b''' if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
  59. pac += b''' {\r\n'''
  60. pac += b''' return "DIRECT";\r\n'''
  61. pac += b''' }\r\n'''
  62. pac += b''' return "PROXY ''' + pac_host.encode('ascii', 'ignore') + b'''";\r\n'''
  63. pac += b'''}\r\n'''
  64. self.request.sendall(pac)
  65. return
  66. elif hostname == 'web.archive.org' or auth:
  67. if path[:5] != '/web/':
  68. # launch settings
  69. return self.handle_settings(parsed.query)
  70. else:
  71. # pass-through requests to web.archive.org
  72. # required for QUICK_IMAGES
  73. # did we get an username:password with an asset date code?
  74. if auth:
  75. request_url = 'http://web.archive.org/web/{0}/{1}'.format(auth.replace(':', ''), archived_url)
  76. else:
  77. archived_url = '/'.join(request_url.split('/')[5:])
  78. _print('[>] [QI] {0}'.format(archived_url))
  79. try:
  80. conn = urllib.request.urlopen(request_url)
  81. except urllib.error.HTTPError as e:
  82. if e.code == 404:
  83. # Try this file on another date, might be redundant
  84. return self.redirect_page(http_version, archived_url)
  85. else:
  86. raise e
  87. elif GEOCITIES_FIX and hostname == 'www.geocities.com':
  88. # apply GEOCITIES_FIX and pass it through
  89. _print('[>] {0}'.format(archived_url))
  90. split = archived_url.split('/')
  91. hostname = split[2] = 'www.oocities.org'
  92. request_url = '/'.join(split)
  93. conn = urllib.request.urlopen(request_url)
  94. else:
  95. # get from Wayback
  96. _print('[>] {0}'.format(archived_url))
  97. request_url = 'http://web.archive.org/web/{0}/{1}'.format(DATE, archived_url)
  98. conn = urllib.request.urlopen(request_url)
  99. except urllib.error.HTTPError as e:
  100. # an error has been found
  101. # 403 or 404 => heuristically determine the static URL for some redirect scripts
  102. if e.code in (403, 404):
  103. match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to|go)?(?:url)?=(http[^&]+)''', archived_url, re.IGNORECASE)
  104. if not match:
  105. match = re.search('''/(?:target|trg|dest(?:ination)?|to|go)?(?:url)?/(http.+)''', archived_url, re.IGNORECASE)
  106. if match:
  107. # we found it
  108. new_url = urllib.parse.unquote_plus(match.group(1))
  109. _print('[r]', new_url)
  110. return self.redirect_page(http_version, new_url)
  111. _print('[!] {0} {1}'.format(e.code, e.reason))
  112. return self.error_page(http_version, e.code, e.reason)
  113. # get content type
  114. content_type = conn.info().get('Content-Type')
  115. if content_type == None: content_type = 'text/html'
  116. if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
  117. # set the mode: [0]wayback [1]oocities
  118. mode = 0
  119. if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1
  120. if 'text/html' in content_type: # HTML
  121. # Some dynamically generated links may end up pointing to
  122. # web.archive.org. Correct that by redirecting the Wayback
  123. # portion of the URL away if it ends up being HTML consumed
  124. # through the QUICK_IMAGES interface.
  125. if hostname == 'web.archive.org':
  126. conn.close()
  127. return self.redirect_page(http_version, '/'.join(request_url.split('/')[5:]), 301)
  128. # consume all data
  129. data = conn.read()
  130. # patch the page
  131. if mode == 0: # wayback
  132. if b'<title>Wayback Machine</title>' in data:
  133. match = re.search(b'<iframe id="playback" src="((?:(?:http(?:s)?:)?//web.archive.org)?/web/[^"]+)"', data)
  134. if match:
  135. # media playback iframe
  136. # Some websites (especially ones that use frames)
  137. # inexplicably render inside a media playback iframe.
  138. # In that case, a simple redirect would result in a
  139. # redirect loop. Download the URL and render it instead.
  140. request_url = match.group(1).decode('ascii', 'ignore')
  141. archived_url = '/'.join(request_url.split('/')[5:])
  142. print('[f]', archived_url)
  143. try:
  144. conn = urllib.request.urlopen(request_url)
  145. except urllib.error.HTTPError as e:
  146. _print('[!]', e.code, e.reason)
  147. return self.error_page(http_version, e.code, e.reason)
  148. content_type = conn.info().get('Content-Type')
  149. if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
  150. data = conn.read()
  151. if b'<title></title>' in data and b'<h1><span>Internet Archive\'s Wayback Machine</span></h1>' in data:
  152. match = re.search(b'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/(?:[^/]+)/([^"]+)">Impatient\?</a></p>', data)
  153. if match:
  154. # wayback redirect page, follow it
  155. match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
  156. try:
  157. redirect_code = int(match2.group(1))
  158. except:
  159. redirect_code = 302
  160. archived_url = match.group(1).decode('ascii', 'ignore')
  161. print('[r]', archived_url)
  162. return self.redirect_page(http_version, archived_url, redirect_code)
  163. # pre-toolbar scripts and CSS
  164. data = re.sub(b'<script src="//archive\.org/(?:.*)<!-- End Wayback Rewrite JS Include -->', b'', data, flags=re.S)
  165. # toolbar
  166. data = re.sub(b'<!-- BEGIN WAYBACK TOOLBAR INSERT -->(?:.*)<!-- END WAYBACK TOOLBAR INSERT -->', b'', data, flags=re.S)
  167. # comments on footer
  168. data = re.sub(b'\n<!--\n FILE ARCHIVED (?:.*)$', b'', data, flags=re.S)
  169. # fix base tag
  170. data = re.sub(b'(<base (?:[^>]*)href=(?:["\'])?)(?:(?:http(?:s)?:)?//web.archive.org)?/web/(?:[^/]+)/', b'\\1', data, flags=re.I + re.S)
  171. # remove extraneous :80 from links
  172. data = re.sub(b'((?:(?:http(?:s)?:)?//web.archive.org)?/web/)([^/]+)/([^:]+)://([^:]+):80/', b'\\1\\2/\\3://\\4/', data)
  173. # fix links
  174. if QUICK_IMAGES:
  175. # QUICK_IMAGES works by intercepting asset URLs (those
  176. # with a date code ending in im_, js_...) and letting the
  177. # proxy pass them through. This may reduce load time
  178. # because Wayback doesn't have to hunt down the closest
  179. # copy of that asset to DATE, as those URLs have specific
  180. # date codes. This taints the HTML with web.archive.org
  181. # URLs. QUICK_IMAGES=2 uses the original URLs with an added
  182. # username:password, which taints less but is not supported
  183. # by all browsers - IE6 notably kills the whole page if it
  184. # sees an iframe pointing to an invalid URL.
  185. data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:]+)://',
  186. QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
  187. data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)
  188. else:
  189. data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
  190. elif mode == 1: # oocities
  191. # viewport/cache-control/max-width code (header)
  192. data = re.sub(b'^(?:.*?)\n\n', b'', data, flags=re.S)
  193. # archive notice and tracking code (footer)
  194. data = re.sub(b'<style> \n.zoomout { -webkit-transition: (?:.*)$', b'', data, flags=re.S)
  195. # clearly labeled snippets from Geocities
  196. data = re.sub(b'^(?:.*)<\!-- text above generated by server\. PLEASE REMOVE -->', b'', data, flags=re.S)
  197. data = re.sub(b'<\!-- following code added by server\. PLEASE REMOVE -->(?:.*)<\!-- preceding code added by server\. PLEASE REMOVE -->', b'', data, flags=re.S)
  198. data = re.sub(b'<\!-- text below generated by server\. PLEASE REMOVE -->(?:.*)$', b'', data, flags=re.S)
  199. # fix links
  200. data = re.sub(b'//([^.]*)\.oocities\.com/', b'//\\1.geocities.com/', data, flags=re.S)
  201. self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
  202. self.request.sendall(data)
  203. else: # other data
  204. self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
  205. while True:
  206. data = conn.read(1024)
  207. if not data: break
  208. self.request.sendall(data)
  209. self.request.close()
  210. def error_page(self, http_version, code, reason):
  211. """Generate an error page."""
  212. # make error page
  213. errorpage = '<html><head><title>{0} {1}</title></head><body><h1>{1}</h1><p>'.format(code, reason)
  214. # add code information
  215. if code == 404: # page not archived
  216. errorpage += 'This page may not be archived by the Wayback Machine.'
  217. elif code == 403: # not crawled due to robots.txt
  218. errorpage += 'This page was not archived due to a robots.txt block.'
  219. elif code == 501: # method not implemented
  220. errorpage += 'WaybackProxy only implements the GET method.'
  221. else: # another error
  222. errorpage += 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'
  223. errorpage += '</p><hr><i>'
  224. errorpage += self.signature()
  225. errorpage += '</i></body></html>'
  226. # send error page and stop
  227. self.request.sendall('{0} {1} {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, reason, len(errorpage), errorpage).encode('utf8', 'ignore'))
  228. self.request.close()
  229. def redirect_page(self, http_version, target, code=302):
  230. """Generate a redirect page."""
  231. # make redirect page
  232. redirectpage = '<html><head><title>Redirect</title><meta http-equiv="refresh" content="0;url='
  233. redirectpage += target
  234. redirectpage += '"></head><body><p>If you are not redirected, <a href="'
  235. redirectpage += target
  236. redirectpage += '">click here</a>.</p></body></html>'
  237. # send redirect page and stop
  238. self.request.sendall('{0} {1} Found\r\nLocation: {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, target, len(redirectpage), redirectpage).encode('utf8', 'ignore'))
  239. self.request.close()
  240. def handle_settings(self, query):
  241. """Generate the settings page."""
  242. global DATE, GEOCITIES_FIX, QUICK_IMAGES, CONTENT_TYPE_ENCODING
  243. if query != '': # handle any parameters that may have been sent
  244. parsed = urllib.parse.parse_qs(query)
  245. if 'date' in parsed: DATE = parsed['date'][0]
  246. GEOCITIES_FIX = 'gcFix' in parsed
  247. QUICK_IMAGES = 'quickImages' in parsed
  248. CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed
  249. # send the page and stop
  250. settingspage = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
  251. settingspage += '<html><head><title>WaybackProxy Settings</title></head><body><p><b>'
  252. settingspage += self.signature()
  253. settingspage += '</b></p><form method="get" action="/"><p>Date to get pages from: <input type="text" name="date" size="8" value="'
  254. settingspage += DATE
  255. settingspage += '"><br><input type="checkbox" name="gcFix"'
  256. if GEOCITIES_FIX: settingspage += ' checked'
  257. settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
  258. if QUICK_IMAGES: settingspage += ' checked'
  259. settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
  260. if CONTENT_TYPE_ENCODING: settingspage += ' checked'
  261. settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
  262. self.request.send(settingspage.encode('utf8', 'ignore'))
  263. self.request.close()
  264. def signature(self):
  265. """Return the server signature."""
  266. return 'WaybackProxy on {0}'.format(socket.gethostname())
  267. print_lock = threading.Lock()
  268. def _print(*args, linebreak=True):
  269. """Logging function."""
  270. if SILENT: return
  271. s = ' '.join([str(x) for x in args])
  272. print_lock.acquire()
  273. sys.stdout.write(linebreak and (s + '\n') or s)
  274. sys.stdout.flush()
  275. print_lock.release()
  276. def main():
  277. """Starts the server."""
  278. server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
  279. _print('[-] Now listening on port {0}'.format(LISTEN_PORT))
  280. try:
  281. server.serve_forever()
  282. except KeyboardInterrupt: # Ctrl+C to stop
  283. pass
  284. if __name__ == '__main__':
  285. main()