waybackproxy.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. #!/usr/bin/env python
  2. import base64, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
  3. from config import *
  4. class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
  5. """TCPServer with ThreadingMixIn added."""
  6. pass
  7. class Handler(socketserver.BaseRequestHandler):
  8. """Main request handler."""
  9. def handle(self):
  10. """Handle a request."""
  11. global DATE
  12. # readline is pretty convenient
  13. f = self.request.makefile()
  14. # read request line
  15. reqline = line = f.readline()
  16. split = line.rstrip('\r\n').split(' ')
  17. http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
  18. if split[0] != 'GET':
  19. # only GET is implemented
  20. return self.error_page(http_version, 501, 'Not Implemented')
  21. # parse the URL
  22. request_url = split[1]
  23. parsed = urllib.parse.urlparse(request_url)
  24. # make a path
  25. path = parsed.path
  26. if parsed.query != '': path += '?' + parsed.query
  27. if path == '': path == '/'
  28. # get the hostname for later
  29. host = parsed.netloc.split(':')
  30. hostname = host[0]
  31. # read out the headers, saving the PAC file host
  32. pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
  33. while line.rstrip('\r\n') != '':
  34. line = f.readline()
  35. if line[:6].lower() == 'host: ':
  36. pac_host = line[6:].rstrip('\r\n')
  37. if ':' not in pac_host: # who would run this on port 80 anyway?
  38. pac_host += ':80'
  39. elif line[:21].lower() == 'x-waybackproxy-date: ':
  40. # API for a personal project of mine
  41. new_date = line[21:].rstrip('\r\n')
  42. if DATE != new_date:
  43. DATE = new_date
  44. print('[-] Header requested date', DATE)
  45. try:
  46. if path == '/proxy.pac':
  47. # PAC file to bypass QUICK_IMAGES requests
  48. pac = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n'''
  49. pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
  50. pac += b'''\r\n'''
  51. pac += b'''function FindProxyForURL(url, host)\r\n'''
  52. pac += b'''{\r\n'''
  53. pac += b''' if (shExpMatch(url, "http://web.archive.org/web/*"))\r\n'''
  54. pac += b''' {\r\n'''
  55. pac += b''' return "DIRECT";\r\n'''
  56. pac += b''' }\r\n'''
  57. pac += b''' return "PROXY ''' + pac_host.encode('ascii', 'ignore') + b'''";\r\n'''
  58. pac += b'''}\r\n'''
  59. self.request.sendall(pac)
  60. return
  61. elif hostname == 'web.archive.org':
  62. if path[:5] != '/web/':
  63. # launch settings
  64. return self.handle_settings(parsed.query)
  65. else:
  66. # pass-through requests to web.archive.org
  67. # required for QUICK_IMAGES
  68. _print('[>] [QI] {0}'.format('/'.join(request_url.split('/')[5:])))
  69. conn = urllib.request.urlopen(request_url)
  70. elif GEOCITIES_FIX and hostname == 'www.geocities.com':
  71. # apply GEOCITIES_FIX and pass it through
  72. split = request_url.split('/')
  73. hostname = split[2] = 'www.oocities.org'
  74. request_url = '/'.join(split)
  75. _print('[>] {0}'.format(request_url))
  76. conn = urllib.request.urlopen(request_url)
  77. else:
  78. # get from Wayback
  79. _print('[>] {0}'.format(request_url))
  80. conn = urllib.request.urlopen('http://web.archive.org/web/{0}/{1}'.format(DATE, request_url))
  81. except urllib.error.HTTPError as e:
  82. # an error has been found
  83. # 403 or 404 => heuristically determine the static URL for some redirect scripts
  84. if e.code in (403, 404):
  85. match = re.search('''(?:\?|&)(?:target|trg|dest(?:ination)?|to)(?:url)?=(http[^&]+)''', request_url, re.IGNORECASE)
  86. if match:
  87. # we found it
  88. new_url = urllib.parse.unquote_plus(match.group(1))
  89. _print('[r]', new_url)
  90. return self.redirect_page(http_version, new_url)
  91. _print('[!] {0} {1}'.format(e.code, e.reason))
  92. return self.error_page(http_version, e.code, e.reason)
  93. # get content type
  94. content_type = conn.info().get('Content-Type')
  95. if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
  96. # send headers
  97. self.request.sendall(http_version.encode('ascii', 'ignore') + b' 200 OK\r\nContent-Type: ' + content_type.encode('ascii', 'ignore') + b'\r\n\r\n')
  98. # set the mode: [0]wayback [1]oocities
  99. mode = 0
  100. if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1
  101. if content_type[:9] == 'text/html' in content_type: # HTML
  102. toolbar = mode == 1 # oocities header starts without warning
  103. redirect_page = False
  104. for line in conn:
  105. line = line.rstrip(b'\r\n')
  106. if mode == 0:
  107. if toolbar:
  108. for delimiter in (b'<\!-- END WAYBACK TOOLBAR INSERT -->', b'<\!-- End Wayback Rewrite JS Include -->'):
  109. if re.search(delimiter, line):
  110. # toolbar is done - resume relaying on the next line
  111. toolbar = False
  112. line = re.sub(delimiter, b'', line)
  113. break
  114. if toolbar: continue
  115. elif redirect_page:
  116. # this is a really bad way to deal with Wayback's 302
  117. # pages, but necessary with the way this proxy works
  118. match = re.search(b'<p class="impatient"><a href="/web/(?:[^/]+)/([^"]+)">Impatient\\?</a></p>', line)
  119. if match:
  120. line = b'<title>WaybackProxy Redirect</title><meta http-equiv="refresh" content="0;url='
  121. line += match.group(1)
  122. line += b'"></head><body>If you are not redirected, <a href="'
  123. line += match.group(1)
  124. line += b'">click here</a>.</body></html>'
  125. self.request.sendall(line)
  126. break
  127. continue
  128. if b'<base ' in line.lower():
  129. # fix base
  130. line = re.sub(b'(?:http://web\.archive\.org)?/web/([0-9]+)/', b'', line)
  131. elif line == b'\t\t<title>Internet Archive Wayback Machine</title>':
  132. # redirect 302s - see the redirect_page code above
  133. redirect_page = True
  134. continue
  135. else:
  136. for delimiter in (
  137. b'<\!-- BEGIN WAYBACK TOOLBAR INSERT -->',
  138. b'<script src="//archive\.org/([^"]+)" type="text/javascript"></script>'
  139. ):
  140. if re.search(delimiter, line):
  141. # remove the toolbar - stop relaying from now on
  142. toolbar = True
  143. line = re.sub(delimiter, b'', line)
  144. break
  145. if QUICK_IMAGES:
  146. # QUICK_IMAGES works by intercepting asset URLs (those
  147. # with a date code ending in im_, js_...) and letting the
  148. # proxy pass them through. This may reduce load time
  149. # because Wayback doesn't have to hunt down the closest
  150. # copy of that asset to DATE, as those URLs have specific
  151. # date codes. The only side effect is tainting the HTML
  152. # with web.archive.org URLs.
  153. line = re.sub(b'(?:http://web.archive.org)?/web/([0-9]+)([a-z]+_)/',
  154. b'http://web.archive.org/web/\\1\\2/', line)
  155. line = re.sub(b'(?:http://web.archive.org)?/web/([0-9]+)/', b'', line)
  156. else:
  157. line = re.sub(b'(?:http://web.archive.org)?/web/([^/]+)/', b'', line)
  158. elif mode == 1:
  159. # remove the geocities/oocities-added code, which is
  160. # conveniently wrapped around comments
  161. if toolbar:
  162. if line in (
  163. b'<!-- text above generated by server. PLEASE REMOVE -->',
  164. b'<!-- preceding code added by server. PLEASE REMOVE -->'
  165. ):
  166. toolbar = False
  167. continue
  168. elif line == b'<!-- following code added by server. PLEASE REMOVE -->' \
  169. or line[:54] == b'<!-- text below generated by server. PLEASE REMOVE -->':
  170. toolbar = True
  171. continue
  172. # taint? what taint?
  173. line = line.replace(b'http://oocities.com', b'http://geocities.com')
  174. line = line.replace(b'http://www.oocities.com', b'http://www.geocities.com')
  175. self.request.sendall(line)
  176. self.request.sendall(b'\r\n')
  177. else: # other data
  178. while True:
  179. data = conn.read(1024)
  180. if not data: break
  181. self.request.sendall(data)
  182. self.request.close()
  183. def error_page(self, http_version, code, reason):
  184. """Generate an error page."""
  185. # make error page
  186. errorpage = '<html><head><title>{0} {1}</title></head><body><h1>{1}</h1><p>'.format(code, reason)
  187. # add code information
  188. if code == 404: # page not archived
  189. errorpage += 'This page may not be archived by the Wayback Machine.'
  190. elif code == 403: # not crawled due to robots.txt
  191. errorpage += 'This page was not archived due to a robots.txt block.'
  192. elif code == 501: # method not implemented
  193. errorpage += 'WaybackProxy only implements the GET method.'
  194. else: # another error
  195. errorpage += 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'
  196. errorpage += '</p><hr><i>'
  197. errorpage += self.signature()
  198. errorpage += '</i></body></html>'
  199. # send error page and stop
  200. self.request.sendall('{0} {1} {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, reason, len(errorpage), errorpage).encode('utf8', 'ignore'))
  201. self.request.close()
  202. def redirect_page(self, http_version, target, code=302):
  203. """Generate a redirect page."""
  204. # make redirect page
  205. redirectpage = '<html><head><title>Redirect</title><meta http-equiv="refresh" content="0;url='
  206. redirectpage += target
  207. redirectpage += '"></head><body><p>If you are not redirected, <a href="'
  208. redirectpage += target
  209. redirectpage += '">click here</a>.</p></body></html>'
  210. # send redirect page and stop
  211. self.request.sendall('{0} {1} Found\r\nLocation: {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n'.format(http_version, code, target, len(redirectpage), redirectpage).encode('utf8', 'ignore'))
  212. self.request.close()
  213. def handle_settings(self, query):
  214. """Generate the settings page."""
  215. global DATE, GEOCITIES_FIX, QUICK_IMAGES, CONTENT_TYPE_ENCODING
  216. if query != '': # handle any parameters that may have been sent
  217. parsed = urllib.parse.parse_qs(query)
  218. if 'date' in parsed: DATE = parsed['date'][0]
  219. GEOCITIES_FIX = 'gcFix' in parsed
  220. QUICK_IMAGES = 'quickImages' in parsed
  221. CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed
  222. # send the page and stop
  223. settingspage = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
  224. settingspage += '<html><head><title>WaybackProxy Settings</title></head><body><p><b>'
  225. settingspage += self.signature()
  226. settingspage += '</b></p><form method="get" action="/"><p>Date to get pages from: <input type="text" name="date" size="8" value="'
  227. settingspage += DATE
  228. settingspage += '"><br><input type="checkbox" name="gcFix"'
  229. if GEOCITIES_FIX: settingspage += ' checked'
  230. settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
  231. if QUICK_IMAGES: settingspage += ' checked'
  232. settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
  233. if CONTENT_TYPE_ENCODING: settingspage += ' checked'
  234. settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
  235. self.request.send(settingspage.encode('utf8', 'ignore'))
  236. self.request.close()
  237. def signature(self):
  238. """Return the server signature."""
  239. return 'WaybackProxy on {0}'.format(socket.gethostname())
  240. print_lock = threading.Lock()
  241. def _print(*args, linebreak=True):
  242. """Logging function."""
  243. s = ' '.join(args)
  244. print_lock.acquire()
  245. sys.stdout.write(linebreak and (s + '\n') or s)
  246. sys.stdout.flush()
  247. print_lock.release()
  248. def main():
  249. """Starts the server."""
  250. server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
  251. _print('[-] Now listening on port {0}'.format(LISTEN_PORT))
  252. try:
  253. server.serve_forever()
  254. except KeyboardInterrupt: # Ctrl+C to stop
  255. pass
  256. if __name__ == '__main__':
  257. main()