waybackproxy.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. #!/usr/bin/env python
  2. import re, socket, SocketServer, urllib2, urlparse
  3. from config import *
  4. class ThreadingTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
  5. """TCPServer with ThreadingMixIn added."""
  6. pass
  7. class Handler(SocketServer.BaseRequestHandler):
  8. """Main request handler."""
  9. def handle(self):
  10. """Handle a request."""
  11. # readline is pretty convenient
  12. f = self.request.makefile()
  13. # read request line
  14. reqline = line = f.readline()
  15. split = line.rstrip('\r\n').split(' ')
  16. http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
  17. if split[0] != 'GET':
  18. # only GET is implemented
  19. return self.error_page(http_version, 501, 'Not Implemented')
  20. # parse the URL
  21. request_url = split[1]
  22. parsed = urlparse.urlparse(request_url)
  23. # make a path
  24. path = parsed.path
  25. if parsed.query != '': path += '?' + parsed.query
  26. if path == '': path == '/'
  27. # get the hostname for later
  28. host = parsed.netloc.split(':')
  29. hostname = host[0]
  30. # read out the headers
  31. while line.rstrip('\r\n') != '':
  32. line = f.readline()
  33. try:
  34. if hostname == 'web.archive.org':
  35. if path[:5] != '/web/':
  36. # launch settings
  37. return self.handle_settings(parsed.query)
  38. else:
  39. # pass-through requests to web.archive.org
  40. # required for QUICK_IMAGES
  41. print '[>] [QI] {0}'.format('/'.join(request_url.split('/')[5:]))
  42. conn = urllib2.urlopen(request_url)
  43. elif GEOCITIES_FIX and hostname == 'www.geocities.com':
  44. # apply GEOCITIES_FIX and pass it through
  45. split = request_url.split('/')
  46. hostname = split[2] = 'www.oocities.org'
  47. request_url = '/'.join(split)
  48. print '[>] {0}'.format(request_url)
  49. conn = urllib2.urlopen(request_url)
  50. else:
  51. # get from Wayback
  52. print '[>] {0}'.format(request_url)
  53. conn = urllib2.urlopen('http://web.archive.org/web/{0}/{1}'.format(DATE, request_url))
  54. except urllib2.HTTPError as e:
  55. # an error has been found
  56. print '[!] {0} {1}'.format(e.code, e.reason)
  57. return self.error_page(http_version, e.code, e.reason)
  58. # get content type
  59. content_type = conn.info().getheader('Content-Type')
  60. if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
  61. # send headers
  62. self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\n\r\n'.format(http_version, content_type))
  63. # set the mode: [0]wayback [1]oocities
  64. mode = 0
  65. if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1
  66. if content_type[:9] == 'text/html' in content_type: # HTML
  67. toolbar = mode == 1 # oocities header starts without warning
  68. after_header = False
  69. redirect_page = False
  70. for line in conn:
  71. line = line.rstrip('\r\n')
  72. if mode == 0:
  73. if toolbar:
  74. if line == '<!-- END WAYBACK TOOLBAR INSERT -->':
  75. # toolbar is done - resume relaying on the next line
  76. toolbar = False
  77. after_header = True
  78. continue
  79. elif redirect_page:
  80. # this is a really bad way to deal with Wayback's 302
  81. # pages, but necessary with the way this proxy works
  82. match = re.search('<p class="impatient"><a href="/web/(?:[^/]+)/([^"]+)">Impatient\\?</a></p>', line)
  83. if match:
  84. line = '<title>WaybackProxy Redirect</title><meta http-equiv="refresh" content="0;url='
  85. line += match.group(1)
  86. line += '"></head><body>If you are not redirected, <a href="'
  87. line += match.group(1)
  88. line += '">click here</a>.</body></html>'
  89. self.request.sendall(line)
  90. break
  91. continue
  92. if not after_header:
  93. ll = line.lower()
  94. if line == '<script type="text/javascript" src="/static/js/analytics.js"></script>' or line == '<link type="text/css" rel="stylesheet" href="/static/css/banner-styles.css"/>' or line[:69] == '<script type="text/javascript">archive_analytics.values.server_name="':
  95. # remove the CSS and tracking scripts added to <head>
  96. continue
  97. elif ll[:6] == '<base ':
  98. # fix base
  99. line = re.sub('/web/([0-9]+)/', '', line)
  100. if line == '<!-- BEGIN WAYBACK TOOLBAR INSERT -->':
  101. # remove the toolbar - stop relaying from now on
  102. toolbar = True
  103. continue
  104. elif line == '\t\t<title>Internet Archive Wayback Machine</title>':
  105. # redirect 302s - see the redirect_page code above
  106. redirect_page = True
  107. continue
  108. if QUICK_IMAGES:
  109. # QUICK_IMAGES works by intercepting asset URLs (those
  110. # with a date code ending in im_, js_...) and letting the
  111. # proxy pass them through. This may reduce load time
  112. # because Wayback doesn't have to hunt down the closest
  113. # copy of that asset to DATE, as those URLs have specific
  114. # date codes. The only side effect is tainting the HTML
  115. # with web.archive.org URLs.
  116. line = re.sub('/web/([0-9]+)([a-z]+_)/',
  117. 'http://web.archive.org/web/\\1\\2/', line)
  118. line = re.sub('/web/([0-9]+)/', '', line)
  119. else:
  120. line = re.sub('/web/([^/]+)/', '', line)
  121. elif mode == 1:
  122. # remove the geocities/oocities-added code, which is
  123. # conveniently wrapped around comments
  124. if toolbar:
  125. if line in ['<!-- text above generated by server. PLEASE REMOVE -->', '<!-- preceding code added by server. PLEASE REMOVE -->']:
  126. toolbar = False
  127. continue
  128. elif line == '<!-- following code added by server. PLEASE REMOVE -->' or line[:54] == '<!-- text below generated by server. PLEASE REMOVE -->':
  129. toolbar = True
  130. continue
  131. # taint? what taint?
  132. line = line.replace('http://oocities.com', 'http://geocities.com')
  133. line = line.replace('http://www.oocities.com', 'http://www.geocities.com')
  134. self.request.sendall(line)
  135. self.request.sendall('\r\n')
  136. else: # other data
  137. while True:
  138. data = conn.read(1024)
  139. if data == '': break
  140. self.request.sendall(data)
  141. self.request.close()
  142. def error_page(self, http_version, code, reason):
  143. """Generate an error page."""
  144. # make error page
  145. errorpage = '<html><head><title>{0} {1}</title></head><body><h1>{1}</h1><p>'.format(code, reason)
  146. # add code information
  147. if code == 404: # page not archived
  148. errorpage += 'This page may not be archived by the Wayback Machine.'
  149. elif code == 403: # not crawled due to robots.txt
  150. errorpage += 'This page was not archived due to a robots.txt block.'
  151. elif code == 501: # method not implemented
  152. errorpage += 'WaybackProxy only implements the GET method.'
  153. else: # another error
  154. errorpage += 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'
  155. errorpage += '</p><hr><i>{0}</i></body></html>'.format(self.signature())
  156. # send error page and stop
  157. self.request.sendall('{0} {1} {2}\r\nContent-Length: {3}\r\n\r\n'.format(http_version, code, reason, len(errorpage)))
  158. self.request.sendall(errorpage)
  159. self.request.close()
  160. def handle_settings(self, query):
  161. """Generate the settings page."""
  162. global DATE, GEOCITIES_FIX, QUICK_IMAGES, CONTENT_TYPE_ENCODING
  163. if query != '': # handle any parameters that may have been sent
  164. parsed = urlparse.parse_qs(query)
  165. if 'date' in parsed: DATE = parsed['date'][0]
  166. GEOCITIES_FIX = 'gcFix' in parsed
  167. QUICK_IMAGES = 'quickImages' in parsed
  168. CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed
  169. # send the page and stop
  170. self.request.sendall('HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n')
  171. self.request.sendall('<html><head><title>WaybackProxy Settings</title></head><body><p><b>')
  172. self.request.sendall(self.signature())
  173. self.request.sendall('</b></p><form method="get" action="/"><p>Date to get pages from: <input type="text" name="date" size="8" value="')
  174. self.request.sendall(DATE)
  175. self.request.sendall('"><br><input type="checkbox" name="gcFix"')
  176. if GEOCITIES_FIX: self.request.sendall(' checked')
  177. self.request.sendall('> Geocities Fix<br><input type="checkbox" name="quickImages"')
  178. if QUICK_IMAGES: self.request.sendall(' checked')
  179. self.request.sendall('> Quick images<br><input type="checkbox" name="ctEncoding"')
  180. if CONTENT_TYPE_ENCODING: self.request.sendall(' checked')
  181. self.request.sendall('> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>')
  182. self.request.close()
  183. def signature(self):
  184. """Return the server signature."""
  185. return 'WaybackProxy on {0}'.format(socket.gethostname())
  186. def main():
  187. """Starts the server."""
  188. server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
  189. print '[-] Now listening on port {0}'.format(LISTEN_PORT)
  190. try:
  191. server.serve_forever()
  192. except KeyboardInterrupt: # Ctrl+C to stop
  193. pass
  194. if __name__ == '__main__':
  195. main()