waybackproxy.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. #!/usr/bin/env python
  2. import base64, datetime, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
  3. from config import *
# internal LRU dictionary for preserving URLs on redirect
# keys are "<date>\x00<archived_url>"; entries expire after 24 hours
# (maxduration is in seconds) and at most 1024 entries are kept
date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)
class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCPServer with ThreadingMixIn added: each incoming request is
    handled in its own thread so slow Wayback fetches don't block others."""
    pass
  9. class Handler(socketserver.BaseRequestHandler):
  10. """Main request handler."""
  11. def handle(self):
  12. """Handle a request."""
  13. global DATE
  14. # readline is pretty convenient
  15. f = self.request.makefile()
  16. # read request line
  17. reqline = line = f.readline()
  18. split = line.rstrip('\r\n').split(' ')
  19. http_version = len(split) > 2 and split[2] or 'HTTP/0.9'
  20. if split[0] != 'GET':
  21. # only GET is implemented
  22. return self.error_page(http_version, 501, 'Not Implemented')
  23. # parse the URL
  24. request_url = archived_url = split[1]
  25. parsed = urllib.parse.urlparse(request_url)
  26. # make a path
  27. path = parsed.path
  28. if parsed.query != '': path += '?' + parsed.query
  29. if path == '': path == '/'
  30. # get the hostname for later
  31. host = parsed.netloc.split(':')
  32. hostname = host[0]
  33. # read out the headers, saving the PAC file host
  34. pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
  35. effective_date = DATE
  36. auth = None
  37. while line.rstrip('\r\n') != '':
  38. line = f.readline()
  39. ll = line.lower()
  40. if ll[:6] == 'host: ':
  41. pac_host = line[6:].rstrip('\r\n')
  42. if ':' not in pac_host: # who would run this on port 80 anyway?
  43. pac_host += ':80'
  44. elif ll[:21] == 'x-waybackproxy-date: ':
  45. # API for a personal project of mine
  46. effective_date = line[21:].rstrip('\r\n')
  47. elif ll[:21] == 'authorization: basic ':
  48. # asset date code passed as username:password
  49. auth = base64.b64decode(ll[21:])
  50. original_date = effective_date
  51. # get cached date for redirects, if available
  52. effective_date = date_cache.get(effective_date + '\x00' + archived_url, effective_date)
  53. # get date from username:password, if available
  54. if auth:
  55. effective_date = auth.replace(':', '')
  56. try:
  57. if path in ('/proxy.pac', '/wpad.dat', '/wpad.da'):
  58. # PAC file to bypass QUICK_IMAGES requests
  59. pac = http_version.encode('ascii', 'ignore') + b''' 200 OK\r\n'''
  60. pac += b'''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
  61. pac += b'''\r\n'''
  62. pac += b'''function FindProxyForURL(url, host)\r\n'''
  63. pac += b'''{\r\n'''
  64. pac += b''' if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
  65. pac += b''' {\r\n'''
  66. pac += b''' return "DIRECT";\r\n'''
  67. pac += b''' }\r\n'''
  68. pac += b''' return "PROXY ''' + pac_host.encode('ascii', 'ignore') + b'''";\r\n'''
  69. pac += b'''}\r\n'''
  70. self.request.sendall(pac)
  71. return
  72. elif hostname == 'web.archive.org':
  73. if path[:5] != '/web/':
  74. # launch settings
  75. return self.handle_settings(parsed.query)
  76. else:
  77. # pass-through requests to web.archive.org
  78. # required for QUICK_IMAGES
  79. archived_url = '/'.join(request_url.split('/')[5:])
  80. _print('[>] [QI] {0}'.format(archived_url))
  81. try:
  82. conn = urllib.request.urlopen(request_url)
  83. except urllib.error.HTTPError as e:
  84. if e.code == 404:
  85. # Try this file on another date, might be redundant
  86. return self.redirect_page(http_version, archived_url)
  87. else:
  88. raise e
  89. elif GEOCITIES_FIX and hostname == 'www.geocities.com':
  90. # apply GEOCITIES_FIX and pass it through
  91. _print('[>] {0}'.format(archived_url))
  92. split = archived_url.split('/')
  93. hostname = split[2] = 'www.oocities.org'
  94. request_url = '/'.join(split)
  95. conn = urllib.request.urlopen(request_url)
  96. else:
  97. # get from Wayback
  98. _print('[>] {0}'.format(archived_url))
  99. request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)
  100. conn = urllib.request.urlopen(request_url)
  101. except urllib.error.HTTPError as e:
  102. # an error has been found
  103. if e.code in (403, 404, 412):
  104. # 403, 404 or tolerance exceeded => heuristically determine the static URL for some redirect scripts
  105. match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\.(?:[^/%]+))(?:%2F|/).+)''', archived_url, re.IGNORECASE)
  106. if not match:
  107. match = re.search('''(?:\?|&)(?:[^=]+)=((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\.(?:[^/%]+))?(?:%2F|/)[^&]+)''', archived_url, re.IGNORECASE)
  108. if match:
  109. print(match.groups())
  110. # we found it
  111. new_url = urllib.parse.unquote_plus(match.group(1))
  112. # add protocol if the URL is absolute but missing a protocol
  113. if new_url[0] != '/' and '://' not in new_url:
  114. new_url = 'http://' + new_url
  115. _print('[r]', new_url)
  116. return self.redirect_page(http_version, new_url)
  117. elif e.code in (301, 302):
  118. # 301 or 302 => urllib-generated error about an infinite redirect loop
  119. _print('[!] Infinite redirect loop')
  120. return self.error_page(http_version, 508, 'Infinite Redirect Loop')
  121. if e.code != 412: # tolerance exceeded has its own error message above
  122. _print('[!] {0} {1}'.format(e.code, e.reason))
  123. # If the memento Link header is present, this is a website error
  124. # instead of a Wayback error. Pass it along if that's the case.
  125. if 'Link' in e.headers:
  126. conn = e
  127. else:
  128. return self.error_page(http_version, e.code, e.reason)
  129. # get content type
  130. content_type = conn.info().get('Content-Type')
  131. if content_type == None: content_type = 'text/html'
  132. if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
  133. # set the mode: [0]wayback [1]oocities
  134. mode = 0
  135. if GEOCITIES_FIX and hostname in ['www.oocities.org', 'www.oocities.com']: mode = 1
  136. if 'text/html' in content_type: # HTML
  137. # Some dynamically generated links may end up pointing to
  138. # web.archive.org. Correct that by redirecting the Wayback
  139. # portion of the URL away if it ends up being HTML consumed
  140. # through the QUICK_IMAGES interface.
  141. if hostname == 'web.archive.org':
  142. conn.close()
  143. archived_url = '/'.join(request_url.split('/')[5:])
  144. _print('[r] [QI]', archived_url)
  145. return self.redirect_page(http_version, archived_url, 301)
  146. # check if the date is within tolerance
  147. if DATE_TOLERANCE is not None:
  148. match = re.search('''//web\.archive\.org/web/([0-9]+)''', conn.geturl())
  149. if match:
  150. requested_date = match.group(1)
  151. if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(DATE_TOLERANCE):
  152. _print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
  153. conn.close()
  154. return self.error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
  155. # consume all data
  156. data = conn.read()
  157. # patch the page
  158. if mode == 0: # wayback
  159. if b'<title>Wayback Machine</title>' in data:
  160. match = re.search(b'<iframe id="playback" src="((?:(?:http(?:s)?:)?//web.archive.org)?/web/[^"]+)"', data)
  161. if match:
  162. # media playback iframe
  163. # Some websites (especially ones that use frames)
  164. # inexplicably render inside a media playback iframe.
  165. # In that case, a simple redirect would result in a
  166. # redirect loop. Download the URL and render it instead.
  167. request_url = match.group(1).decode('ascii', 'ignore')
  168. archived_url = '/'.join(request_url.split('/')[5:])
  169. print('[f]', archived_url)
  170. try:
  171. conn = urllib.request.urlopen(request_url)
  172. except urllib.error.HTTPError as e:
  173. _print('[!]', e.code, e.reason)
  174. # If the memento Link header is present, this is a website error
  175. # instead of a Wayback error. Pass it along if that's the case.
  176. if 'Link' in e.headers:
  177. conn = e
  178. else:
  179. return self.error_page(http_version, e.code, e.reason)
  180. content_type = conn.info().get('Content-Type')
  181. if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1: content_type = content_type[:content_type.find(';')]
  182. data = conn.read()
  183. if b'<title></title>' in data and b'<h1><span>Internet Archive\'s Wayback Machine</span></h1>' in data:
  184. match = re.search(b'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/([^/]+)/([^"]+)">Impatient\?</a></p>', data)
  185. if match:
  186. # wayback redirect page, follow it
  187. match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
  188. try:
  189. redirect_code = int(match2.group(1))
  190. except:
  191. redirect_code = 302
  192. archived_url = match.group(2).decode('ascii', 'ignore')
  193. date_cache[effective_date + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')
  194. print('[r]', archived_url)
  195. return self.redirect_page(http_version, archived_url, redirect_code)
  196. # pre-toolbar scripts and CSS
  197. data = re.sub(b'<script src="//archive\.org/(?:.*)<!-- End Wayback Rewrite JS Include -->', b'', data, flags=re.S)
  198. # toolbar
  199. data = re.sub(b'<!-- BEGIN WAYBACK TOOLBAR INSERT -->(?:.*)<!-- END WAYBACK TOOLBAR INSERT -->', b'', data, flags=re.S)
  200. # comments on footer
  201. data = re.sub(b'\n<!--\n FILE ARCHIVED (?:.*)$', b'', data, flags=re.S)
  202. # fix base tag
  203. data = re.sub(b'(<base (?:[^>]*)href=(?:["\'])?)(?:(?:http(?:s)?:)?//web.archive.org)?/web/(?:[^/]+)/', b'\\1', data, flags=re.I + re.S)
  204. # remove extraneous :80 from links
  205. data = re.sub(b'((?:(?:http(?:s)?:)?//web.archive.org)?/web/)([^/]+)/([^:]+)://([^:]+):80/', b'\\1\\2/\\3://\\4/', data)
  206. # fix links
  207. if QUICK_IMAGES:
  208. # QUICK_IMAGES works by intercepting asset URLs (those
  209. # with a date code ending in im_, js_...) and letting the
  210. # proxy pass them through. This may reduce load time
  211. # because Wayback doesn't have to hunt down the closest
  212. # copy of that asset to DATE, as those URLs have specific
  213. # date codes. This taints the HTML with web.archive.org
  214. # URLs. QUICK_IMAGES=2 uses the original URLs with an added
  215. # username:password, which taints less but is not supported
  216. # by all browsers - IE6 notably kills the whole page if it
  217. # sees an iframe pointing to an invalid URL.
  218. data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:]+)://',
  219. QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
  220. data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)
  221. else:
  222. #data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
  223. def add_to_date_cache(match):
  224. orig_url = match.group(2)
  225. date_cache[effective_date + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore')
  226. return orig_url
  227. data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/([^"\'#<>]+)', add_to_date_cache, data)
  228. elif mode == 1: # oocities
  229. # viewport/cache-control/max-width code (header)
  230. data = re.sub(b'^(?:.*?)\n\n', b'', data, flags=re.S)
  231. # archive notice and tracking code (footer)
  232. data = re.sub(b'<style> \n.zoomout { -webkit-transition: (?:.*)$', b'', data, flags=re.S)
  233. # clearly labeled snippets from Geocities
  234. data = re.sub(b'^(?:.*)<\!-- text above generated by server\. PLEASE REMOVE -->', b'', data, flags=re.S)
  235. data = re.sub(b'<\!-- following code added by server\. PLEASE REMOVE -->(?:.*)<\!-- preceding code added by server\. PLEASE REMOVE -->', b'', data, flags=re.S)
  236. data = re.sub(b'<\!-- text below generated by server\. PLEASE REMOVE -->(?:.*)$', b'', data, flags=re.S)
  237. # fix links
  238. data = re.sub(b'//([^.]*)\.oocities\.com/', b'//\\1.geocities.com/', data, flags=re.S)
  239. self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
  240. self.request.sendall(data)
  241. else: # other data
  242. self.request.sendall('{0} 200 OK\r\nContent-Type: {1}\r\nETag: "{2}"\r\n\r\n'.format(http_version, content_type, request_url.replace('"', '')).encode('ascii', 'ignore'))
  243. while True:
  244. data = conn.read(1024)
  245. if not data: break
  246. self.request.sendall(data)
  247. self.request.close()
  248. def error_page(self, http_version, code, reason):
  249. """Generate an error page."""
  250. # make error page
  251. errorpage = '<html><head><title>{0} {1}</title><script language="javascript">if (window.self != window.top && !(window.frameElement && window.frameElement.tagName == "FRAME")) {{ document.location.href = "about:blank"; }}</script></head><body><h1>{1}</h1><p>'.format(code, reason)
  252. # add code information
  253. if code in (404, 508): # page not archived or redirect loop
  254. errorpage += 'This page may not be archived by the Wayback Machine.'
  255. elif code == 403: # not crawled due to robots.txt
  256. errorpage += 'This page was not archived due to a robots.txt block.'
  257. elif code == 501: # method not implemented
  258. errorpage += 'WaybackProxy only implements the GET method.'
  259. elif code == 412: # outside of tolerance
  260. errorpage += 'The earliest snapshot for this page is outside of the configured tolerance interval.'
  261. else: # another error
  262. errorpage += 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'
  263. errorpage += '</p><hr><i>'
  264. errorpage += self.signature()
  265. errorpage += '</i></body></html>'
  266. # add padding for IE
  267. if len(errorpage) <= 512:
  268. padding = '\n<!-- This comment pads the HTML so Internet Explorer displays this error page instead of its own. '
  269. remainder = 510 - len(errorpage) - len(padding)
  270. if remainder > 0:
  271. padding += ' ' * remainder
  272. padding += '-->'
  273. errorpage += padding
  274. # send error page and stop
  275. self.request.sendall('{0} {1} {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, reason, len(errorpage), errorpage).encode('utf8', 'ignore'))
  276. self.request.close()
  277. def redirect_page(self, http_version, target, code=302):
  278. """Generate a redirect page."""
  279. # make redirect page
  280. redirectpage = '<html><head><title>Redirect</title><meta http-equiv="refresh" content="0;url='
  281. redirectpage += target
  282. redirectpage += '"></head><body><p>If you are not redirected, <a href="'
  283. redirectpage += target
  284. redirectpage += '">click here</a>.</p></body></html>'
  285. # send redirect page and stop
  286. self.request.sendall('{0} {1} Found\r\nLocation: {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, target, len(redirectpage), redirectpage).encode('utf8', 'ignore'))
  287. self.request.close()
  288. def handle_settings(self, query):
  289. """Generate the settings page."""
  290. global DATE, GEOCITIES_FIX, QUICK_IMAGES, CONTENT_TYPE_ENCODING
  291. if query != '': # handle any parameters that may have been sent
  292. parsed = urllib.parse.parse_qs(query)
  293. if 'date' in parsed and DATE != parsed['date'][0]:
  294. DATE = parsed['date'][0]
  295. date_cache.clear()
  296. GEOCITIES_FIX = 'gcFix' in parsed
  297. QUICK_IMAGES = 'quickImages' in parsed
  298. CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed
  299. # send the page and stop
  300. settingspage = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
  301. settingspage += '<html><head><title>WaybackProxy Settings</title></head><body><p><b>'
  302. settingspage += self.signature()
  303. settingspage += '</b></p><form method="get" action="/"><p>Date to get pages from: <input type="text" name="date" size="8" value="'
  304. settingspage += DATE
  305. settingspage += '"><br><input type="checkbox" name="gcFix"'
  306. if GEOCITIES_FIX: settingspage += ' checked'
  307. settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
  308. if QUICK_IMAGES: settingspage += ' checked'
  309. settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
  310. if CONTENT_TYPE_ENCODING: settingspage += ' checked'
  311. settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
  312. self.request.send(settingspage.encode('utf8', 'ignore'))
  313. self.request.close()
  314. def signature(self):
  315. """Return the server signature."""
  316. return 'WaybackProxy on {0}'.format(socket.gethostname())
  317. def wayback_to_datetime(self, date):
  318. """Convert a Wayback format date string to a datetime.datetime object."""
  319. # parse the string
  320. year = 1995
  321. month = 12
  322. day = 31
  323. hour = 0
  324. minute = 0
  325. second = 0
  326. if len(date) > 0:
  327. year = int(date[:4])
  328. if len(date) > 4:
  329. month = int(date[4:6])
  330. if len(date) > 6:
  331. day = int(date[6:8])
  332. if len(date) > 8:
  333. hour = int(date[8:10])
  334. if len(date) > 10:
  335. minute = int(date[10:12])
  336. if len(date) > 12:
  337. second = int(date[12:14])
  338. # sanitize the numbers
  339. if month < 1:
  340. month = 1
  341. elif month > 12:
  342. month = 12
  343. if day < 1:
  344. day = 1
  345. elif day > 31:
  346. day = 31
  347. if hour > 23:
  348. hour = 23
  349. elif hour < 0:
  350. hour = 0
  351. if minute > 59:
  352. minute = 59
  353. elif minute < 0:
  354. minute = 0
  355. if second > 59:
  356. second = 59
  357. elif second < 0:
  358. second = 0
  359. # if the day is invalid for that month, work its way down
  360. try:
  361. dt = datetime.datetime(year, month, day, hour, minute, second) # max 31
  362. except:
  363. try:
  364. dt = datetime.datetime(year, month, day - 1, hour, minute, second) # max 30
  365. except:
  366. try:
  367. dt = datetime.datetime(year, month, day - 2, hour, minute, second) # max 29
  368. except:
  369. dt = datetime.datetime(year, month, day - 3, hour, minute, second) # max 28
  370. return dt
  371. print_lock = threading.Lock()
  372. def _print(*args, linebreak=True):
  373. """Logging function."""
  374. if SILENT: return
  375. s = ' '.join([str(x) for x in args])
  376. print_lock.acquire()
  377. sys.stdout.write(linebreak and (s + '\n') or s)
  378. sys.stdout.flush()
  379. print_lock.release()
  380. def main():
  381. """Starts the server."""
  382. server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
  383. _print('[-] Now listening on port {0}'.format(LISTEN_PORT))
  384. try:
  385. server.serve_forever()
  386. except KeyboardInterrupt: # Ctrl+C to stop
  387. pass
  388. if __name__ == '__main__':
  389. main()