#!/usr/bin/env python3
import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, traceback, urllib.request, urllib.error, urllib.parse
from config import *


class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCPServer with ThreadingMixIn added."""
    pass


class SharedState:
    """Class for storing shared state across instances of Handler."""

    def __init__(self):
        # Create internal LRU dictionary for preserving URLs on redirect.
        self.date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)
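        # Keys are '<date>\x00<archived URL>' strings and values are Wayback
        # date codes, so a URL that redirected to a different snapshot keeps
        # using that snapshot's date on later requests.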
        # Create internal LRU dictionary for date availability.
        self.availability_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024) if WAYBACK_API else None


shared_state = SharedState()


class Handler(socketserver.BaseRequestHandler):
    """Main request handler."""

    def setup(self, *args, **kwargs):
        """Set up this instance of Handler."""
        super().setup(*args, **kwargs)

        # Store a local pointer to SharedState.
        self.shared_state = shared_state

    def handle(self):
        """Handle a request."""
        # readline is pretty convenient
        f = self.request.makefile()

        # read request line
        reqline = line = f.readline()
        split = line.rstrip().split(' ')
        http_version = split[2] if len(split) > 2 else 'HTTP/0.9'
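        # e.g. 'GET http://example.com/ HTTP/1.0' (hypothetical URL) splits
        # into ['GET', 'http://example.com/', 'HTTP/1.0']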
        if len(split) < 2 or split[0] != 'GET':
            # only GET is implemented
            return self.send_error_page(http_version, 501, 'Not Implemented')

        # read out the headers
        request_host = None
        pac_host = '" + location.host + ":' + str(LISTEN_PORT)  # may not actually work
        effective_date = DATE
        auth = None
        while line.strip() != '':
            line = f.readline()
            ll = line.lower()
            if ll[:6] == 'host: ':
                pac_host = request_host = line[6:].rstrip()
                if ':' not in pac_host:  # explicitly specify port if running on port 80
                    pac_host += ':80'
            elif ll[:21] == 'x-waybackproxy-date: ':
                # API for a personal project of mine
                effective_date = line[21:].rstrip()
            elif ll[:21] == 'authorization: basic ':
                # asset date code passed as username:password
                auth = base64.b64decode(ll[21:]).decode('ascii', 'ignore')  # decode to str for the replace() below

        # parse the URL
        pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
        if split[1][0] == '/' and split[1] not in pac_file_paths:
            # just a path (not corresponding to a PAC file) => transparent proxy
            # Host header and therefore HTTP/1.1 are required
            if not request_host:
                return self.send_error_page(http_version, 400, 'Host header missing')
            archived_url = 'http://' + request_host + split[1]
        else:
            # full URL => explicit proxy
            archived_url = split[1]
        request_url = archived_url
        parsed = urllib.parse.urlparse(request_url)

        # make a path
        path = parsed.path
        if parsed.query:
            path += '?' + parsed.query
        elif path == '':
            path = '/'

        # get the hostname for later
        host = parsed.netloc.split(':')
        hostname = host[0]

        # get cached date for redirects, if available
        original_date = effective_date
        effective_date = self.shared_state.date_cache.get(str(effective_date) + '\x00' + str(archived_url), effective_date)
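        # The '\x00' separator cannot occur in either the date or the URL,
        # which keeps the concatenated cache key unambiguous.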

        # get date from username:password, if available
        if auth:
            effective_date = auth.replace(':', '')
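            # QUICK_IMAGES=2 encodes the asset date in the URL's credentials,
            # e.g. http://20011025120000:im_@example.com/a.gif (hypothetical
            # URL); the browser sends those as Basic auth, which reassembles
            # here into the date code '20011025120000im_'.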

        # effectively handle the request
        try:
            if path in pac_file_paths:
                # PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled
                pac = http_version + ''' 200 OK\r\n'''
                pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
                pac += '''\r\n'''
                pac += '''function FindProxyForURL(url, host)\r\n'''
                pac += '''{\r\n'''
                if self.shared_state.availability_cache is None:
                    pac += ''' if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
                    pac += ''' {\r\n'''
                    pac += '''  return "DIRECT";\r\n'''
                    pac += ''' }\r\n'''
                pac += ''' return "PROXY ''' + pac_host + '''";\r\n'''
                pac += '''}\r\n'''
                self.request.sendall(pac.encode('ascii', 'ignore'))
                return
            elif hostname == 'web.archive.org':
                if path[:5] != '/web/':
                    # Launch settings if enabled.
                    if SETTINGS_PAGE:
                        return self.handle_settings(parsed.query)
                    else:
                        return self.send_error_page(http_version, 404, 'Not Found')
                else:
                    # Pass requests through to web.archive.org. Required for QUICK_IMAGES.
                    split = request_url.split('/')
                    effective_date = split[4]
                    archived_url = '/'.join(split[5:])
                    _print('[>] [QI]', archived_url)
            elif GEOCITIES_FIX and hostname == 'www.geocities.com':
                # apply GEOCITIES_FIX and pass it through
                _print('[>]', archived_url)
                split = archived_url.split('/')
                hostname = split[2] = 'www.oocities.org'
                request_url = '/'.join(split)
            else:
                # get from Wayback
                _print('[>]', archived_url)
                request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)

            if self.shared_state.availability_cache is not None:
                # are we requesting from Wayback?
                split = request_url.split('/')
                # if so, get the closest available date from Wayback's API, to avoid archived 404 pages and other site errors
                if split[2] == 'web.archive.org':
                    # remove extraneous :80 from URL
                    if ':' in split[5]:
                        if split[7][-3:] == ':80':
                            split[7] = split[7][:-3]
                    elif split[5][-3:] == ':80':
                        split[5] = split[5][:-3]

                    # check availability LRU cache
                    availability_url = '/'.join(split[5:])
                    new_url = self.shared_state.availability_cache.get(availability_url, None)
                    if new_url:
                        # in cache => replace URL immediately
                        request_url = new_url
                    else:
                        # not in cache => contact API
                        try:
                            availability = json.loads(urllib.request.urlopen('https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14], timeout=10).read())
                            closest = availability.get('archived_snapshots', {}).get('closest', {})
                            new_date = closest.get('timestamp', None)
                        except:
                            _print('[!] Failed to fetch Wayback availability data')
                            new_date = None
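
                        # The API responds with JSON along the lines of
                        # {"archived_snapshots": {"closest": {"url": "...",
                        # "timestamp": "20011025120000", ...}}}, with an empty
                        # "archived_snapshots" object when nothing is archived.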
                        if new_date and new_date != effective_date[:14]:
                            # returned date is different
                            new_url = closest['url']

                            # add asset tag if one is present in the original URL
                            if len(effective_date) > 14:
                                split = new_url.split('/')
                                split[4] += effective_date[14:]
                                new_url = '/'.join(split)
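                                # e.g. the 'im_' tag of '20011025120000im_' is
                                # carried over onto the API-provided date.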

                            # replace URL and add it to the availability cache
                            request_url = self.shared_state.availability_cache[availability_url] = new_url

            conn = urllib.request.urlopen(request_url)
        except urllib.error.HTTPError as e:
            # An HTTP error has been found.
            if e.code in (403, 404, 412):  # not found or tolerance exceeded
                # Heuristically determine the static URL for some redirect scripts.
                match = re.search('''[^/]/((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:%2F|/).+)''', archived_url, re.I)  # URL in path
                if not match:
                    match = re.search('''[\\?&][^=]+=((?:http(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', archived_url, re.I)  # URL in query string
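                # e.g. /cgi-bin/redirect?url=http%3A%2F%2Fexample.com%2F
                # (hypothetical URL) yields http://example.com/ after the
                # unquoting below.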
                if match:  # found URL
                    # Decode the URL.
                    new_url = urllib.parse.unquote_plus(match.group(1))

                    # Add protocol if the URL is absolute but missing a protocol.
                    if new_url[0] != '/' and '://' not in new_url:
                        new_url = 'http://' + new_url

                    # Redirect client to the URL.
                    _print('[r] [g]', new_url)
                    return self.send_redirect_page(http_version, new_url)
            elif e.code in (301, 302):  # urllib-generated error about an infinite redirect loop
                _print('[!] Infinite redirect loop')
                return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')

            if e.code != 412:  # tolerance exceeded has its own error message above
                _print('[!]', e.code, e.reason)

            # If the memento Link header is present, this is a website error
            # instead of a Wayback error. Pass it along if that's the case.
            if 'Link' in e.headers:
                conn = e
            else:
                return self.send_error_page(http_version, e.code, e.reason)
        except socket.timeout:
            _print('[!] Fetch timeout')
            return self.send_error_page(http_version, 504, 'Gateway Timeout')
        except:
            _print('[!] Fetch exception:')
            traceback.print_exc()
            return self.send_error_page(http_version, 502, 'Bad Gateway')

        # get content type
        content_type = conn.info().get('Content-Type')
        if content_type is None:
            content_type = 'text/html'
        elif not CONTENT_TYPE_ENCODING:
            idx = content_type.find(';')
            if idx > -1:
                content_type = content_type[:idx]

        # set the mode: [0]wayback [1]oocities
        if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
            mode = 1
        else:
            mode = 0

        # Check content type to determine if this is HTML we need to patch.
        # Wayback will add its HTML to anything it thinks is HTML.
        guessed_content_type = conn.info().get('X-Archive-Guessed-Content-Type')
        if not guessed_content_type:
            guessed_content_type = content_type

        if 'text/html' in guessed_content_type:
            # Some dynamically generated links may end up pointing to
            # web.archive.org. Correct that by redirecting the Wayback
            # portion of the URL away if it ends up being HTML consumed
            # through the QUICK_IMAGES interface.
            if hostname == 'web.archive.org':
                conn.close()
                archived_url = '/'.join(request_url.split('/')[5:])
                _print('[r] [QI]', archived_url)
                return self.send_redirect_page(http_version, archived_url, 301)

            # Check if the date is within tolerance.
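            # For example, with DATE = '20011025' and DATE_TOLERANCE = 30, a
            # snapshot dated 20011201 or later would be rejected with a 412.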
            if DATE_TOLERANCE is not None:
                match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl())
                if match:
                    requested_date = match.group(1)
                    if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
                        _print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
                        conn.close()
                        return self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')

            # Consume all data.
            data = conn.read()

            # Patch the page.
            if mode == 0:  # wayback
                if b'<title>Wayback Machine</title>' in data:
                    if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data:  # exclusion error (robots.txt?)
                        return self.send_error_page(http_version, 403, 'URL excluded')

                    match = re.search(b'''<iframe id="playback" src="((?:(?:https?:)?//web.archive.org)?/web/[^"]+)"''', data)
                    if match:  # media playback iframe
                        # Some websites (especially ones that use frames)
                        # inexplicably render inside a media playback iframe.
                        # In that case, a simple redirect would result in a
                        # redirect loop. Download the URL and render it instead.
                        request_url = match.group(1).decode('ascii', 'ignore')
                        archived_url = '/'.join(request_url.split('/')[5:])
                        _print('[f]', archived_url)
                        try:
                            conn = urllib.request.urlopen(request_url)
                        except urllib.error.HTTPError as e:
                            _print('[!]', e.code, e.reason)
                            # If the memento Link header is present, this is a website error
                            # instead of a Wayback error. Pass it along if that's the case.
                            if 'Link' in e.headers:
                                conn = e
                            else:
                                return self.send_error_page(http_version, e.code, e.reason)

                        # Identify content type so we don't modify non-HTML content.
                        content_type = conn.info().get('Content-Type')
                        if not CONTENT_TYPE_ENCODING:
                            idx = content_type.find(';')
                            if idx > -1:
                                content_type = content_type[:idx]
                        if 'text/html' in content_type:
                            # Consume all data and proceed with patching the page.
                            data = conn.read()
                        else:
                            # Pass non-HTML data through.
                            self.send_response_headers(conn, http_version, content_type, request_url)
                            while True:
                                data = conn.read(1024)
                                if not data:
                                    break
                                self.request.sendall(data)
                            self.request.close()
                            return

                if b'<title></title>' in data and b'<span class="label style-scope media-button"><!---->Wayback Machine<!----></span>' in data:
                    match = re.search(b'''<p class="impatient"><a href="(?:(?:https?:)?//web\\.archive\\.org)?/web/([^/]+)/([^"]+)">Impatient\\?</a></p>''', data)
                    if match:
                        # This is a Wayback redirect page, follow the redirect.
                        match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
                        try:
                            redirect_code = int(match2.group(1))
                        except:
                            redirect_code = 302
                        archived_url = match.group(2).decode('ascii', 'ignore')
                        self.shared_state.date_cache[str(effective_date) + '\x00' + str(archived_url)] = match.group(1).decode('ascii', 'ignore')
                        _print('[r]', archived_url)
                        return self.send_redirect_page(http_version, archived_url, redirect_code)

                # Remove pre-toolbar scripts and CSS.
                data = re.sub(b'''<script src="//archive\\.org/.*<!-- End Wayback Rewrite JS Include -->\\r?\\n''', b'', data, flags=re.S)

                # Remove toolbar.
                data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, flags=re.S)

                # Remove comments on footer.
                data = re.sub(b'''<!--\\r?\\n FILE ARCHIVED .*$''', b'', data, flags=re.S)

                # Fix base tag.
                data = re.sub(b'''(<base (?:[^>]*)href=(?:["\'])?)(?:(?:https?:)?//web.archive.org)?/web/(?:[^/]+)/''', b'\\1', data, flags=re.I + re.S)

                # Remove extraneous :80 from links.
                data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^:]+)://([^:]+):80/', b'\\1\\2/\\3://\\4/', data)

                # Fix links.
                if QUICK_IMAGES:
                    # QUICK_IMAGES works by intercepting asset URLs (those
                    # with a date code ending in im_, js_...) and letting the
                    # proxy pass them through. This may reduce load time
                    # because Wayback doesn't have to hunt down the closest
                    # copy of that asset to DATE, as those URLs have specific
                    # date codes. This taints the HTML with web.archive.org
                    # URLs. QUICK_IMAGES=2 uses the original URLs with an added
                    # username:password, which taints less but is not supported
                    # by all browsers - IE notably kills the whole page if it
                    # sees an iframe pointing to an invalid URL.
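                    # e.g. /web/20011025120000im_/http://example.com/a.gif
                    # (hypothetical URL) becomes
                    # http://web.archive.org/web/20011025120000im_/http://example.com/a.gif
                    # with QUICK_IMAGES=1, or
                    # http://20011025120000:im_@example.com/a.gif with QUICK_IMAGES=2.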
                    data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)/([^:]+)://',
                                  b'\\3://\\1:\\2@' if QUICK_IMAGES == 2 else b'http://web.archive.org/web/\\1\\2/\\3://', data)
                    data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)  # non-asset
                else:
                    # Remove asset URLs while simultaneously adding them to the
                    # LRU cache with their respective date.
                    def add_to_date_cache(match):
                        orig_url = match.group(2)
                        self.shared_state.date_cache[str(effective_date) + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore')
                        return orig_url
                    data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
            elif mode == 1:  # oocities
                # Remove viewport/cache-control/max-width code from the header.
                data = re.sub(b'''^.*?\n\n''', b'', data, flags=re.S)

                # Remove archive notice and tracking code from the footer.
                data = re.sub(b'''<style> \n.zoomout { -webkit-transition: .*$''', b'', data, flags=re.S)

                # Remove clearly labeled snippets from Geocities.
                data = re.sub(b'''^.*<!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<!-- following code added by server\\. PLEASE REMOVE -->.*<!-- preceding code added by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)

                # Fix links.
                data = re.sub(b'''//([^.]*)\\.oocities\\.com/''', b'//\\1.geocities.com/', data, flags=re.S)

            # Send patched page.
            self.send_response_headers(conn, http_version, content_type, request_url)
            self.request.sendall(data)
        else:
            # Pass non-HTML data through.
            self.send_response_headers(conn, http_version, content_type, request_url)
            while True:
                data = conn.read(1024)
                if not data:
                    break
                self.request.sendall(data)
        self.request.close()

    def send_response_headers(self, conn, http_version, content_type, request_url):
        """Generate and send the response headers."""
        response = http_version

        # Pass the error code if there is one.
        if isinstance(conn, urllib.error.HTTPError):
            response += ' {0} {1}'.format(conn.code, conn.reason.replace('\n', ' '))
        else:
            response += ' 200 OK'

        # Add content type, and the ETag for caching.
        response += '\r\nContent-Type: ' + content_type + '\r\nETag: "' + request_url.replace('"', '') + '"\r\n'

        # Add X-Archive-Orig-* headers.
        headers = conn.info()
        for header in headers:
            if header.find('X-Archive-Orig-') == 0:
                orig_header = header[15:]
                # Blacklist certain headers which may affect client behavior.
                if orig_header.lower() not in ('connection', 'location', 'content-type', 'content-length', 'etag', 'authorization', 'set-cookie'):
                    response += orig_header + ': ' + headers[header] + '\r\n'
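
        # Wayback reflects the archived site's original response headers with
        # an X-Archive-Orig- prefix, so e.g. X-Archive-Orig-Server goes back
        # to the client as plain Server.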

        # Finish and send the response.
        response += '\r\n'
        self.request.sendall(response.encode('ascii', 'ignore'))

    def send_error_page(self, http_version, code, reason):
        """Generate an error page."""
        # make error page
        errorpage = '<html><head><title>{0} {1}</title>'.format(code, reason)

        # IE's same-origin policy throws "Access is denied." inside frames
        # loaded from a different origin. Use that to our advantage, even
        # though regular frames are also affected. IE also doesn't recognize
        # language="javascript1.4", so use 1.3 while blocking IE4 by detecting
        # the lack of screenLeft as IE4 is quite noisy with script errors.
        errorpage += '<script language="javascript1.3">if (window.screenLeft != null) { eval(\'try { var frameElement = window.frameElement; } catch (e) { document.location.href = "about:blank"; }\'); }</script>'
        errorpage += '<script language="javascript">if (window.self != window.top && !(window.frameElement && window.frameElement.tagName == "FRAME")) { document.location.href = "about:blank"; }</script>'
        errorpage += '</head><body><h1>{0}</h1><p>'.format(reason)

        # add code information
        if code in (404, 508):  # page not archived or redirect loop
            errorpage += 'This page may not be archived by the Wayback Machine.'
        elif code == 403:  # not crawled due to exclusion
            errorpage += 'This page was not archived due to a Wayback Machine exclusion.'
        elif code == 501:  # method not implemented
            errorpage += 'WaybackProxy only implements the GET method.'
        elif code == 502:  # exception
            errorpage += 'This page could not be fetched due to an unknown error.'
        elif code == 504:  # timeout
            errorpage += 'This page could not be fetched due to a Wayback Machine server timeout.'
        elif code == 412:  # outside of tolerance
            errorpage += 'The earliest snapshot for this page is outside of the configured tolerance interval.'
        elif code == 400 and reason == 'Host header missing':  # no host header in transparent mode
            errorpage += 'WaybackProxy\'s transparent mode requires an HTTP/1.1 compliant client.'
        else:  # another error
            errorpage += 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'
        errorpage += '</p><hr><i>'
        errorpage += self.signature()
        errorpage += '</i></body></html>'

        # Pad the page past 512 bytes, or IE replaces it with its own
        # "friendly" error page.
        if len(errorpage) <= 512:
            padding = '\n<!-- This comment pads the HTML so Internet Explorer displays this error page instead of its own. '
            remainder = 510 - len(errorpage) - len(padding)
            if remainder > 0:
                padding += ' ' * remainder
            padding += '-->'
            errorpage += padding

        # send error page and stop
        self.request.sendall('{0} {1} {2}\r\nContent-Type: text/html\r\nContent-Length: {3}\r\n\r\n{4}'.format(http_version, code, reason, len(errorpage), errorpage).encode('utf8', 'ignore'))
        self.request.close()

    def send_redirect_page(self, http_version, target, code=302):
        """Generate a redirect page."""
        # make redirect page
        redirectpage = '<html><head><title>Redirect</title><meta http-equiv="refresh" content="0;url='
        redirectpage += target
        redirectpage += '"></head><body><p>If you are not redirected, <a href="'
        redirectpage += target
        redirectpage += '">click here</a>.</p></body></html>'

        # send redirect page and stop
        reason = 'Moved Permanently' if code == 301 else 'Found'
        self.request.sendall('{0} {1} {2}\r\nLocation: {3}\r\nContent-Type: text/html\r\nContent-Length: {4}\r\n\r\n{5}'.format(http_version, code, reason, target, len(redirectpage), redirectpage).encode('utf8', 'ignore'))
        self.request.close()

    def handle_settings(self, query):
        """Generate the settings page."""
        global DATE, DATE_TOLERANCE, GEOCITIES_FIX, QUICK_IMAGES, WAYBACK_API, CONTENT_TYPE_ENCODING, SILENT, SETTINGS_PAGE

        if query != '':  # handle any parameters that may have been sent
            parsed = urllib.parse.parse_qs(query)
            if 'date' in parsed and 'dateTolerance' in parsed:
                if DATE != parsed['date'][0]:
                    DATE = parsed['date'][0]
                    self.shared_state.date_cache.clear()
                    if self.shared_state.availability_cache is not None:
                        self.shared_state.availability_cache.clear()
                if DATE_TOLERANCE != parsed['dateTolerance'][0]:
                    DATE_TOLERANCE = parsed['dateTolerance'][0]
                GEOCITIES_FIX = 'gcFix' in parsed
                QUICK_IMAGES = 'quickImages' in parsed
                CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed

        # send the page and stop
        settingspage = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
        settingspage += '<html><head><title>WaybackProxy Settings</title></head><body><p><b>'
        settingspage += self.signature()
        settingspage += '</b></p><form method="get" action="/">'
        settingspage += '<p>Date to get pages from: <input type="text" name="date" size="8" value="'
        settingspage += str(DATE)
        settingspage += '"><p>Date tolerance: <input type="text" name="dateTolerance" size="8" value="'
        settingspage += str(DATE_TOLERANCE)
        settingspage += '"> days<br><input type="checkbox" name="gcFix"'
        if GEOCITIES_FIX:
            settingspage += ' checked'
        settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
        if QUICK_IMAGES:
            settingspage += ' checked'
        settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
        if CONTENT_TYPE_ENCODING:
            settingspage += ' checked'
        settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
        self.request.sendall(settingspage.encode('utf8', 'ignore'))
        self.request.close()

    def signature(self):
        """Return the server signature."""
        return 'WaybackProxy on {0}'.format(socket.gethostname())

    def wayback_to_datetime(self, date):
        """Convert a Wayback format date string to a datetime.datetime object."""
        try:
            return datetime.datetime.strptime(str(date)[:14], '%Y%m%d%H%M%S')
        except:
            return datetime.datetime.strptime(str(date)[:8], '%Y%m%d')


print_lock = threading.Lock()

def _print(*args, **kwargs):
    """Logging function."""
    if SILENT:
        return
    with print_lock:
        print(*args, **kwargs, flush=True)

def main():
    """Starts the server."""
    server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
    _print('[-] Now listening on port', LISTEN_PORT)
    try:
        server.serve_forever()
    except KeyboardInterrupt:  # Ctrl+C to stop
        pass

if __name__ == '__main__':
    main()