#!/usr/bin/env python3
import base64, collections, datetime, json, re, socket, socketserver, string, sys, threading, time, traceback, urllib.parse

try:
    import urllib3
except ImportError:
    print('WaybackProxy now requires urllib3 to be installed. Follow setup step 3 on the readme to fix this.')
    sys.exit(1)

from config_handler import *


class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCPServer with ThreadingMixIn added."""
    pass

# http://code.activestate.com/recipes/580644-lru-dictionary/
class LRUDict(collections.OrderedDict):
    '''A dict that can discard least-recently-used items, either by maximum capacity
    or by time to live.
    An item's ttl is refreshed (aka the item is considered "used") by direct access
    via [] or get() only, not via iterating over the whole collection with items()
    for example.
    Expired entries only get purged after insertions or changes. Either call purge()
    manually or check an item's ttl with ttl() if that's unacceptable.
    '''

    def __init__(self, *args, maxduration=None, maxsize=128, **kwargs):
        '''Same arguments as OrderedDict with these 2 additions:
        maxduration: number of seconds entries are kept. 0 or None means no time limit.
        maxsize: maximum number of entries being kept.'''
        super().__init__(*args, **kwargs)
        self.maxduration = maxduration
        self.maxsize = maxsize
        self.purge()

    def purge(self):
        '''Removes expired or overflowing entries.'''
        if self.maxsize:
            # pop until maximum capacity is reached
            overflowing = max(0, len(self) - self.maxsize)
            for _ in range(overflowing):
                self.popitem(last=False)
        if self.maxduration:
            # expiration limit
            limit = time.time() - self.maxduration
            # as long as there are still items in the dictionary
            while self:
                # look at the oldest (front)
                _, lru = next(iter(super().values()))
                # if it is within the timelimit, we're fine
                if lru > limit:
                    break
                # otherwise continue to pop the front
                self.popitem(last=False)

    def __getitem__(self, key):
        # retrieve item
        value = super().__getitem__(key)[0]
        # update lru time
        super().__setitem__(key, (value, time.time()))
        self.move_to_end(key)
        return value

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def ttl(self, key):
        '''Returns the number of seconds this item will live.
        The item might still be deleted if maxsize is reached.
        The time to live can be negative, as for expired items
        that have not been purged yet.'''
        if self.maxduration:
            lru = super().__getitem__(key)[1]
            return self.maxduration - (time.time() - lru)

    def __setitem__(self, key, value):
        super().__setitem__(key, (value, time.time()))
        self.purge()

    def items(self):
        # remove ttl from values
        return ((k, v) for k, (v, _) in super().items())

    def values(self):
        # remove ttl from values
        return (v for v, _ in super().values())
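
# Illustrative note (not from the original recipe): the caches below rely on the
# eviction behaviour above. For example, with LRUDict(maxduration=60, maxsize=2),
# inserting a third key evicts the least recently used one, and purge() drops any
# entry older than 60 seconds.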

class SharedState:
    """Class for storing shared state across instances of Handler."""

    def __init__(self):
        # Create urllib3 connection pool.
        self.http = urllib3.PoolManager(maxsize=4, block=True)
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        # Create internal LRU dictionary for preserving URLs on redirect.
        self.date_cache = LRUDict(maxduration=86400, maxsize=1024)

        # Create internal LRU dictionary for date availability.
        self.availability_cache = LRUDict(maxduration=86400, maxsize=1024)

        # Read domain whitelist file.
        try:
            with open('whitelist.txt', 'r') as f:
                self.whitelist = f.read().splitlines()
        except:
            self.whitelist = []
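
# Single module-level instance, so every Handler thread shares the same
# connection pool, caches and whitelist.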
shared_state = SharedState()

class Handler(socketserver.BaseRequestHandler):
    """Main request handler."""

    def setup(self, *args, **kwargs):
        """Set up this instance of Handler."""
        super().setup(*args, **kwargs)

        # Store a local pointer to SharedState.
        self.shared_state = shared_state

    def handle(self):
        """Handle a request."""
        # readline is pretty convenient
        f = self.request.makefile()

        # read request line
        reqline = line = f.readline()
        split = line.rstrip().split(' ')
        http_version = len(split) > 2 and split[2].upper() or 'HTTP/0.9'
        if len(split) < 2 or split[0].upper() != 'GET':
            # only GET is implemented
            return self.send_error_page(http_version, 501, 'Not Implemented', extra=split[0])

        # read out the headers
        request_host = None
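        # The default pac_host splices a JavaScript expression into the PAC file
        # generated below, so the browser fills in its own view of this host:
        #   return "PROXY " + location.host + ":<LISTEN_PORT>";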
        pac_host = '" + location.host + ":' + str(LISTEN_PORT) # may not actually work
        effective_date = DATE
        auth = None
        while line.strip() != '':
            line = f.readline()
            ll = line.lower()
            if ll[:6] == 'host: ':
                pac_host = request_host = line[6:].rstrip()
                if ':' not in pac_host: # explicitly specify port if running on port 80
                    pac_host += ':80'
            elif ll[:21] == 'x-waybackproxy-date: ':
                # API for a personal project of mine
                effective_date = line[21:].rstrip()
            elif ll[:21] == 'authorization: basic ':
                # asset date code passed as username:password
                # (decoded to str so the replace() and URL formatting below work)
                auth = base64.b64decode(ll[21:]).decode('ascii', 'ignore')

        # parse the URL
        pac_file_paths = ('/proxy.pac', '/wpad.dat', '/wpad.da')
        if split[1][0] == '/' and split[1] not in pac_file_paths:
            # just a path (not corresponding to a PAC file) => transparent proxy
            # Host header and therefore HTTP/1.1 are required
            if not request_host:
                return self.send_error_page(http_version, 400, 'Host header missing')
            archived_url = 'http://' + request_host + split[1]
        else:
            # full URL => explicit proxy
            archived_url = split[1]
        request_url = archived_url
        parsed = urllib.parse.urlparse(request_url)

        # make a path
        path = parsed.path
        if parsed.query:
            path += '?' + parsed.query
        elif path == '':
            path = '/'

        # get the hostname for later
        host = parsed.netloc.split(':')
        hostname = host[0]

        # get cached date for redirects, if available
        original_date = effective_date
        effective_date = self.shared_state.date_cache.get(str(effective_date) + '\x00' + str(archived_url), effective_date)

        # get date from username:password, if available
        if auth:
            effective_date = auth.replace(':', '')

        # Effectively handle the request.
        try:
            if path in pac_file_paths:
                # PAC file to bypass QUICK_IMAGES requests if WAYBACK_API is not enabled.
                pac = http_version + ''' 200 OK\r\n'''
                pac += '''Content-Type: application/x-ns-proxy-autoconfig\r\n'''
                pac += '''\r\n'''
                pac += '''function FindProxyForURL(url, host)\r\n'''
                pac += '''{\r\n'''
                if not WAYBACK_API:
                    pac += '''    if (shExpMatch(url, "http://web.archive.org/web/*") && !shExpMatch(url, "http://web.archive.org/web/??????????????if_/*"))\r\n'''
                    pac += '''    {\r\n'''
                    pac += '''        return "DIRECT";\r\n'''
                    pac += '''    }\r\n'''
                pac += '''    return "PROXY ''' + pac_host + '''";\r\n'''
                pac += '''}\r\n'''
                self.request.sendall(pac.encode('ascii', 'ignore'))
                return
            elif hostname in self.shared_state.whitelist:
                _print('[>] [byp]', archived_url)
            elif hostname == 'web.archive.org':
                if path[:5] != '/web/':
                    # Launch settings if enabled.
                    if SETTINGS_PAGE:
                        return self.handle_settings(parsed.query)
                    else:
                        return self.send_error_page(http_version, 404, 'Not Found')
                else:
                    # Pass requests through to web.archive.org. Required for QUICK_IMAGES.
                    split = request_url.split('/')
                    effective_date = split[4]
                    archived_url = '/'.join(split[5:])
                    _print('[>] [QI]', archived_url)
            elif GEOCITIES_FIX and hostname == 'www.geocities.com':
                # Apply GEOCITIES_FIX and pass it through.
                _print('[>]', archived_url)
                split = archived_url.split('/')
                hostname = split[2] = 'www.oocities.org'
                request_url = '/'.join(split)
            else:
                # Get from the Wayback Machine.
                _print('[>]', archived_url)
                request_url = 'https://web.archive.org/web/{0}if_/{1}'.format(effective_date, archived_url)

            # Check Wayback Machine Availability API where applicable, to avoid archived 404 pages and other site errors.
            split = request_url.split('/')
            if split[2] == 'web.archive.org':
                # Remove extraneous :80 from URL.
                if ':' in split[5]:
                    if split[7][-3:] == ':80':
                        split[7] = split[7][:-3]
                elif split[5][-3:] == ':80':
                    split[5] = split[5][:-3]

                # Check availability LRU cache.
                availability_url = '/'.join(split[5:])
                new_url = self.shared_state.availability_cache.get(availability_url, None)
                if new_url:
                    # In cache => replace URL immediately.
                    request_url = new_url
                elif WAYBACK_API:
                    # Not in cache => contact API.
                    try:
                        availability_endpoint = 'https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14]
                        availability = json.loads(self.shared_state.http.request('GET', availability_endpoint, timeout=10, retries=1).data)
                        closest = availability.get('archived_snapshots', {}).get('closest', {})
                        new_date = closest.get('timestamp', None)
                    except:
                        _print('[!] Failed to fetch Wayback availability data')
                        new_date = None

                    if new_date and new_date != effective_date[:14]:
                        # Returned date is different.
                        new_url = closest['url']

                        # Add asset tag to the date.
                        split = new_url.split('/')
                        if len(effective_date) > 14:
                            split[4] += effective_date[14:]
                        else:
                            split[4] += 'if_'
                        new_url = '/'.join(split)

                        # Replace URL and add it to the availability cache.
                        request_url = self.shared_state.availability_cache[availability_url] = new_url

            # Start fetching the URL.
            retry = urllib3.util.retry.Retry(total=10, connect=10, read=5, redirect=0, backoff_factor=1)
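            # redirect=0 here and redirect=False on urlopen() make urllib3 hand
            # redirects back to us instead of following them, so the loop below can
            # translate archived redirects into redirects for the client.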
            while True:
                conn = self.shared_state.http.urlopen('GET', request_url, redirect=False, retries=retry, preload_content=False)

                # Check for redirects.
                destination = conn.get_redirect_location()
                if destination:
                    self.drain_conn(conn)
                    conn.release_conn()

                    # Check if the redirect goes to a different Wayback URL.
                    match = re.search('''(?:(?:https?:)?//web.archive.org)?/web/([^/]+/)(.+)''', destination)
                    if match:
                        archived_dest = self.sanitize_redirect(match.group(2))

                        # Check if the archived URL is different.
                        if archived_dest != archived_url:
                            # Remove extraneous :80 from URL.
                            archived_dest = re.sub('''^([^/]*//[^/]+):80''', '\\1', archived_dest)

                            # Add destination to availability cache and redirect the client.
                            _print('[r]', archived_dest)
                            self.shared_state.availability_cache[archived_dest] = 'http://web.archive.org/web/' + match.group(1) + archived_dest
                            return self.send_redirect_page(http_version, archived_dest, conn.status)

                    # Not an archived URL or same URL, redirect ourselves.
                    request_url = destination
                    continue

                # Wayback will add its JavaScript to anything it thinks is JavaScript.
                # If this is detected, redirect ourselves through the raw asset interface.
                content_type = conn.headers.get('Content-Type')
                guessed_content_type = conn.headers.get('X-Archive-Guessed-Content-Type')
                if not guessed_content_type:
                    guessed_content_type = content_type
                if 'javascript' in guessed_content_type:
                    match = re.match('''(https?://web\\.archive\\.org/web/[0-9]+)([^/]*)(.+)''', request_url)
                    if match and match.group(2) != 'im_':
                        self.drain_conn(conn)
                        conn.release_conn()
                        request_url = match.group(1) + 'im_' + match.group(3)
                        continue

                # This request can proceed.
                break
        except urllib3.exceptions.MaxRetryError as e:
            _print('[!] Fetch retries exceeded:', e.reason)
            return self.send_error_page(http_version, 504, 'Gateway Timeout')
        except:
            # Some other fetch exception has occurred.
            _print('[!] Fetch exception:')
            traceback.print_exc()
            return self.send_error_page(http_version, 502, 'Bad Gateway')

        # Check for HTTP errors.
        if conn.status != 200:
            if conn.status in (403, 404): # not found
                if self.guess_and_send_redirect(http_version, archived_url):
                    self.drain_conn(conn)
                    conn.release_conn()
                    return
            #elif conn.status in (301, 302): # redirect loop detection currently unused
            #    self.drain_conn(conn)
            #    conn.release_conn()
            #    return self.send_error_page(http_version, 508, 'Infinite Redirect Loop')
            if conn.status != 412: # tolerance exceeded has its own error message above
                _print('[!]', conn.status, conn.reason)
            # If the memento Link header is present, this is a website error
            # instead of a Wayback error. Pass it along if that's the case.
            if 'Link' not in conn.headers:
                self.drain_conn(conn)
                conn.release_conn()
                return self.send_error_page(http_version, conn.status, conn.reason)

        # Adjust content type.
        if content_type is None:
            content_type = 'text/html'
        elif not CONTENT_TYPE_ENCODING:
            idx = content_type.find(';')
            if idx > -1:
                content_type = content_type[:idx]

        # Set the archive mode.
        if GEOCITIES_FIX and hostname in ('www.oocities.org', 'www.oocities.com'):
            mode = 1 # oocities
        else:
            mode = 0 # Wayback Machine

        # Check content type to determine if this is HTML we need to patch.
        # Wayback will add its HTML to anything it thinks is HTML.
        if 'text/html' in guessed_content_type:
            # Some dynamically-generated links may end up pointing to
            # web.archive.org. Correct that by redirecting the Wayback
            # portion of the URL away if it ends up being HTML consumed
            # through the QUICK_IMAGES interface.
            if hostname == 'web.archive.org':
                self.drain_conn(conn)
                conn.release_conn()
                archived_url = '/'.join(request_url.split('/')[5:])
                _print('[r] [QI]', archived_url)
                return self.send_redirect_page(http_version, archived_url, 301)

            # Check if the date is within tolerance.
            if DATE_TOLERANCE is not None:
                match = re.search('''(?://web\\.archive\\.org|^)/web/([0-9]+)''', conn.geturl() or '')
                if match:
                    requested_date = match.group(1)
                    if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
                        self.drain_conn(conn)
                        conn.release_conn()
                        _print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
                        if not self.guess_and_send_redirect(http_version, archived_url):
                            self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
                        return

            # Consume all data.
            data = conn.read()
            conn.release_conn()

            # Patch the page.
            if mode == 0: # Wayback Machine
                # Check if this is a Wayback Machine page.
                if b'<title>Wayback Machine</title>' in data:
                    # Check if this is an exclusion (robots.txt?) error page.
                    if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data:
                        return self.send_error_page(http_version, 403, 'URL excluded')

                    # Check if this is a media playback iframe page.
                    # Some websites (especially ones that use frames)
                    # inexplicably render inside a media playback iframe.
                    # In that case, a simple redirect would result in a
                    # redirect loop, so fetch and render the URL instead.
                    match = re.search(b'''<iframe id="playback" src="((?:(?:https?:)?//web.archive.org)?/web/[^"]+)"''', data)
                    if match:
                        # Extract the content URL.
                        request_url = match.group(1).decode('ascii', 'ignore')
                        archived_url = '/'.join(request_url.split('/')[5:])

                        # Start fetching the URL.
                        _print('[f]', archived_url)
                        conn = self.shared_state.http.urlopen('GET', request_url, retries=retry, preload_content=False)
                        if conn.status != 200:
                            _print('[!]', conn.status, conn.reason)
                            # If the memento Link header is present, this is a website error
                            # instead of a Wayback error. Pass it along if that's the case.
                            if 'Link' not in conn.headers:
                                self.drain_conn(conn)
                                conn.release_conn()
                                return self.send_error_page(http_version, conn.status, conn.reason)

                        # Identify content type so we don't modify non-HTML content.
                        content_type = conn.headers.get('Content-Type')
                        if not CONTENT_TYPE_ENCODING:
                            idx = content_type.find(';')
                            if idx > -1:
                                content_type = content_type[:idx]
                        if 'text/html' in content_type:
                            # Consume all data and proceed with patching the page.
                            data = conn.read()
                            conn.release_conn()
                        else:
                            # Pass non-HTML data through.
                            return self.send_passthrough(conn, http_version, content_type, request_url)

                # Check if this is a Wayback Machine redirect page.
                if b'<title></title>' in data and b'<span class="label style-scope media-button"><!---->Wayback Machine<!----></span>' in data:
                    match = re.search(b'''<p class="impatient"><a href="(?:(?:https?:)?//web\\.archive\\.org)?/web/([^/]+)/([^"]+)">Impatient\\?</a></p>''', data)
                    if match:
                        # Sanitize the URL.
                        archived_url = self.sanitize_redirect(match.group(2).decode('ascii', 'ignore'))

                        # Add URL to the date LRU cache.
                        self.shared_state.date_cache[str(effective_date) + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')

                        # Get the original HTTP redirect code.
                        match = re.search(b'''<p class="code shift red">Got an HTTP ([0-9]+)''', data)
                        try:
                            redirect_code = int(match.group(1))
                        except:
                            redirect_code = 302

                        # Redirect client to the URL.
                        _print('[r]', archived_url)
                        return self.send_redirect_page(http_version, archived_url, redirect_code)

                # Remove pre-toolbar scripts and CSS.
                data = re.sub(b'''(?:<!-- is_embed=True -->\\r?\\n?)?<script (?:type="text/javascript" )?src="[^"]*/_static/js/.*<!-- End Wayback Rewrite JS Include -->\\r?\\n''', b'', data, count=1, flags=re.S)

                # Remove toolbar. The if_ asset tag serves no toolbar, but we remove it just in case.
                data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, count=1, flags=re.S)

                # Remove comments on footer.
                data = re.sub(b'''<!--\\r?\\n\\s+FILE ARCHIVED .*$''', b'', data, flags=re.S)

                # Fix base tag.
                data = re.sub(b'''(<base\\s+[^>]*href=["']?)(?:(?:https?:)?//web.archive.org)?/web/[^/]+/(?:[^:/]+://)?''', b'\\1http://', data, flags=re.I + re.S)

                # Remove extraneous :80 from links.
                data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^/:]+)://([^/:]+):80/', b'\\1\\2/\\3://\\4/', data)

                # Fix links.
                if QUICK_IMAGES:
                    # QUICK_IMAGES works by intercepting asset URLs (those
                    # with a date code ending in im_, js_...) and letting the
                    # proxy pass them through. This may reduce load time
                    # because Wayback doesn't have to hunt down the closest
                    # copy of that asset to DATE, as those URLs have specific
                    # date codes. This taints the HTML with web.archive.org
                    # URLs. QUICK_IMAGES=2 uses the original URLs with an added
                    # username:password, which taints less but is not supported
                    # by all browsers - IE notably kills the whole page if it
                    # sees an iframe pointing to an invalid URL.
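                    # For example (illustrative dates): an asset link such as
                    #   /web/19990101000000im_/http://example.com/logo.gif
                    # becomes http://web.archive.org/web/19990101000000im_/http://example.com/logo.gif
                    # with QUICK_IMAGES=1, or http://19990101000000:im_@example.com/logo.gif
                    # with QUICK_IMAGES=2 (date and asset tag travel as username:password).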
                    def filter_asset(match):
                        if match.group(2) in (None, b'if_', b'fw_'): # non-asset URL
                            return match.group(3) == b'https://' and b'http://' or match.group(3) # convert secure non-asset URLs to regular HTTP
                        asset_type = match.group(2)
                        if asset_type == b'js_': # cut down on the JavaScript detector's second request
                            asset_type = b'im_'
                        if QUICK_IMAGES == 2:
                            return b'http://' + match.group(1) + b':' + asset_type + b'@'
                        else:
                            return b'http://web.archive.org/web/' + match.group(1) + asset_type + b'/' + match.group(3)
                    data = re.sub(b'(?:(?:https?:)?//web.archive.org)?/web/([0-9]+)([a-z]+_)?/([^:/]+:(?://)?)', filter_asset, data)
                else:
                    # Remove asset URLs while simultaneously adding them to the date LRU cache
                    # with their respective date and converting secure URLs to regular HTTP.
                    def add_to_date_cache(match):
                        orig_url = match.group(2)
                        if orig_url[:8] == b'https://':
                            orig_url = b'http://' + orig_url[8:]
                        self.shared_state.date_cache[str(effective_date) + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore').replace('js_', 'im_')
                        return orig_url
                    data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
            elif mode == 1: # oocities
                # Remove viewport/cache-control/max-width code from the header.
                data = re.sub(b'''^.*?\n\n''', b'', data, flags=re.S)

                # Remove archive notice and tracking code from the footer.
                data = re.sub(b'''<style> \n.zoomout { -webkit-transition: .*$''', b'', data, flags=re.S)

                # Remove clearly labeled snippets from Geocities.
                data = re.sub(b'''^.*<\\!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<\\!-- following code added by server\\. PLEASE REMOVE -->.*<\\!-- preceding code added by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
                data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)

                # Fix links.
                data = re.sub(b'''//([^\\.]*\\.)?oocities\\.com/''', b'//\\1geocities.com/', data, flags=re.S)

            # Send patched page.
            self.send_response_headers(conn, http_version, content_type, request_url, content_length=len(data))
            self.request.sendall(data)
            self.request.close()
        else:
            # Pass non-HTML data through.
            self.send_passthrough(conn, http_version, content_type, request_url)

    def send_passthrough(self, conn, http_version, content_type, request_url):
        """Pass data through to the client unmodified (save for our headers)."""
        self.send_response_headers(conn, http_version, content_type, request_url, content_length=True)
        for data in conn.stream(1024):
            self.request.sendall(data)
        conn.release_conn()
        self.request.close()

    def send_response_headers(self, conn, http_version, content_type, request_url, content_length=False):
        """Generate and send the response headers."""
        # Pass the HTTP version, and error code if there is one.
        response = '{0} {1} {2}'.format(http_version, conn.status, conn.reason.replace('\n', ' '))

        # Add Content-Type, Content-Length and the caching ETag.
        response += '\r\nContent-Type: ' + content_type
        if type(content_length) == int:
            response += '\r\nContent-Length: ' + str(content_length)
            content_length = False # don't pass the original length through
        response += '\r\nETag: "' + request_url.replace('"', '') + '"'
        response += '\r\nConnection: close' # helps with IE6 trying to use proxy keep alive and holding half-open connections

        # Pass X-Archive-Orig-* (and Content-Length if requested) headers through.
        for header in conn.headers:
            if header.find('X-Archive-Orig-') == 0:
                orig_header = header[15:]
                # Skip headers which may affect client behavior.
                if orig_header.lower() not in ('connection', 'location', 'content-type', 'content-length', 'etag', 'authorization', 'set-cookie'):
                    response += '\r\n' + orig_header + ': ' + conn.headers[header]
            elif content_length and header.lower() == 'content-length':
                response += '\r\n' + header + ': ' + conn.headers[header]

        # Finish and send the response headers.
        response += '\r\n\r\n'
        self.request.sendall(response.encode('utf8', 'ignore'))

    def send_error_page(self, http_version, code, reason, extra=''):
        """Generate an error page."""
        # Get a description for this error code.
        if code in (404, 508): # page not archived or redirect loop
            description = 'This page may not be archived by the Wayback Machine.'
        elif code == 403: # not crawled due to exclusion
            description = 'This page was not archived due to a Wayback Machine exclusion.'
        elif code == 501: # method not implemented
            description = 'WaybackProxy only implements the GET method. Your browser sent a request with the {0} method.'.format(extra.upper())
        elif code == 502: # exception
            description = 'This page could not be fetched due to an unknown error.'
        elif code == 504: # timeout
            description = 'This page could not be fetched due to a Wayback Machine server error.'
        elif code == 412: # outside of tolerance
            description = 'The earliest snapshot for this page is outside of the configured tolerance interval.'
        elif code == 400 and reason == 'Host header missing': # no host header in transparent mode
            description = 'WaybackProxy\'s transparent mode requires an HTTP/1.1-compliant client.'
        else: # another error
            description = 'Unknown error. The Wayback Machine may be experiencing technical difficulties.'

        # Read error page file.
        try:
            with open('error.html', 'r', encoding='utf8', errors='ignore') as f:
                error_page = f.read()
        except:
            # Just send the code and reason as a backup.
            error_page = '${code} ${reason}'

        # Format error page template.
        signature = self.signature()
        error_page = string.Template(error_page).substitute(**locals()).encode('utf8', 'ignore')
        error_page_len = len(error_page)

        # Send formatted error page and stop.
        self.request.sendall(
            '{http_version} {code} {reason}\r\n'
            'Content-Type: text/html\r\n'
            'Content-Length: {error_page_len}\r\n'
            '\r\n'
            .format(**locals()).encode('utf8', 'ignore')
        )
        self.request.sendall(error_page)
        self.request.close()

    def send_redirect_page(self, http_version, target, code=302):
        """Generate a redirect page."""
        # Make redirect page.
        redirect_page = '<html><head><title>Redirect</title><meta http-equiv="refresh" content="0;url=${target}"></head><body><p>If you are not redirected, <a href="${target}">click here</a>.</p></body></html>'
        redirect_page = string.Template(redirect_page).substitute(**locals()).encode('utf8', 'ignore')
        redirect_page_len = len(redirect_page)

        # Send redirect page and stop.
        self.request.sendall(
            '{http_version} {code} Found\r\n'
            'Location: {target}\r\n'
            'Content-Type: text/html\r\n'
            'Content-Length: {redirect_page_len}\r\n'
            '\r\n'
            .format(**locals()).encode('utf8', 'ignore')
        )
        self.request.sendall(redirect_page)
        self.request.close()

    def guess_and_send_redirect(self, http_version, guess_url):
        # Heuristically determine the static URL for some redirect scripts.
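        # For example (illustrative), a request for
        #   http://example.com/redirect.cgi?url=http%3A%2F%2Fwww.target.com%2Fpage.html
        # that came back 403/404 from the archive can be turned into a client
        # redirect straight to http://www.target.com/page.html.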
        parsed = urllib.parse.urlparse(guess_url)
        match = re.search('''(?:^|&)[^=]+=((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)?(?:%2F|/)[^&]+)''', parsed.query, re.I) # URL in query parameters
        if not match:
            full_path = parsed.path
            if parsed.query:
                full_path += '?' + parsed.query
            match = re.search('''((?:https?(?:%3A|:)(?:%2F|/)|www[0-9]*\\.[^/%]+)(?:(?:%2F|/).+|$))''', full_path, re.I) # URL in path or full query
        if match: # found URL
            # Decode and sanitize the URL.
            new_url = self.sanitize_redirect(urllib.parse.unquote_plus(match.group(1)))

            # Redirect client to the URL.
            _print('[r] [g]', new_url)
            self.send_redirect_page(http_version, new_url)
            return True
        return False

    def handle_settings(self, query):
        """Generate the settings page."""
        global DATE, DATE_TOLERANCE, GEOCITIES_FIX, QUICK_IMAGES, WAYBACK_API, CONTENT_TYPE_ENCODING, SILENT, SETTINGS_PAGE

        if query != '': # handle any parameters that may have been sent
            parsed = urllib.parse.parse_qs(query)
            if 'date' in parsed and 'dateTolerance' in parsed:
                if DATE != parsed['date'][0]:
                    DATE = parsed['date'][0]
                    self.shared_state.date_cache.clear()
                    self.shared_state.availability_cache.clear()
                if DATE_TOLERANCE != parsed['dateTolerance'][0]:
                    DATE_TOLERANCE = parsed['dateTolerance'][0]
                GEOCITIES_FIX = 'gcFix' in parsed
                QUICK_IMAGES = 'quickImages' in parsed
                CONTENT_TYPE_ENCODING = 'ctEncoding' in parsed

        # send the page and stop
        settingspage = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
        settingspage += '<html><head><title>WaybackProxy Settings</title></head><body><p><b>'
        settingspage += self.signature()
        settingspage += '</b></p><form method="get" action="/">'
        settingspage += '<p>Date to get pages from: <input type="text" name="date" size="8" value="'
        settingspage += str(DATE)
        settingspage += '"><p>Date tolerance: <input type="text" name="dateTolerance" size="8" value="'
        settingspage += str(DATE_TOLERANCE)
        settingspage += '"> days<br><input type="checkbox" name="gcFix"'
        if GEOCITIES_FIX:
            settingspage += ' checked'
        settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
        if QUICK_IMAGES:
            settingspage += ' checked'
        settingspage += '> Quick images<br><input type="checkbox" name="ctEncoding"'
        if CONTENT_TYPE_ENCODING:
            settingspage += ' checked'
        settingspage += '> Encoding in Content-Type</p><p><input type="submit" value="Save"></p></form></body></html>'
        self.request.send(settingspage.encode('utf8', 'ignore'))
        self.request.close()

    def sanitize_redirect(self, url):
        """Sanitize a URL for client-side redirection."""
        if url[0] != '/' and '://' not in url:
            # Add protocol if the URL is absolute but missing a protocol.
            return 'http://' + url
        elif url[:8].lower() == 'https://':
            # Convert secure URLs to regular HTTP.
            return 'http://' + url[8:]
        else:
            # No changes required.
            return url

    def signature(self):
        """Return the server signature."""
        return 'WaybackProxy on {0}'.format(socket.gethostname())

    def wayback_to_datetime(self, date):
        """Convert a Wayback format date string to a datetime.datetime object."""
        date = str(date)
        fmt = '%Y%m%d%H%M%S'
        fmt_len = 14
        # Try progressively shorter formats (down to just %Y) until one parses,
        # so partial date codes such as 199901 or 1999 are accepted too.
        while fmt:
            try:
                return datetime.datetime.strptime(date[:fmt_len], fmt)
            except:
                fmt = fmt[:-2]
                fmt_len -= 2

    def drain_conn(self, conn):
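        """Drain a response so its connection can be reused; fall back to read()
        on urllib3 versions that predate drain_conn()."""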
        getattr(conn, 'drain_conn', conn.read)()

print_lock = threading.Lock()
def _print(*args, **kwargs):
    """Logging function."""
    if SILENT:
        return
    with print_lock:
        print(*args, **kwargs, flush=True)

def main():
    """Starts the server."""
    server = ThreadingTCPServer(('', LISTEN_PORT), Handler)
    _print('[-] Now listening on port', LISTEN_PORT)
    _print('[-] Date set to', DATE)
    try:
        server.serve_forever()
    except KeyboardInterrupt: # Ctrl+C to stop
        pass

if __name__ == '__main__':
    main()