Переглянути джерело

Add WAYBACK_API option, which queries the Wayback Machine Availability API to find the closest snapshot to a given URL instead of just trying luck.

RichardG867 5 роки тому
батько
коміт
4b96987715
2 змінених файлів з 64 додано та 19 видалено
  1. 11 3
      config.py
  2. 53 16
      waybackproxy.py

+ 11 - 3
config.py

@@ -12,13 +12,21 @@ DATE_TOLERANCE = 365
 # Send Geocities requests to oocities.org if set to True.
 GEOCITIES_FIX = True
 
-# Use the Wayback-tampered URL as a shortcut when loading images.
-# May result in faster loads, but all images will point to 
-# http://web.archive.org/... as a result. Set this value to 2 to enable an
+# Use the original Wayback Machine URL as a shortcut when loading images.
+# May result in faster page loads, but all images will point to
+# http://web.archive.org/... as a side effect. Set this value to 2 to enable an
 # experimental mode using authentication on top of the original URLs instead
 # (which is not supported by Internet Explorer and some other browsers).
 QUICK_IMAGES = True
 
+# Use the Wayback Machine Availability API to find the closest available
+# snapshot to the desired date, instead of directly requesting that date. Helps
+# in situations where an image returns a server error on the desired date, but
+# is available at an earlier date. As a side effect, pages will take longer to
+# load due to the added API call. This option has no effect when QUICK_IMAGES is
+# used alongside the PAC file.
+WAYBACK_API = True
+
 # Allow the Content-Type header to contain an encoding. Some old browsers
 # (Mosaic?) don't understand that and fail to load anything - set this to
 # False if you're using one of them.

+ 53 - 16
waybackproxy.py

@@ -1,10 +1,13 @@
 #!/usr/bin/env python3
-import base64, datetime, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
+import base64, datetime, json, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
 from config import *
 
 # internal LRU dictionary for preserving URLs on redirect
 date_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024)
 
+# internal LRU dictionary for date availability
+availability_cache = lrudict.LRUDict(maxduration=86400, maxsize=1024) if WAYBACK_API else None
+
 class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
 	"""TCPServer with ThreadingMixIn added."""
 	pass
@@ -13,7 +16,7 @@ class Handler(socketserver.BaseRequestHandler):
 	"""Main request handler."""
 	def handle(self):
 		"""Handle a request."""
-		global DATE
+		global availability_cache
 		
 		# readline is pretty convenient
 		f = self.request.makefile()
@@ -77,6 +80,7 @@ class Handler(socketserver.BaseRequestHandler):
 		if auth:
 			effective_date = auth.replace(':', '')
 
+		# effectively handle the request
 		try:
 			if path in pac_file_paths:
 				# PAC file to bypass QUICK_IMAGES requests
@@ -105,14 +109,6 @@ class Handler(socketserver.BaseRequestHandler):
 					# required for QUICK_IMAGES
 					archived_url = '/'.join(request_url.split('/')[5:])
 					_print('[>] [QI] {0}'.format(archived_url))
-					try:
-						conn = urllib.request.urlopen(request_url)
-					except urllib.error.HTTPError as e:
-						if e.code == 404:
-							# Try this file on another date, might be redundant
-							return self.redirect_page(http_version, archived_url)
-						else:
-							raise e
 			elif GEOCITIES_FIX and hostname == 'www.geocities.com':
 				# apply GEOCITIES_FIX and pass it through
 				_print('[>] {0}'.format(archived_url))
@@ -120,15 +116,55 @@ class Handler(socketserver.BaseRequestHandler):
 				split = archived_url.split('/')
 				hostname = split[2] = 'www.oocities.org'
 				request_url = '/'.join(split)
-				
-				conn = urllib.request.urlopen(request_url)
 			else:
 				# get from Wayback
 				_print('[>] {0}'.format(archived_url))
 
-				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)
+				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)				
+
+			if availability_cache is not None:
+				# are we requesting from Wayback?
+				split = request_url.split('/')
+
+				# if so, get the closest available date from Wayback's API, to avoid archived 404 pages and other site errors
+				if split[2] == 'web.archive.org':
+					# remove extraneous :80 from URL
+					if ':' in split[5]:
+						if split[7][-3:] == ':80':
+							split[7] = split[7][:-3]
+					elif split[5][-3:] == ':80':
+						split[5] = split[5][:-3]
+
+					# check availability LRU cache
+					availability_url = '/'.join(split[5:])
+					new_url = availability_cache.get(availability_url, None)
+					if new_url:
+						# in cache => replace URL immediately
+						request_url = new_url
+					else:
+						# not in cache => contact API
+						try:
+							availability = json.loads(urllib.request.urlopen('https://archive.org/wayback/available?url=' + urllib.parse.quote_plus(availability_url) + '&timestamp=' + effective_date[:14], timeout=10).read())
+							closest = availability.get('archived_snapshots', {}).get('closest', {})
+							new_date = closest.get('timestamp', None)
+						except:
+							_print('[!] Failed to fetch Wayback availability data')
+							new_date = None
+
+						if new_date and new_date != effective_date[:14]:
+							# returned date is different
+							new_url = closest['url']
+
+							# add asset tag if one is present in the original URL
+							if len(effective_date) > 14:
+								split = new_url.split('/')
+								split[4] += effective_date[14:]
+								new_url = '/'.join(split)
+
+							# replace URL and add it to the availability cache
+							request_url = availability[availability_url] = new_url
 
-				conn = urllib.request.urlopen(request_url)
+			conn = urllib.request.urlopen(request_url)
 		except urllib.error.HTTPError as e:
 			# an error has been found
 
@@ -138,7 +174,6 @@ class Handler(socketserver.BaseRequestHandler):
 				if not match:
 					match = re.search('''(?:\?|&)(?:[^=]+)=((?:http(?:%3A|:)(?:%2F|/)|www(?:[0-9]+)?\.(?:[^/%]+))?(?:%2F|/)[^&]+)''', archived_url, re.I)
 				if match:
-					print(match.groups())
 					# we found it
 					new_url = urllib.parse.unquote_plus(match.group(1))
 					# add protocol if the URL is absolute but missing a protocol
@@ -269,7 +304,8 @@ class Handler(socketserver.BaseRequestHandler):
 						QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
 					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)
 				else:
-					#data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
+					# Remove asset URLs while simultaneously adding them to the
+					# LRU cache with their respective date.
 					def add_to_date_cache(match):
 						orig_url = match.group(2)
 						date_cache[effective_date + '\x00' + orig_url.decode('ascii', 'ignore')] = match.group(1).decode('ascii', 'ignore')
@@ -397,6 +433,7 @@ class Handler(socketserver.BaseRequestHandler):
 			if 'date' in parsed and DATE != parsed['date'][0]:
 				DATE = parsed['date'][0]
 				date_cache.clear()
+				availability_cache.clear()
 			if 'dateTolerance' in parsed and DATE_TOLERANCE != parsed['dateTolerance'][0]:
 				DATE_TOLERANCE = parsed['dateTolerance'][0]
 			GEOCITIES_FIX = 'gcFix' in parsed