
Add LRU cache for URL-date mappings, helps deal with 404 redirects as well as QUICK_IMAGES = False

RichardG867 5 years ago
Parent
Commit
690626ebbb
3 changed files with 95 additions and 6 deletions
  1. README.md (+0 -1)
  2. lrudict.py (+76 -0)
  3. waybackproxy.py (+19 -5)
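In outline, the change keeps a small LRU mapping from (requested date, URL) to the snapshot date the Wayback Machine actually served, so a follow-up request for that URL can be sent with the right date instead of hitting a 404. Below is a minimal sketch of the keying scheme, reusing the names from the patch (LRUDict, date_cache, effective_date, archived_url); the helper functions are illustrative only and do not appear in the patch itself:

from lrudict import LRUDict

# (requested date, URL) -> date actually served; small and short-lived, as in the patch
date_cache = LRUDict(maxduration=60, maxsize=1024)

def cache_key(effective_date, archived_url):
    # NUL cannot occur in either field, so it is a safe separator
    return effective_date + '\x00' + archived_url

def remember_date(effective_date, archived_url, served_date):
    # called when a Wayback redirect or rewritten asset link reveals the real date
    date_cache[cache_key(effective_date, archived_url)] = served_date

def lookup_date(effective_date, archived_url):
    # prefer a previously learned date, fall back to the configured one
    return date_cache.get(cache_key(effective_date, archived_url), effective_date)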

+ 0 - 1
README.md

@@ -17,5 +17,4 @@ WaybackProxy is a HTTP proxy that sends all requests through the [Internet Archi
 ## Known issues and limitations
 
 * The Wayback Machine itself is not 100% reliable, especially when it comes to images on archived pages.
-* Dates are not preserved on redirect, which can lead to 404 errors on a few websites.
 * WaybackProxy is not a generic proxy. The POST and CONNECT methods are not implemented.

+ 76 - 0
lrudict.py

@@ -0,0 +1,76 @@
+import collections, time
+
+# http://code.activestate.com/recipes/580644-lru-dictionary/
+
+class LRUDict(collections.OrderedDict):
+	'''A dict that can discard least-recently-used items, either by maximum capacity
+	or by time to live.
+	An item's ttl is refreshed (i.e. the item is considered "used") by direct access
+	via [] or get() only, not by iterating over the whole collection with items(),
+	for example.
+	Expired entries are only purged after insertions or changes. Either call purge()
+	manually or check an item's ttl with ttl() if that's unacceptable.
+	'''
+	def __init__(self, *args, maxduration=None, maxsize=128, **kwargs):
+		'''Same arguments as OrderedDict, with two additions:
+		maxduration: number of seconds entries are kept. 0 or None means no time limit.
+		maxsize: maximum number of entries kept.'''
+		super().__init__(*args, **kwargs)
+		self.maxduration = maxduration
+		self.maxsize = maxsize
+		self.purge()
+
+	def purge(self):
+		"""Removes expired or overflowing entries."""
+		if self.maxsize:
+			# pop until maximum capacity is reached
+			overflowing = max(0, len(self) - self.maxsize)
+			for _ in range(overflowing):
+				self.popitem(last=False)
+		if self.maxduration:
+			# expiration limit
+			limit = time.time() - self.maxduration
+			# as long as there are still items in the dictionary
+			while self:
+				# look at the oldest (front)
+				_, lru = next(iter(super().values()))
+				# if it is within the timelimit, we're fine
+				if lru > limit:
+					break
+				# otherwise continue to pop the front
+				self.popitem(last=False)
+
+	def __getitem__(self, key):
+		# retrieve item
+		value = super().__getitem__(key)[0]
+		# update lru time
+		super().__setitem__(key, (value, time.time()))
+		self.move_to_end(key)
+		return value
+
+	def get(self, key, default=None):
+		try:
+			return self[key]
+		except KeyError:
+			return default
+
+	def ttl(self, key):
+		'''Returns the number of seconds this item will live.
+		The item might still be deleted if maxsize is reached.
+		The time to live can be negative, as for expired items
+		that have not been purged yet.'''
+		if self.maxduration:
+			lru = super().__getitem__(key)[1]
+			return self.maxduration - (time.time() - lru)
+
+	def __setitem__(self, key, value):
+		super().__setitem__(key, (value, time.time()))
+		self.purge()
+		
+	def items(self):
+		# remove ttl from values
+		return ((k, v) for k, (v, _) in super().items())
+	
+	def values(self):
+		# remove ttl from values
+		return (v for v, _ in super().values())

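For reference, a short usage sketch of the LRUDict class added above; the keys, values and limits are made up for illustration:

import time
from lrudict import LRUDict

d = LRUDict(maxduration=2, maxsize=3)
d['a'] = 1
d['b'] = 2
d['c'] = 3
d['d'] = 4                 # fourth insert exceeds maxsize, evicting the oldest key 'a'
print(list(d.items()))     # [('b', 2), ('c', 3), ('d', 4)]
print(round(d.ttl('b')))   # roughly 2 seconds of life left

time.sleep(3)
d['e'] = 5                 # insertion triggers purge(); the expired entries are dropped
print(list(d.keys()))      # ['e']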
+ 19 - 5
waybackproxy.py

@@ -1,7 +1,10 @@
 #!/usr/bin/env python
-import base64, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
+import base64, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
 from config import *
 
+# internal LRU dictionary for preserving URL-date mappings across redirects
+date_cache = lrudict.LRUDict(maxduration=60, maxsize=1024)
+
 class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
 	"""TCPServer with ThreadingMixIn added."""
 	pass
@@ -54,7 +57,7 @@ class Handler(socketserver.BaseRequestHandler):
 			elif ll[:21] == 'authorization: basic ':
 				# asset date code passed as username:password
 				auth = base64.b64decode(ll[21:])
-		
+
 		try:
 			if path in ('/proxy.pac', '/wpad.dat', '/wpad.da'):
 				# PAC file to bypass QUICK_IMAGES requests
@@ -107,6 +110,9 @@ class Handler(socketserver.BaseRequestHandler):
 				# get from Wayback
 				_print('[>] {0}'.format(archived_url))
 
+				# get cached date for redirects
+				effective_date = date_cache.get(effective_date + '\x00' + archived_url, effective_date)
+
 				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)
 
 				conn = urllib.request.urlopen(request_url)
@@ -173,7 +179,7 @@ class Handler(socketserver.BaseRequestHandler):
 						data = conn.read()
 
 				if b'<title></title>' in data and b'<h1><span>Internet Archive\'s Wayback Machine</span></h1>' in data:
-					match = re.search(b'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/(?:[^/]+)/([^"]+)">Impatient\?</a></p>', data)
+					match = re.search(b'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/([^/]+)/([^"]+)">Impatient\?</a></p>', data)
 					if match:
 						# wayback redirect page, follow it
 						match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
@@ -181,7 +187,8 @@ class Handler(socketserver.BaseRequestHandler):
 							redirect_code = int(match2.group(1))
 						except:
 							redirect_code = 302
-						archived_url = match.group(1).decode('ascii', 'ignore')
+						archived_url = match.group(2).decode('ascii', 'ignore')
+						date_cache[effective_date + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')
 						print('[r]', archived_url)
 						return self.redirect_page(http_version, archived_url, redirect_code)
 
@@ -212,7 +219,14 @@ class Handler(socketserver.BaseRequestHandler):
 						QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
 					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)
 				else:
-					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
+					#data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
+					def add_to_date_cache(match):
+						orig_url = match.group(2)
+						new_date = match.group(1)
+						if len(new_date) > 14: # only cache asset URLs
+							date_cache[effective_date + '\x00' + orig_url.decode('ascii', 'ignore')] = new_date.decode('ascii', 'ignore')
+						return orig_url
+					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/([^"\'#<>]+)', add_to_date_cache, data)
 			elif mode == 1: # oocities
 				# viewport/cache-control/max-width code (header)
 				data = re.sub(b'^(?:.*?)\n\n', b'', data, flags=re.S)