
Add LRU cache for URL-date mappings, helps deal with 404 redirects as well as QUICK_IMAGES = False

RichardG867 5 years ago
Parent
Commit
690626ebbb
3 changed files with 95 additions and 6 deletions
  1. README.md (+0 -1)
  2. lrudict.py (+76 -0)
  3. waybackproxy.py (+19 -5)
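In outline, the change keeps a small LRU mapping from (requested date, URL) to the snapshot date the Wayback Machine actually served, so a follow-up request for that URL can be sent with the right date instead of hitting a 404. Below is a minimal sketch of the keying scheme, reusing the names from the patch (LRUDict, date_cache, effective_date, archived_url); the helper functions are illustrative only and do not appear in the patch itself:

from lrudict import LRUDict

# (requested date, URL) -> date actually served; small and short-lived, as in the patch
date_cache = LRUDict(maxduration=60, maxsize=1024)

def cache_key(effective_date, archived_url):
    # NUL cannot occur in either field, so it is a safe separator
    return effective_date + '\x00' + archived_url

def remember_date(effective_date, archived_url, served_date):
    # called when a Wayback redirect or rewritten asset link reveals the real date
    date_cache[cache_key(effective_date, archived_url)] = served_date

def lookup_date(effective_date, archived_url):
    # prefer a previously learned date, fall back to the configured one
    return date_cache.get(cache_key(effective_date, archived_url), effective_date)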

+ 0 - 1
README.md

@@ -17,5 +17,4 @@ WaybackProxy is a HTTP proxy that sends all requests through the [Internet Archi
 ## Known issues and limitations
 
 * The Wayback Machine itself is not 100% reliable, especially when it comes to images on archived pages.
-* Dates are not preserved on redirect, which can lead to 404 errors on a few websites.
 * WaybackProxy is not a generic proxy. The POST and CONNECT methods are not implemented.

+ 76 - 0
lrudict.py

@@ -0,0 +1,76 @@
+import collections, time
+
+# http://code.activestate.com/recipes/580644-lru-dictionary/
+
+class LRUDict(collections.OrderedDict):
+	'''A dict that can discard least-recently-used items, either by maximum capacity
+	or by time to live.
+	An item's ttl is refreshed (i.e. the item is considered "used") by direct access
+	via [] or get() only, not by iterating over the whole collection with items(),
+	for example.
+	Expired entries are only purged after insertions or changes. Either call purge()
+	manually or check an item's ttl with ttl() if that's unacceptable.
+	'''
+	def __init__(self, *args, maxduration=None, maxsize=128, **kwargs):
+		'''Same arguments as OrderedDict, with two additions:
+		maxduration: number of seconds entries are kept. 0 or None means no time limit.
+		maxsize: maximum number of entries kept.'''
+		super().__init__(*args, **kwargs)
+		self.maxduration = maxduration
+		self.maxsize = maxsize
+		self.purge()
+
+	def purge(self):
+		"""Removes expired or overflowing entries."""
+		if self.maxsize:
+			# pop until maximum capacity is reached
+			overflowing = max(0, len(self) - self.maxsize)
+			for _ in range(overflowing):
+				self.popitem(last=False)
+		if self.maxduration:
+			# expiration limit
+			limit = time.time() - self.maxduration
+			# as long as there are still items in the dictionary
+			while self:
+				# look at the oldest (front)
+				_, lru = next(iter(super().values()))
+				# if it is within the timelimit, we're fine
+				if lru > limit:
+					break
+				# otherwise continue to pop the front
+				self.popitem(last=False)
+
+	def __getitem__(self, key):
+		# retrieve item
+		value = super().__getitem__(key)[0]
+		# update lru time
+		super().__setitem__(key, (value, time.time()))
+		self.move_to_end(key)
+		return value
+
+	def get(self, key, default=None):
+		try:
+			return self[key]
+		except KeyError:
+			return default
+
+	def ttl(self, key):
+		'''Returns the number of seconds this item will live.
+		The item might still be deleted if maxsize is reached.
+		The time to live can be negative, as for expired items
+		that have not been purged yet.'''
+		if self.maxduration:
+			lru = super().__getitem__(key)[1]
+			return self.maxduration - (time.time() - lru)
+
+	def __setitem__(self, key, value):
+		super().__setitem__(key, (value, time.time()))
+		self.purge()
+		
+	def items(self):
+		# remove ttl from values
+		return ((k, v) for k, (v, _) in super().items())
+	
+	def values(self):
+		# remove ttl from values
+		return (v for v, _ in super().values())

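For reference, a short usage sketch of the LRUDict class added above; the keys, values and limits are made up for illustration:

import time
from lrudict import LRUDict

d = LRUDict(maxduration=2, maxsize=3)
d['a'] = 1
d['b'] = 2
d['c'] = 3
d['d'] = 4                 # fourth insert exceeds maxsize, evicting the oldest key 'a'
print(list(d.items()))     # [('b', 2), ('c', 3), ('d', 4)]
print(round(d.ttl('b')))   # roughly 2 seconds of life left

time.sleep(3)
d['e'] = 5                 # insertion triggers purge(); the expired entries are dropped
print(list(d.keys()))      # ['e']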
+ 19 - 5
waybackproxy.py

@@ -1,7 +1,10 @@
 #!/usr/bin/env python
-import base64, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
+import base64, lrudict, re, socket, socketserver, sys, threading, urllib.request, urllib.error, urllib.parse
 from config import *
 
+# internal LRU dictionary for preserving URL-date mappings across redirects
+date_cache = lrudict.LRUDict(maxduration=60, maxsize=1024)
+
 class ThreadingTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
 	"""TCPServer with ThreadingMixIn added."""
 	pass
@@ -54,7 +57,7 @@ class Handler(socketserver.BaseRequestHandler):
 			elif ll[:21] == 'authorization: basic ':
 				# asset date code passed as username:password
 				auth = base64.b64decode(ll[21:])
-		
+
 		try:
 			if path in ('/proxy.pac', '/wpad.dat', '/wpad.da'):
 				# PAC file to bypass QUICK_IMAGES requests
@@ -107,6 +110,9 @@ class Handler(socketserver.BaseRequestHandler):
 				# get from Wayback
 				_print('[>] {0}'.format(archived_url))
 
+				# get cached date for redirects
+				effective_date = date_cache.get(effective_date + '\x00' + archived_url, effective_date)
+
 				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)
 
 				conn = urllib.request.urlopen(request_url)
@@ -173,7 +179,7 @@ class Handler(socketserver.BaseRequestHandler):
 						data = conn.read()
 
 				if b'<title></title>' in data and b'<h1><span>Internet Archive\'s Wayback Machine</span></h1>' in data:
-					match = re.search(b'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/(?:[^/]+)/([^"]+)">Impatient\?</a></p>', data)
+					match = re.search(b'<p class="impatient"><a href="(?:(?:http(?:s)?:)?//web\.archive\.org)?/web/([^/]+)/([^"]+)">Impatient\?</a></p>', data)
 					if match:
 						# wayback redirect page, follow it
 						match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
@@ -181,7 +187,8 @@ class Handler(socketserver.BaseRequestHandler):
 							redirect_code = int(match2.group(1))
 						except:
 							redirect_code = 302
-						archived_url = match.group(1).decode('ascii', 'ignore')
+						archived_url = match.group(2).decode('ascii', 'ignore')
+						date_cache[effective_date + '\x00' + archived_url] = match.group(1).decode('ascii', 'ignore')
 						print('[r]', archived_url)
 						return self.redirect_page(http_version, archived_url, redirect_code)
 
@@ -212,7 +219,14 @@ class Handler(socketserver.BaseRequestHandler):
 						QUICK_IMAGES == 2 and b'\\3://\\1:\\2@' or b'http://web.archive.org/web/\\1\\2/\\3://', data)
 					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([0-9]+)/', b'', data)
 				else:
-					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
+					#data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/', b'', data)
+					def add_to_date_cache(match):
+						orig_url = match.group(2)
+						new_date = match.group(1)
+						if len(new_date) > 14: # only cache asset URLs
+							date_cache[effective_date + '\x00' + orig_url.decode('ascii', 'ignore')] = new_date.decode('ascii', 'ignore')
+						return orig_url
+					data = re.sub(b'(?:(?:http(?:s)?:)?//web.archive.org)?/web/([^/]+)/([^"\'#<>]+)', add_to_date_cache, data)
 			elif mode == 1: # oocities
 				# viewport/cache-control/max-width code (header)
 				data = re.sub(b'^(?:.*?)\n\n', b'', data, flags=re.S)