Explorar o código

Dockerize and bug fixes

Created Dockerfile and startup.sh to enable building in a docker container.

Small bug fixes to handle dynamically building the config.py and handle some urlopen errors more gracefully.

Switched to using datetime.datetime.strptime to parse wayback machine dates
Jon Marler hai 3 anos
pai
achega
02bd6983b1
Modificáronse 5 ficheiros con 94 adicións e 58 borrados
  1. 3 0
      .gitignore
  2. 30 0
      Dockerfile
  3. 33 0
      README.md
  4. 16 0
      startup.sh
  5. 12 58
      waybackproxy.py

+ 3 - 0
.gitignore

@@ -55,3 +55,6 @@ docs/_build/
 
 # PyBuilder
 target/
+
+# Temp files
+*.swp

+ 30 - 0
Dockerfile

@@ -0,0 +1,30 @@
+# Dockerfile
+#
+# Project: WaybackProxy
+# License: GNU GPLv3
+#
+
+FROM python:3
+
+MAINTAINER richardg867
+LABEL description = "HTTP Proxy for tunneling requests through the Internet Archive Wayback Machine"
+
+# Setup config.py
+ENV LISTEN_PORT=8888
+ENV DATE='20011025'
+ENV DATE_TOLERANCE=365
+ENV GEOCITIES_FIX=True
+ENV QUICK_IMAGES=True
+ENV WAYBACK_API=True
+ENV CONTENT_TYPE_ENCODING=True
+ENV SILENT=False
+ENV SETTINGS_PAGE=True
+
+ADD startup.sh /
+ADD lrudict.py /
+ADD waybackproxy.py /
+
+EXPOSE 8080
+
+CMD [ "sh" , "/startup.sh" ]
+

+ 33 - 0
README.md

@@ -28,6 +28,39 @@ WaybackProxy is a retro-friendly HTTP proxy which retrieves pages from the [Inte
 * WaybackProxy is not a generic proxy. The POST and CONNECT methods are not implemented.
 * Transparent proxying mode requires HTTP/1.1 and therefore cannot be used with some really old (pre-1996) browsers. Use standard mode with such browsers.
 
+## Docker Container
+
+A Dockerfile is included that allows you to run WaybackProxy from a docker container. 
+
+### Environment Variables
+
+When deploying via Docker, the config.py script can be customized by specifying environment variables when creating the docker container. The environment variables match the example config.py script in this repository. Below is a complete list:
+
+| Parameter        | Default | Description                            |
+|------------------|---------|----------------------------------------|
+| `LISTEN_PORT` | 8888 | Listen port for the HTTP proxy |
+| `DATE` | 20011025 | Date to get pages from Wayback. YYYYMMDD, YYYYMM and YYYY formats are accepted, the more specific the better.|
+| `DATE_TOLERANCE` | 365 | Allow the client to load pages and assets up to X days after DATE. Set to None to disable this restriction.|
+| `GEOCITIES_FIX` | True | Send Geocities requests to oocities.org if set to True. |
+| `QUICK_IMAGES` | True | Use the original Wayback Machine URL as a shortcut when loading images. |
+| `WAYBACK_API` | True | Use the Wayback Machine Availability API to find the closest available snapshot to the desired date, instead of directly requesting that date.|
+| `CONTENT_TYPE_ENCODING` | True | Allow the Content-Type header to contain an encoding |
+| `SILENT` | False | Disables logging to STDOUT if set to True |
+| `SETTINGS_PAGE` | True | Enables the settings page on http://web.archive.org if set to True |
+
+### Example docker commands
+
+To build:
+
+```bash
+docker build --no-cache -t waybackproxy .
+```
+To run:
+
+```bash
+docker run --rm -it -e DATE=20011225 -p 8888:8888 waybackproxy
+```
+
 ## Other links
 
 * [Donate to the Internet Archive](https://archive.org/donate/), they need your help to keep the Wayback Machine and its petabytes upon petabytes of data available to everyone for free with no ads.

+ 16 - 0
startup.sh

@@ -0,0 +1,16 @@
+#!/bin/sh
+
+echo LISTEN_PORT=$LISTEN_PORT > /config.py
+echo DATE=$DATE >> /config.py
+echo DATE_TOLERANCE=$DATE_TOLERANCE >> /config.py
+echo GEOCITIES_FIX=$GEOCITIES_FIX  >> /config.py
+echo QUICK_IMAGES=$QUICK_IMAGES  >> /config.py
+echo WAYBACK_API=$WAYBACK_API  >> /config.py
+echo CONTENT_TYPE_ENCODING=$CONTENT_TYPE_ENCODING  >> /config.py
+echo SILENT=$SILENT  >> /config.py
+echo SETTINGS_PAGE=$SETTINGS_PAGE  >> /config.py
+
+echo config.py:
+cat /config.py
+
+python /waybackproxy.py

+ 12 - 58
waybackproxy.py

@@ -74,7 +74,7 @@ class Handler(socketserver.BaseRequestHandler):
 
 		# get cached date for redirects, if available
 		original_date = effective_date
-		effective_date = date_cache.get(effective_date + '\x00' + archived_url, effective_date)
+		effective_date = date_cache.get(str(effective_date) + '\x00' + archived_url, str(effective_date))
 
 		# get date from username:password, if available
 		if auth:
@@ -196,6 +196,10 @@ class Handler(socketserver.BaseRequestHandler):
 				conn = e
 			else:
 				return self.error_page(http_version, e.code, e.reason)
+		except socket.timeout as e:
+			_print('Timeout')
+		except:
+			_print('Generic exception')
 		
 		# get content type
 		content_type = conn.info().get('Content-Type')
@@ -226,7 +230,7 @@ class Handler(socketserver.BaseRequestHandler):
 				match = re.search('''//web\.archive\.org/web/([0-9]+)''', conn.geturl())
 				if match:
 					requested_date = match.group(1)
-					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(DATE_TOLERANCE):
+					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(int(DATE_TOLERANCE)):
 						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
 						conn.close()
 						return self.error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
@@ -430,7 +434,7 @@ class Handler(socketserver.BaseRequestHandler):
 	def handle_settings(self, query):
 		"""Generate the settings page."""
 	
-		global DATE, GEOCITIES_FIX, QUICK_IMAGES, CONTENT_TYPE_ENCODING
+		global DATE, DATE_TOLERANCE, GEOCITIES_FIX, QUICK_IMAGES, WAYBACK_API, CONTENT_TYPE_ENCODING, SILENT, SETTINGS_PAGE
 		
 		if query != '': # handle any parameters that may have been sent
 			parsed = urllib.parse.parse_qs(query)
@@ -451,9 +455,9 @@ class Handler(socketserver.BaseRequestHandler):
 		settingspage += self.signature()
 		settingspage += '</b></p><form method="get" action="/">'
 		settingspage += '<p>Date to get pages from: <input type="text" name="date" size="8" value="'
-		settingspage += DATE
+		settingspage += str(DATE)
 		settingspage += '"><p>Date tolerance: <input type="text" name="dateTolerance" size="8" value="'
-		settingspage += DATE_TOLERANCE
+		settingspage += str(DATE_TOLERANCE)
 		settingspage += '"> days<br><input type="checkbox" name="gcFix"'
 		if GEOCITIES_FIX: settingspage += ' checked'
 		settingspage += '> Geocities Fix<br><input type="checkbox" name="quickImages"'
@@ -471,60 +475,10 @@ class Handler(socketserver.BaseRequestHandler):
 	def wayback_to_datetime(self, date):
 		"""Convert a Wayback format date string to a datetime.datetime object."""
 
-		# parse the string
-		year = 1995
-		month = 12
-		day = 31
-		hour = 0
-		minute = 0
-		second = 0
-		if len(date) > 0:
-			year = int(date[:4])
-		if len(date) > 4:
-			month = int(date[4:6])
-		if len(date) > 6:
-			day = int(date[6:8])
-		if len(date) > 8:
-			hour = int(date[8:10])
-		if len(date) > 10:
-			minute = int(date[10:12])
-		if len(date) > 12:
-			second = int(date[12:14])
-
-		# sanitize the numbers
-		if month < 1:
-			month = 1
-		elif month > 12:
-			month = 12
-		if day < 1:
-			day = 1
-		elif day > 31:
-			day = 31
-		if hour > 23:
-			hour = 23
-		elif hour < 0:
-			hour = 0
-		if minute > 59:
-			minute = 59
-		elif minute < 0:
-			minute = 0
-		if second > 59:
-			second = 59
-		elif second < 0:
-			second = 0
-
-		# if the day is invalid for that month, work its way down
 		try:
-			dt = datetime.datetime(year, month, day, hour, minute, second) # max 31
+			dt = datetime.datetime.strptime(str(date), '%Y%m%d%H%M%S')
 		except:
-			try:
-				dt = datetime.datetime(year, month, day - 1, hour, minute, second) # max 30
-			except:
-				try:
-					dt = datetime.datetime(year, month, day - 2, hour, minute, second) # max 29
-				except:
-					dt = datetime.datetime(year, month, day - 3, hour, minute, second) # max 28
-
+			dt = datetime.datetime.strptime(str(date), '%Y%m%d')
 		return dt
 
 print_lock = threading.Lock()
@@ -547,4 +501,4 @@ def main():
 		pass
 
 if __name__ == '__main__':
-	main()
+	main()