Просмотр исходного кода

Only check date tolerance on HTML content

Fixes asset loading issues on some sites, caused by the assets being archived at future dates that fall outside of the tolerance.
RichardG867 5 лет назад
Родитель
Commit
b78e717c5a
1 изменённый файл: 10 добавлено и 9 удалено
  1. 10 9
      waybackproxy.py

+ 10 - 9
waybackproxy.py

@@ -115,15 +115,6 @@ class Handler(socketserver.BaseRequestHandler):
 				request_url = 'http://web.archive.org/web/{0}/{1}'.format(effective_date, archived_url)
 
 				conn = urllib.request.urlopen(request_url)
-
-			# check if the date is within tolerance
-			if DATE_TOLERANCE is not None:
-				match = re.search('''//web\.archive\.org/web/([0-9]+)''', conn.geturl())
-				if match:
-					requested_date = match.group(1)
-					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(DATE_TOLERANCE):
-						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
-						raise urllib.error.HTTPError(conn.geturl(), 412, 'Snapshot ' + requested_date + ' not available', conn.info(), conn)
 		except urllib.error.HTTPError as e:
 			# an error has been found
 
@@ -170,6 +161,16 @@ class Handler(socketserver.BaseRequestHandler):
 				_print('[r] [QI]', archived_url)
 				return self.redirect_page(http_version, archived_url, 301)
 
+			# check if the date is within tolerance
+			if DATE_TOLERANCE is not None:
+				match = re.search('''//web\.archive\.org/web/([0-9]+)''', conn.geturl())
+				if match:
+					requested_date = match.group(1)
+					if self.wayback_to_datetime(requested_date) > self.wayback_to_datetime(original_date) + datetime.timedelta(DATE_TOLERANCE):
+						_print('[!]', requested_date, 'is outside the configured tolerance of', DATE_TOLERANCE, 'days')
+						conn.close()
+						return self.error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
+
 			# consume all data
 			data = conn.read()