|
@@ -229,7 +229,8 @@ class Handler(socketserver.BaseRequestHandler):
|
|
|
else:
|
|
else:
|
|
|
mode = 0
|
|
mode = 0
|
|
|
|
|
|
|
|
- # Wayback will add its HTML to anything it thinks is HTML
|
|
|
|
|
|
|
+ # Check content type to determine if this is HTML we need to patch.
|
|
|
|
|
+ # Wayback will add its HTML to anything it thinks is HTML.
|
|
|
guessed_content_type = conn.info().get('X-Archive-Guessed-Content-Type')
|
|
guessed_content_type = conn.info().get('X-Archive-Guessed-Content-Type')
|
|
|
if not guessed_content_type:
|
|
if not guessed_content_type:
|
|
|
guessed_content_type = content_type
|
|
guessed_content_type = content_type
|
|
@@ -244,7 +245,7 @@ class Handler(socketserver.BaseRequestHandler):
|
|
|
_print('[r] [QI]', archived_url)
|
|
_print('[r] [QI]', archived_url)
|
|
|
return self.send_redirect_page(http_version, archived_url, 301)
|
|
return self.send_redirect_page(http_version, archived_url, 301)
|
|
|
|
|
|
|
|
- # check if the date is within tolerance
|
|
|
|
|
|
|
+ # Check if the date is within tolerance.
|
|
|
if DATE_TOLERANCE is not None:
|
|
if DATE_TOLERANCE is not None:
|
|
|
match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl())
|
|
match = re.search('''//web\\.archive\\.org/web/([0-9]+)''', conn.geturl())
|
|
|
if match:
|
|
if match:
|
|
@@ -254,10 +255,10 @@ class Handler(socketserver.BaseRequestHandler):
|
|
|
conn.close()
|
|
conn.close()
|
|
|
return self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
|
|
return self.send_error_page(http_version, 412, 'Snapshot ' + requested_date + ' not available')
|
|
|
|
|
|
|
|
- # consume all data
|
|
|
|
|
|
|
+ # Consume all data.
|
|
|
data = conn.read()
|
|
data = conn.read()
|
|
|
|
|
|
|
|
- # patch the page
|
|
|
|
|
|
|
+ # Patch the page.
|
|
|
if mode == 0: # wayback
|
|
if mode == 0: # wayback
|
|
|
if b'<title>Wayback Machine</title>' in data:
|
|
if b'<title>Wayback Machine</title>' in data:
|
|
|
if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data: # exclusion error (robots.txt?)
|
|
if b'<p>This URL has been excluded from the Wayback Machine.</p>' in data: # exclusion error (robots.txt?)
|
|
@@ -284,15 +285,29 @@ class Handler(socketserver.BaseRequestHandler):
|
|
|
else:
|
|
else:
|
|
|
return self.send_error_page(http_version, e.code, e.reason)
|
|
return self.send_error_page(http_version, e.code, e.reason)
|
|
|
|
|
|
|
|
|
|
+ # Identify content type so we don't modify non-HTML content.
|
|
|
content_type = conn.info().get('Content-Type')
|
|
content_type = conn.info().get('Content-Type')
|
|
|
- if not CONTENT_TYPE_ENCODING and content_type.find(';') > -1:
|
|
|
|
|
- content_type = content_type[:content_type.find(';')]
|
|
|
|
|
- data = conn.read()
|
|
|
|
|
|
|
+ if not CONTENT_TYPE_ENCODING:
|
|
|
|
|
+ idx = content_type.find(';')
|
|
|
|
|
+ if idx > -1:
|
|
|
|
|
+ content_type = content_type[:idx]
|
|
|
|
|
+ if 'text/html' in content_type:
|
|
|
|
|
+ # Consume all data and proceed with patching the page.
|
|
|
|
|
+ data = conn.read()
|
|
|
|
|
+ else:
|
|
|
|
|
+ # Pass non-HTML data through.
|
|
|
|
|
+ self.send_response_headers(conn, http_version, content_type, request_url)
|
|
|
|
|
+ while True:
|
|
|
|
|
+ data = conn.read(1024)
|
|
|
|
|
+ if not data: break
|
|
|
|
|
+ self.request.sendall(data)
|
|
|
|
|
+ self.request.close()
|
|
|
|
|
+ return
|
|
|
|
|
|
|
|
if b'<title></title>' in data and b'<span class="label style-scope media-button"><!---->Wayback Machine<!----></span>' in data:
|
|
if b'<title></title>' in data and b'<span class="label style-scope media-button"><!---->Wayback Machine<!----></span>' in data:
|
|
|
match = re.search(b'''<p class="impatient"><a href="(?:(?:https?:)?//web\\.archive\\.org)?/web/([^/]+)/([^"]+)">Impatient\\?</a></p>''', data)
|
|
match = re.search(b'''<p class="impatient"><a href="(?:(?:https?:)?//web\\.archive\\.org)?/web/([^/]+)/([^"]+)">Impatient\\?</a></p>''', data)
|
|
|
if match:
|
|
if match:
|
|
|
- # wayback redirect page, follow it
|
|
|
|
|
|
|
+ # This is a Wayback redirect page, follow the redirect.
|
|
|
match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
|
|
match2 = re.search(b'<p class="code shift red">Got an HTTP ([0-9]+)', data)
|
|
|
try:
|
|
try:
|
|
|
redirect_code = int(match2.group(1))
|
|
redirect_code = int(match2.group(1))
|
|
@@ -303,18 +318,18 @@ class Handler(socketserver.BaseRequestHandler):
|
|
|
_print('[r]', archived_url)
|
|
_print('[r]', archived_url)
|
|
|
return self.send_redirect_page(http_version, archived_url, redirect_code)
|
|
return self.send_redirect_page(http_version, archived_url, redirect_code)
|
|
|
|
|
|
|
|
- # pre-toolbar scripts and CSS
|
|
|
|
|
|
|
+ # Remove pre-toolbar scripts and CSS.
|
|
|
data = re.sub(b'''<script src="//archive\\.org/.*<!-- End Wayback Rewrite JS Include -->\\r?\\n''', b'', data, flags=re.S)
|
|
data = re.sub(b'''<script src="//archive\\.org/.*<!-- End Wayback Rewrite JS Include -->\\r?\\n''', b'', data, flags=re.S)
|
|
|
- # toolbar
|
|
|
|
|
|
|
+ # Remove toolbar.
|
|
|
data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, flags=re.S)
|
|
data = re.sub(b'''<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->''', b'', data, flags=re.S)
|
|
|
- # comments on footer
|
|
|
|
|
|
|
+ # Remove comments on footer.
|
|
|
data = re.sub(b'''<!--\\r?\\n FILE ARCHIVED .*$''', b'', data, flags=re.S)
|
|
data = re.sub(b'''<!--\\r?\\n FILE ARCHIVED .*$''', b'', data, flags=re.S)
|
|
|
- # fix base tag
|
|
|
|
|
|
|
+ # Fix base tag.
|
|
|
data = re.sub(b'''(<base (?:[^>]*)href=(?:["\'])?)(?:(?:https?:)?//web.archive.org)?/web/(?:[^/]+)/''', b'\\1', data, flags=re.I + re.S)
|
|
data = re.sub(b'''(<base (?:[^>]*)href=(?:["\'])?)(?:(?:https?:)?//web.archive.org)?/web/(?:[^/]+)/''', b'\\1', data, flags=re.I + re.S)
|
|
|
|
|
|
|
|
- # remove extraneous :80 from links
|
|
|
|
|
|
|
+ # Remove extraneous :80 from links.
|
|
|
data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^:]+)://([^:]+):80/', b'\\1\\2/\\3://\\4/', data)
|
|
data = re.sub(b'((?:(?:https?:)?//web.archive.org)?/web/)([^/]+)/([^:]+)://([^:]+):80/', b'\\1\\2/\\3://\\4/', data)
|
|
|
- # fix links
|
|
|
|
|
|
|
+ # Fix links.
|
|
|
if QUICK_IMAGES:
|
|
if QUICK_IMAGES:
|
|
|
# QUICK_IMAGES works by intercepting asset URLs (those
|
|
# QUICK_IMAGES works by intercepting asset URLs (those
|
|
|
# with a date code ending in im_, js_...) and letting the
|
|
# with a date code ending in im_, js_...) and letting the
|
|
@@ -338,23 +353,24 @@ class Handler(socketserver.BaseRequestHandler):
|
|
|
return orig_url
|
|
return orig_url
|
|
|
data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
|
|
data = re.sub(b'''(?:(?:https?:)?//web.archive.org)?/web/([^/]+)/([^"\\'#<>]+)''', add_to_date_cache, data)
|
|
|
elif mode == 1: # oocities
|
|
elif mode == 1: # oocities
|
|
|
- # viewport/cache-control/max-width code (header)
|
|
|
|
|
|
|
+ # Remove viewport/cache-control/max-width code from the header.
|
|
|
data = re.sub(b'''^.*?\n\n''', b'', data, flags=re.S)
|
|
data = re.sub(b'''^.*?\n\n''', b'', data, flags=re.S)
|
|
|
- # archive notice and tracking code (footer)
|
|
|
|
|
|
|
+ # Remove archive notice and tracking code from the footer.
|
|
|
data = re.sub(b'''<style> \n.zoomout { -webkit-transition: .*$''', b'', data, flags=re.S)
|
|
data = re.sub(b'''<style> \n.zoomout { -webkit-transition: .*$''', b'', data, flags=re.S)
|
|
|
- # clearly labeled snippets from Geocities
|
|
|
|
|
|
|
+ # Remove clearly labeled snippets from Geocities.
|
|
|
data = re.sub(b'''^.*<\\!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
|
|
data = re.sub(b'''^.*<\\!-- text above generated by server\\. PLEASE REMOVE -->''', b'', data, flags=re.S)
|
|
|
data = re.sub(b'''<\\!-- following code added by server\\. PLEASE REMOVE -->.*<\!-- preceding code added by server\. PLEASE REMOVE -->''', b'', data, flags=re.S)
|
|
data = re.sub(b'''<\\!-- following code added by server\\. PLEASE REMOVE -->.*<\!-- preceding code added by server\. PLEASE REMOVE -->''', b'', data, flags=re.S)
|
|
|
data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)
|
|
data = re.sub(b'''<\\!-- text below generated by server\\. PLEASE REMOVE -->.*$''', b'', data, flags=re.S)
|
|
|
|
|
|
|
|
- # fix links
|
|
|
|
|
|
|
+ # Fix links.
|
|
|
data = re.sub(b'''//([^.]*)\\.oocities\\.com/''', b'//\\1.geocities.com/', data, flags=re.S)
|
|
data = re.sub(b'''//([^.]*)\\.oocities\\.com/''', b'//\\1.geocities.com/', data, flags=re.S)
|
|
|
|
|
|
|
|
|
|
+ # Send patched page.
|
|
|
self.send_response_headers(conn, http_version, content_type, request_url)
|
|
self.send_response_headers(conn, http_version, content_type, request_url)
|
|
|
self.request.sendall(data)
|
|
self.request.sendall(data)
|
|
|
- else: # other data
|
|
|
|
|
|
|
+ else:
|
|
|
|
|
+ # Pass non-HTML data through.
|
|
|
self.send_response_headers(conn, http_version, content_type, request_url)
|
|
self.send_response_headers(conn, http_version, content_type, request_url)
|
|
|
-
|
|
|
|
|
while True:
|
|
while True:
|
|
|
data = conn.read(1024)
|
|
data = conn.read(1024)
|
|
|
if not data: break
|
|
if not data: break
|