Web page archiving and retrieval from cached/deleted sources. Use when accessing unavailable pages, preserving web content, creating legal evidence archives, or building redundant archival workflows. Covers Wayback Machine, Archive.today, ArchiveBox, and evidence preservation tools.
```bash
npx skill4agent add jamditis/claude-skills-journalism web-archiving
```

When a page is unavailable, work through the archives in this order (the `get_archived_page` helper near the end of this document implements the same cascade programmatically):

```
┌─────────────────────────────────────────────────────────────────┐
│                    ARCHIVE RETRIEVAL CASCADE                     │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│  1. Wayback Machine (archive.org)                                │
│     └─ 916B+ pages, historical depth, API access                 │
│        ↓ not found                                               │
│  2. Archive.today (archive.is/archive.ph)                        │
│     └─ On-demand snapshots, paywall bypass                       │
│        ↓ not found                                               │
│  3. Google Cache (limited availability)                          │
│     └─ Recent pages, search: cache:url                           │
│        ↓ not found                                               │
│  4. Bing Cache                                                   │
│     └─ Click dropdown arrow in search results                    │
│        ↓ not found                                               │
│  5. Memento Time Travel (aggregator)                             │
│     └─ Searches multiple archives simultaneously                 │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘
```
Check the Wayback Machine first. Its availability API returns the snapshot closest to an optional timestamp:

```python
import requests
from typing import Optional


def check_wayback_availability(url: str) -> Optional[dict]:
    """Check if a URL exists in the Wayback Machine."""
    api_url = f"https://archive.org/wayback/available?url={url}"
    try:
        response = requests.get(api_url, timeout=10)
        data = response.json()
        if data.get('archived_snapshots', {}).get('closest'):
            snapshot = data['archived_snapshots']['closest']
            return {
                'available': snapshot.get('available', False),
                'url': snapshot.get('url'),
                'timestamp': snapshot.get('timestamp'),
                'status': snapshot.get('status')
            }
        return None
    except Exception:
        return None


def get_wayback_url(url: str, timestamp: Optional[str] = None) -> str:
    """Generate a Wayback Machine URL for a page.

    Args:
        url: Original URL to retrieve
        timestamp: Optional YYYYMMDDHHMMSS format, or None for latest
    """
    if timestamp:
        return f"https://web.archive.org/web/{timestamp}/{url}"
    return f"https://web.archive.org/web/{url}"
```
return f"https://web.archive.org/web/{url}"def save_to_wayback(url: str) -> Optional[str]:
"""Request Wayback Machine to archive a URL.
Returns the archived URL if successful.
"""
save_url = f"https://web.archive.org/save/{url}"
headers = {
'User-Agent': 'Mozilla/5.0 (research-archiver)'
}
try:
response = requests.get(save_url, headers=headers, timeout=60)
# Check for successful archive
if response.status_code == 200:
# The archived URL is in the Content-Location header
archived_url = response.headers.get('Content-Location')
if archived_url:
return f"https://web.archive.org{archived_url}"
return response.url
return None
except Exception:
To see every capture of a URL rather than just the closest one, query the CDX API:

```python
def get_all_snapshots(url: str, limit: int = 100) -> list[dict]:
    """Get all archived snapshots of a URL using the CDX API.

    Returns a list of snapshots with timestamps and status codes.
    """
    cdx_url = "https://web.archive.org/cdx/search/cdx"
    params = {
        'url': url,
        'output': 'json',
        'limit': limit,
        'fl': 'timestamp,original,statuscode,digest,length'
    }
    try:
        response = requests.get(cdx_url, params=params, timeout=30)
        data = response.json()
        if len(data) < 2:  # First row is the header row
            return []
        headers = data[0]
        snapshots = []
        for row in data[1:]:
            snapshot = dict(zip(headers, row))
            snapshot['wayback_url'] = (
                f"https://web.archive.org/web/{snapshot['timestamp']}/{snapshot['original']}"
            )
            snapshots.append(snapshot)
        return snapshots
    except Exception:
        return []
```
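A sketch of printing a capture history with the function above (URL is a placeholder):

```python
# Sketch: list the last 10 captures of a (placeholder) URL.
for snap in get_all_snapshots("https://example.com/article", limit=10):
    print(snap['timestamp'], snap['statuscode'], snap['wayback_url'])
```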
Archive.today captures pages on demand and keeps snapshots behind many paywalls. It rate-limits and may require a CAPTCHA, so automation is best-effort:

```python
import requests
from urllib.parse import quote
from typing import Optional


def save_to_archive_today(url: str) -> Optional[str]:
    """Submit a URL to Archive.today for archiving.

    Note: Archive.today has rate limiting and CAPTCHA requirements.
    This function works for basic archiving but may require
    manual intervention for high-volume use.
    """
    submit_url = "https://archive.today/submit/"
    data = {
        'url': url,
        'anyway': '1'  # Archive even if a recent snapshot exists
    }
    try:
        response = requests.post(submit_url, data=data, timeout=60)
        # Archive.today returns the archived URL in the response
        if response.status_code == 200:
            return response.url
        return None
    except Exception:
        return None


def search_archive_today(url: str) -> Optional[str]:
    """Search for an existing Archive.today snapshot."""
    search_url = f"https://archive.today/{quote(url, safe='')}"
    try:
        response = requests.get(search_url, timeout=30, allow_redirects=True)
        if response.status_code == 200 and 'archive.today' in response.url:
            return response.url
        return None
    except Exception:
        return None
```
For anything that matters, archive to multiple services at once so no single provider is a point of failure:

```python
from dataclasses import dataclass
from typing import Optional, List
from concurrent.futures import ThreadPoolExecutor, as_completed


@dataclass
class ArchiveResult:
    service: str
    url: str
    archived_url: Optional[str]
    success: bool
    error: Optional[str] = None


class MultiArchiver:
    """Archive URLs to multiple services for redundancy."""

    def __init__(self):
        self.services = [
            ('wayback', self._save_wayback),
            ('archive_today', self._save_archive_today),
            ('perma_cc', self._save_perma),  # Requires an API key
        ]

    def archive_url(self, url: str, parallel: bool = True) -> List[ArchiveResult]:
        """Archive a URL to all services.

        Args:
            url: URL to archive
            parallel: If True, archive to all services simultaneously
        """
        results = []
        if parallel:
            with ThreadPoolExecutor(max_workers=3) as executor:
                futures = {
                    executor.submit(save_func, url): name
                    for name, save_func in self.services
                }
                for future in as_completed(futures):
                    service = futures[future]
                    try:
                        archived_url = future.result()
                        results.append(ArchiveResult(
                            service=service,
                            url=url,
                            archived_url=archived_url,
                            success=archived_url is not None
                        ))
                    except Exception as e:
                        results.append(ArchiveResult(
                            service=service,
                            url=url,
                            archived_url=None,
                            success=False,
                            error=str(e)
                        ))
        else:
            for name, save_func in self.services:
                try:
                    archived_url = save_func(url)
                    results.append(ArchiveResult(
                        service=name,
                        url=url,
                        archived_url=archived_url,
                        success=archived_url is not None
                    ))
                except Exception as e:
                    results.append(ArchiveResult(
                        service=name,
                        url=url,
                        archived_url=None,
                        success=False,
                        error=str(e)
                    ))
        return results

    def _save_wayback(self, url: str) -> Optional[str]:
        return save_to_wayback(url)

    def _save_archive_today(self, url: str) -> Optional[str]:
        return save_to_archive_today(url)

    def _save_perma(self, url: str) -> Optional[str]:
        # Requires a Perma.cc API key; see the PermaCC client below
        return None
```
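A usage sketch of the class above (placeholder URL):

```python
# Sketch: archive one URL everywhere and report what stuck.
archiver = MultiArchiver()
for result in archiver.archive_url("https://example.com/article"):
    status = result.archived_url if result.success else f"failed ({result.error})"
    print(f"{result.service}: {status}")
```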
ArchiveBox gives you a self-hosted archive you control:

```bash
# Install ArchiveBox
pip install archivebox

# Or with Docker
docker pull archivebox/archivebox

# Initialize an archive directory
mkdir ~/web-archives && cd ~/web-archives
archivebox init

# Add URLs to archive
archivebox add "https://example.com/article"

# Add multiple URLs from a file
archivebox add --depth=0 < urls.txt

# Schedule regular archiving
archivebox schedule --every=day --depth=1 "https://example.com/feed.rss"
```
The same operations can be driven from Python by shelling out to the CLI:

```python
import subprocess
from pathlib import Path
from typing import List


class ArchiveBoxManager:
    """Manage a local ArchiveBox instance."""

    def __init__(self, archive_dir: Path):
        self.archive_dir = archive_dir
        self._ensure_initialized()

    def _ensure_initialized(self):
        """Initialize ArchiveBox if needed."""
        if not (self.archive_dir / 'index.sqlite3').exists():
            subprocess.run(
                ['archivebox', 'init'],
                cwd=self.archive_dir,
                check=True
            )

    def add_url(self, url: str, depth: int = 0) -> bool:
        """Archive a single URL.

        Args:
            url: URL to archive
            depth: 0 for a single page, 1 to follow links one level deep
        """
        result = subprocess.run(
            ['archivebox', 'add', f'--depth={depth}', url],
            cwd=self.archive_dir,
            capture_output=True,
            text=True
        )
        return result.returncode == 0

    def add_urls_from_file(self, filepath: Path) -> bool:
        """Archive URLs from a text file (one per line)."""
        with open(filepath) as f:
            result = subprocess.run(
                ['archivebox', 'add', '--depth=0'],
                cwd=self.archive_dir,
                stdin=f,
                capture_output=True
            )
        return result.returncode == 0

    def search(self, query: str) -> List[dict]:
        """Search archived content."""
        result = subprocess.run(
            ['archivebox', 'list', '--filter-type=search', query],
            cwd=self.archive_dir,
            capture_output=True,
            text=True
        )
        # Parse output...
        return []
```
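A usage sketch, assuming ArchiveBox is installed and `~/web-archives` is your archive directory:

```python
# Sketch: archive a reading list into a local ArchiveBox instance.
from pathlib import Path

manager = ArchiveBoxManager(Path.home() / 'web-archives')
manager.add_url("https://example.com/article")
manager.add_urls_from_file(Path('urls.txt'))
```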
For legal or investigative work, a capture needs more than a URL: hash the content, record timestamps and tooling, and keep a chain-of-custody log:

```python
import hashlib
import json
import re
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List

import requests


@dataclass
class EvidenceRecord:
    """Legally defensible evidence record."""

    # Content identification
    original_url: str
    archived_urls: List[str]  # Multiple archive copies
    content_hash_sha256: str

    # Timestamps
    capture_time_utc: str
    first_observed: str

    # Metadata
    page_title: str
    captured_by: str
    capture_method: str
    tool_versions: dict

    # Chain of custody
    custody_log: List[dict]  # Who accessed what, and when

    def add_custody_entry(self, accessor: str, action: str, notes: str = ""):
        """Log access to the evidence."""
        self.custody_log.append({
            'timestamp': datetime.utcnow().isoformat(),
            'accessor': accessor,
            'action': action,
            'notes': notes
        })

    def to_json(self) -> str:
        return json.dumps(asdict(self), indent=2)

    @classmethod
    def from_capture(cls, url: str, content: bytes, captured_by: str):
        """Create an evidence record from captured content."""
        now = datetime.utcnow().isoformat()
        return cls(
            original_url=url,
            archived_urls=[],
            content_hash_sha256=hashlib.sha256(content).hexdigest(),
            capture_time_utc=now,
            first_observed=now,
            page_title="",
            captured_by=captured_by,
            capture_method="automated_capture",
            tool_versions={
                'archiver': '1.0.0',
                'python': '3.11'
            },
            custody_log=[]
        )


def extract_title(content: bytes) -> str:
    """Best-effort <title> extraction from raw HTML."""
    match = re.search(rb'<title[^>]*>(.*?)</title>', content, re.IGNORECASE | re.DOTALL)
    return match.group(1).decode('utf-8', errors='replace').strip() if match else ""


def capture_as_evidence(url: str, captured_by: str) -> EvidenceRecord:
    """Capture a URL with full evidence-chain documentation."""
    # Capture the content
    response = requests.get(url, timeout=60)
    content = response.content

    # Create the evidence record
    record = EvidenceRecord.from_capture(url, content, captured_by)
    record.page_title = extract_title(content)

    # Archive to multiple services
    archiver = MultiArchiver()
    results = archiver.archive_url(url)
    for result in results:
        if result.success:
            record.archived_urls.append(result.archived_url)

    # Log the initial capture
    record.add_custody_entry(
        captured_by,
        'initial_capture',
        f'Captured from {url}, archived to {len(record.archived_urls)} services'
    )
    return record
```
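A sketch of the full evidence workflow (names and filenames are placeholders):

```python
# Sketch: capture evidence, log a later review, persist the record.
record = capture_as_evidence("https://example.com/post", captured_by="jane.doe")
record.add_custody_entry("john.smith", "review", "Verified hash before filing")
with open("evidence.json", "w") as f:
    f.write(record.to_json())
```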
Perma.cc creates archives designed for legal citation; its REST API requires an API key:

```python
import requests
from typing import Optional


class PermaCC:
    """Perma.cc API client for legal-grade archiving.

    Requires an API key from perma.cc (free for limited use).
    Used by US courts and legal professionals.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.perma.cc/v1"
        self.headers = {
            'Authorization': f'ApiKey {api_key}',
            'Content-Type': 'application/json'
        }

    def create_archive(self, url: str, folder_id: Optional[int] = None) -> Optional[dict]:
        """Create a new Perma.cc archive.

        Returns a dict with guid, creation_timestamp, and title.
        """
        data = {'url': url}
        if folder_id:
            data['folder'] = folder_id
        try:
            response = requests.post(
                f"{self.base_url}/archives/",
                json=data,
                headers=self.headers,
                timeout=60
            )
            if response.status_code == 201:
                result = response.json()
                return {
                    'guid': result['guid'],
                    'url': f"https://perma.cc/{result['guid']}",
                    'creation_timestamp': result['creation_timestamp'],
                    'title': result.get('title', '')
                }
            return None
        except Exception:
            return None

    def get_archive(self, guid: str) -> Optional[dict]:
        """Retrieve archive metadata by GUID."""
        try:
            response = requests.get(
                f"{self.base_url}/archives/{guid}/",
                headers=self.headers,
                timeout=30
            )
            return response.json() if response.status_code == 200 else None
        except Exception:
            return None
```
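A usage sketch of the client above; `YOUR_API_KEY` is a placeholder for your own credentials:

```python
# Sketch: create a citable Perma.cc link. YOUR_API_KEY is a placeholder.
perma = PermaCC(api_key="YOUR_API_KEY")
archive = perma.create_archive("https://example.com/ruling")
if archive:
    print(f"Cite as: {archive['url']} (created {archive['creation_timestamp']})")
```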
For one-click archiving from the browser, save these as bookmarklets:

```javascript
// Save to Wayback Machine - add as a bookmark
javascript:(function(){
  var url = location.href;
  window.open('https://web.archive.org/save/' + url, '_blank');
})();

// Save to Archive.today
javascript:(function(){
  var url = location.href;
  window.open('https://archive.today/?run=1&url=' + encodeURIComponent(url), '_blank');
})();

// Check all archives (Memento)
javascript:(function(){
  var url = location.href;
  window.open('http://timetravel.mementoweb.org/list/0/' + url, '_blank');
})();
```
```javascript
// Try multiple archives for dead pages
javascript:(function(){
  var url = location.href;
  var archives = [
    'https://web.archive.org/web/*/' + url,
    'https://archive.today/' + encodeURIComponent(url),
    'https://webcache.googleusercontent.com/search?q=cache:' + url,
    'http://timetravel.mementoweb.org/list/0/' + url
  ];
  archives.forEach(function(a){ window.open(a, '_blank'); });
})();
```
Each service trades off coverage, permanence, and control:

| Service | Best For | API | Deletions | Max Size |
|---|---|---|---|---|
| Wayback Machine | Historical research | Yes (free) | On request | Unlimited |
| Archive.today | Paywall bypass, quick saves | No | Never | 50MB |
| Perma.cc | Legal citations | Yes (free tier) | By creator | Standard pages |
| ArchiveBox | Self-hosted, privacy | Local | Never | Disk space |
| Conifer | Interactive content | Yes | By creator | 5GB free |
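A minimal sketch of routing a request to a service based on the table; the mapping is illustrative, not exhaustive:

```python
# Illustrative routing based on the comparison table above.
SERVICE_FOR_USE_CASE = {
    'historical_research': 'Wayback Machine',
    'paywalled_page': 'Archive.today',
    'legal_citation': 'Perma.cc',
    'private_collection': 'ArchiveBox',
    'interactive_content': 'Conifer',
}

def pick_service(use_case: str) -> str:
    return SERVICE_FOR_USE_CASE.get(use_case, 'Wayback Machine')
```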
Tie the cascade together with explicit failure modes, so callers can tell "not archived" apart from other errors:

```python
import requests
from enum import Enum
from typing import Optional


class ArchiveError(Enum):
    NOT_FOUND = "No archive found"
    RATE_LIMITED = "Rate limited by service"
    BLOCKED = "URL blocked from archiving"
    TIMEOUT = "Request timed out"
    SERVICE_DOWN = "Archive service unavailable"


def get_archived_page(url: str) -> tuple[Optional[str], Optional[ArchiveError]]:
    """Try all archive services with proper error handling."""
    # 1. Try the Wayback Machine first
    try:
        result = check_wayback_availability(url)
        if result and result.get('available'):
            return result['url'], None
    except requests.Timeout:
        pass  # Try the next service
    except Exception:
        pass

    # 2. Try Archive.today
    try:
        result = search_archive_today(url)
        if result:
            return result, None
    except Exception:
        pass

    # 3. Try the Memento aggregator
    try:
        memento_url = f"http://timetravel.mementoweb.org/api/json/0/{url}"
        response = requests.get(memento_url, timeout=30)
        data = response.json()
        if data.get('mementos', {}).get('closest'):
            return data['mementos']['closest']['uri'][0], None
    except Exception:
        pass

    return None, ArchiveError.NOT_FOUND
```
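A usage sketch of the cascade (placeholder URL):

```python
# Sketch: resolve a dead link by falling through the cascade.
archived, error = get_archived_page("https://example.com/deleted-post")
if archived:
    print(f"Recovered: {archived}")
else:
    print(f"Unrecoverable: {error.value}")
```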
Before citing or linking any important page, make sure it is preserved redundantly:

```python
def ensure_archived(url: str) -> bool:
    """Ensure a URL is archived in at least 2 services."""
    archiver = MultiArchiver()
    results = archiver.archive_url(url)
    successful = [r for r in results if r.success]
    return len(successful) >= 2
```
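A sketch of using this as a pre-publication gate (placeholder URL):

```python
# Sketch: warn if redundant archiving did not succeed before citing a source.
if not ensure_archived("https://example.com/source"):
    print("Warning: fewer than 2 archive copies exist; retry or archive manually.")
```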