# Playwright Web Scraper Skill
Extract structured data from multiple web pages using Playwright with built-in ethical crawling practices including rate limiting, robots.txt compliance, and error monitoring. Use when asked to "scrape data from", "extract information from pages", "collect data from site", "crawl multiple pages", or when gathering structured data from websites. Supports pagination, multi-page extraction, data aggregation, and export to CSV/JSON/Markdown. Works with browser_navigate, browser_evaluate, browser_wait_for, and browser_snapshot tools.
npx skill4agent add dawiddutoit/custom-claude playwright-web-scraper

Related skills: playwright-e2e-testing, playwright-console-monitor, playwright-network-analyzer

// 1. Validate URLs
python scripts/validate_urls.py urls.txt
// 2. Scrape pages with rate limiting
// Visit every URL in sequence, pausing politely between pages, and
// accumulate the extracted product records into one flat array.
const results = [];
for (const url of urls) {
  await browser_navigate({ url });

  // Sleep a randomized 1-3 seconds so requests are not back-to-back.
  const pauseSeconds = 1 + Math.random() * 2;
  await browser_wait_for({ time: pauseSeconds });

  // Pull structured records straight out of the live DOM.
  const pageItems = await browser_evaluate({
    function: `
Array.from(document.querySelectorAll('.product')).map(el => ({
title: el.querySelector('.title')?.textContent?.trim(),
price: el.querySelector('.price')?.textContent?.trim(),
url: el.querySelector('a')?.getAttribute('href')
}))
`
  });
  results.push(...pageItems);
}
// 3. Process results
python scripts/process_results.py scraped.json -o products.csv

Example urls.txt:
https://example.com/products?page=1
https://example.com/products?page=2
https://example.com/products?page=3

python scripts/validate_urls.py urls.txt --user-agent "MyBot/1.0"

await browser_navigate({ url: firstUrl });
await browser_snapshot();

const results = [];
for (const url of urlList) {
// Navigate to page
await browser_navigate({ url });
// Wait for content to load
await browser_wait_for({ text: 'Expected content marker' });
// Add respectful delay (1-3 seconds)
const delay = Math.random() * 2 + 1;
await browser_wait_for({ time: delay });
// Extract data
const pageData = await browser_evaluate({
function: `/* extraction code */`
});
results.push(...pageData);
// Check console for errors/warnings
const console = await browser_console_messages();
// Monitor for rate limit warnings
}

Extraction is done with browser_evaluate:

const data = await browser_evaluate({
// Defensive in-page extraction: runs inside the browser, returns [] on
// failure instead of throwing, and drops records missing title or price.
function: `
try {
return Array.from(document.querySelectorAll('.item')).map(el => ({
title: el.querySelector('.title')?.textContent?.trim(),
price: el.querySelector('.price')?.textContent?.trim(),
rating: el.querySelector('.rating')?.textContent?.trim(),
url: el.querySelector('a')?.getAttribute('href')
})).filter(item => item.title && item.price); // Filter incomplete records
} catch (e) {
console.error('Extraction failed:', e);
return [];
}
`
});

See references/extraction-patterns.md for more patterns.

// Check HTTP responses via browser_network_requests
const requests = await browser_network_requests();
const rateLimited = requests.some(r => r.status === 429 || r.status === 503);
if (rateLimited) {
// Back off exponentially
await browser_wait_for({ time: 10 }); // Wait 10 seconds
// Retry or skip
}
// Check console for blocking messages
const console = await browser_console_messages({ pattern: 'rate limit|blocked|captcha' });
if (console.length > 0) {
// Handle blocking
}

// In your scraping script
fs.writeFileSync('scraped.json', JSON.stringify({ results }, null, 2));

# View statistics
python scripts/process_results.py scraped.json --stats
# Convert to CSV
python scripts/process_results.py scraped.json -o output.csv
# Convert to Markdown table
python scripts/process_results.py scraped.json -o output.md

// Random delay between 1-3 seconds
const randomDelay = () => Math.random() * 2 + 1;
await browser_wait_for({ time: randomDelay() });
// Exponential backoff after rate limit
let backoffSeconds = 5;
for (let retry = 0; retry < 3; retry++) {
try {
await browser_navigate({ url });
break; // Success
} catch (e) {
await browser_wait_for({ time: backoffSeconds });
backoffSeconds *= 2; // Double delay each retry
}
}

| Response Code | Action |
|---|---|
| 200 OK | Continue with normal delay (1-3s) |
| 429 Too Many Requests | Increase delay to 10s, retry |
| 503 Service Unavailable | Wait 60s, then retry |
| 403 Forbidden | Stop scraping this domain |

See references/ethical-scraping.md for details.

## validate_urls.py

# Basic validation
python scripts/validate_urls.py urls.txt
# Check robots.txt with specific user agent
python scripts/validate_urls.py urls.txt --user-agent "MyBot/1.0"
# Strict mode (exit on any invalid/disallowed URL)
python scripts/validate_urls.py urls.txt --strict

// Single page extraction
// Single-page extraction: returns one plain object per matched element.
// NOTE(review): '.item'/'.selector1'/'.selector2' are placeholder
// selectors — substitute the target site's actual structure.
const data = await browser_evaluate({
function: `
Array.from(document.querySelectorAll('.item')).map(el => ({
field1: el.querySelector('.selector1')?.textContent?.trim(),
field2: el.querySelector('.selector2')?.getAttribute('href')
}))
`
});

let hasMore = true;
let page = 1;
while (hasMore) {
await browser_navigate({ url: `${baseUrl}?page=${page}` });
await browser_wait_for({ time: randomDelay() });
const pageData = await browser_evaluate({ function: extractionCode });
results.push(...pageData);
// Check for next page
hasMore = await browser_evaluate({
function: `document.querySelector('.next:not(.disabled)') !== null`
});
page++;
}

See references/extraction-patterns.md for more extraction patterns.

try {
// Attempt navigation; a failure is recorded rather than aborting the run.
await browser_navigate({ url });
} catch (e) {
// Log and remember the failing URL for later review/retry.
console.error(`Failed to load ${url}:`, e);
failedUrls.push(url);
// NOTE(review): `continue` assumes this try/catch sits inside the URL loop.
continue; // Skip to next URL
}

const data = await browser_evaluate({ function: extractionCode });
// Guard against empty extraction results before validating.
if (!data || data.length === 0) {
console.warn(`No data extracted from ${url}`);
// Log for manual review
}
// Validate data structure
// NOTE(review): this runs even when data is empty — filtering [] is safe,
// but the empty branch above could `continue` instead to skip the page.
const validData = data.filter(item =>
item.title && item.price // Ensure required fields exist
);

// Monitor console
// Monitor console output for scraper-relevant warnings.
// Renamed from `console`: declaring `const console = ...` shadows the
// global console object, so the console.log call below would throw a
// TypeError (the returned message array has no .log method).
const consoleWarnings = await browser_console_messages({
  pattern: 'error|rate|limit|captcha',
  onlyErrors: true
});
if (consoleWarnings.length > 0) {
  console.log('Warnings detected:', consoleWarnings);
}
// Monitor network
const requests = await browser_network_requests();
const errors = requests.filter(r => r.status >= 400);

python scripts/process_results.py scraped.json --stats

📊 Statistics:
  Total records: 150
  Fields (5): title, price, rating, url, image
  Sample record: {...}

# To CSV
python scripts/process_results.py scraped.json -o products.csv
# To JSON (compact)
python scripts/process_results.py scraped.json -o products.json --compact
# To Markdown table
python scripts/process_results.py scraped.json -o products.md

python scripts/process_results.py scraped.json -o products.csv --stats

Bundled files: scripts/validate_urls.py, scripts/process_results.py,
references/ethical-scraping.md, references/extraction-patterns.md

✅ Validated 50 URLs
✅ Scraped 50 pages in 5 minutes (6 req/min)
✅ Extracted 1,250 products
✅ Zero rate limit errors
✅ Exported to products.csv (1,250 rows)

⚠️ Validated 50 URLs (2 disallowed by robots.txt)
✅ Scraped 48 pages
⚠️ 3 pages returned no data (logged for review)
✅ Extracted 1,100 products
⚠️ 1 rate limit warning (backed off successfully)
✅ Exported to products.csv (1,100 rows)

❌ Rate limited after 20 pages (429 responses)
✅ Backed off exponentially (5s → 10s → 20s)
✅ Resumed scraping successfully
✅ Extracted 450 products from 25 pages

| Metric | Before | After |
|---|---|---|
| Setup time | 30-45 min | 5-10 min |
| Rate limit errors | Common | Rare |
| robots.txt violations | Possible | Prevented |
| Data format conversion | Manual | Automated |
| Error detection | Manual review | Automated monitoring |
validate_urls.py