This directory contains real-world examples demonstrating WhyML’s advanced web scraping capabilities including structure simplification, selective section generation, page analysis, and testing workflows.
# Structure simplification for complex sites
whyml scrape https://complex-blog.com \
--max-depth 3 \
--flatten-containers \
--simplify-structure \
-o simplified-blog.yaml
# Selective monitoring for e-commerce
whyml scrape https://store.example.com/product/123 \
--section analysis \
--section metadata \
-o product-analysis.yaml
# Testing workflow for migration projects
whyml scrape https://legacy-site.com \
--test-conversion \
--output-html modernized-preview.html \
-o migration-manifest.yaml
Examples for modernizing legacy websites and simplifying complex structures:
Monitor competitor websites and track changes:
Extract website structures for mobile and desktop app development:
Test and validate content migration accuracy:
Automated website analysis for QA processes:
Challenge: Convert a complex legacy corporate website to modern standards.
Solution: Use structure simplification to create maintainable manifests.
# Extract simplified structure
whyml scrape https://legacy-corp.com \
--max-depth 2 \
--flatten-containers \
--simplify-structure \
--section structure \
--section metadata \
-o legacy-simplified.yaml
# Test conversion accuracy
whyml scrape https://legacy-corp.com \
--test-conversion \
--max-depth 2 \
--flatten-containers \
--output-html modernized-preview.html \
-o legacy-test.yaml
Benefits:
Challenge: Track competitor product pages for pricing and feature changes.
Solution: Use selective section generation to monitor key metrics.
#!/bin/bash
# Daily competitor monitoring script
COMPETITORS=(
"https://competitor1.com/product/widget"
"https://competitor2.com/product/similar-widget"
"https://competitor3.com/product/alternative"
)
DATE=$(date +%Y%m%d)
for url in "${COMPETITORS[@]}"; do
domain=$(echo "$url" | cut -d'/' -f3)
whyml scrape "$url" \
--section analysis \
--section metadata \
-o "monitoring/${domain}-${DATE}.yaml"
done
# Generate comparison report
python analyze_competitors.py monitoring/
Benefits:
Challenge: Migrate content from legacy CMS to modern platform with accuracy validation.
Solution: Use testing workflow to validate migration quality.
# Test migration accuracy
whyml scrape https://old-cms.com/article/123 \
--test-conversion \
--section structure \
--section imports \
--section metadata \
--output-html migration-preview.html \
-o migration-test.yaml
# Batch test multiple pages
pages=(
"https://old-cms.com/article/123"
"https://old-cms.com/page/about"
"https://old-cms.com/product/widget"
)
for page in "${pages[@]}"; do
id=$(basename "$page")
whyml scrape "$page" \
--test-conversion \
--output-html "previews/migrated-${id}.html" \
-o "manifests/migration-${id}.yaml"
done
Benefits:
#!/bin/bash
# examples/scripts/continuous-monitor.sh
# Configuration
CONFIG_FILE="monitoring-config.yaml"
OUTPUT_DIR="monitoring-data"
NOTIFICATION_URL="https://hooks.slack.com/your-webhook"
# Read URLs from config
urls=$(yq eval '.urls[]' "$CONFIG_FILE")
mkdir -p "$OUTPUT_DIR"
timestamp=$(date +%Y%m%d_%H%M%S)
for url in $urls; do
domain=$(echo "$url" | sed 's|https\?://||' | cut -d'/' -f1)
output_file="${OUTPUT_DIR}/${domain}_${timestamp}.yaml"
echo "Scraping $url..."
if whyml scrape "$url" \
--section analysis \
--section metadata \
-o "$output_file"; then
echo "✅ Successfully scraped $domain"
else
echo "❌ Failed to scrape $domain"
# Send notification
curl -X POST "$NOTIFICATION_URL" \
-H 'Content-type: application/json' \
--data "{\"text\":\"Failed to scrape $domain\"}"
fi
done
echo "Monitoring complete. Data saved to $OUTPUT_DIR"
#!/bin/bash
# examples/scripts/batch-analyze.sh
# Analyze multiple websites for SEO and accessibility
websites_file="websites.txt"
output_dir="analysis-results"
mkdir -p "$output_dir"
while IFS= read -r url; do
if [[ -z "$url" || "$url" == \#* ]]; then
continue # Skip empty lines and comments
fi
domain=$(echo "$url" | sed 's|https\?://||' | sed 's|/.*||')
output_file="${output_dir}/${domain}-analysis.yaml"
echo "Analyzing $url..."
whyml scrape "$url" \
--section analysis \
--section metadata \
--verbose \
-o "$output_file"
# Extract key metrics
echo "SEO Score: $(yq eval '.analysis.seo_analysis | length' "$output_file")"
echo "Accessibility: $(yq eval '.analysis.accessibility.images_with_alt_ratio' "$output_file")"
echo "Page Type: $(yq eval '.analysis.page_type' "$output_file")"
echo "---"
done < "$websites_file"
# Generate summary report
python generate_summary.py "$output_dir"
# Example output: seo-analysis.yaml
metadata:
title: "Best Practices for Modern Web Development"
description: "Learn essential techniques for building fast, accessible websites"
url: "https://blog.example.com/web-dev-practices"
analysis:
page_type: "blog"
seo_analysis:
has_meta_description: true
meta_description_length: 156 # Optimal length
h1_count: 1 # Perfect
h2_count: 4 # Good structure
h3_count: 8 # Detailed sections
title_length: 58 # Good length
social_meta_tags:
og_title: true
og_description: true
og_image: true
twitter_card: true
accessibility:
has_lang_attribute: true
images_with_alt_ratio: 0.95 # Excellent
heading_structure_valid: true
contrast_ratio: "AA" # WCAG compliant
# Example output: performance-analysis.yaml
analysis:
content_stats:
word_count: 2400
reading_time_minutes: 10
paragraph_count: 24
heading_count: 13
link_count: 18
image_count: 8
structure_complexity:
max_nesting_depth: 5
total_elements: 156
div_count: 42
semantic_elements: ["header", "main", "article", "section", "footer"]
simplification_applied: true
complexity_reduction: 65 # 65% reduction in complexity
performance_hints:
- "Consider lazy loading for images"
- "Optimize heading structure (missing h3 after h1)"
- "Add more semantic HTML5 elements"
# monitoring-config.yaml
urls:
- "https://competitor1.com"
- "https://competitor2.com/product/widget"
- "https://reference-site.com/blog"
sections:
- "metadata"
- "analysis"
options:
max_depth: 3
flatten_containers: true
simplify_structure: true
notifications:
slack_webhook: "https://hooks.slack.com/your-webhook"
email_alerts: true
schedule:
frequency: "daily"
time: "06:00"
# analysis-pipeline.yaml
pipeline:
stages:
- name: "scrape"
options:
sections: ["analysis", "metadata"]
max_depth: 2
simplify_structure: true
- name: "analyze"
scripts:
- "scripts/seo-analysis.py"
- "scripts/accessibility-check.py"
- name: "report"
outputs:
- "reports/seo-summary.html"
- "reports/accessibility-report.json"
- name: "notify"
conditions:
- "seo_score < 80"
- "accessibility_score < 0.9"
# .github/workflows/website-monitoring.yml
name: Website Monitoring
on:
schedule:
- cron: '0 6 * * *' # Daily at 6 AM
workflow_dispatch:
jobs:
monitor:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.9'
- name: Install WhyML
run: pip install whyml
- name: Monitor Competitors
run: |
mkdir -p monitoring-results
# Monitor key competitor pages
whyml scrape https://competitor1.com/product \
--section analysis --section metadata \
-o monitoring-results/competitor1-$(date +%Y%m%d).yaml
whyml scrape https://competitor2.com/pricing \
--section analysis --section metadata \
-o monitoring-results/competitor2-$(date +%Y%m%d).yaml
- name: Generate Report
run: python scripts/generate-monitoring-report.py
- name: Upload Results
uses: actions/upload-artifact@v3
with:
name: monitoring-results
path: monitoring-results/
- name: Notify Team
if: failure()
uses: 8398a7/action-slack@v3
with:
status: failure
webhook_url: $
# Dockerfile for monitoring service
FROM python:3.9-slim
RUN pip install whyml requests pyyaml
WORKDIR /app
COPY scripts/ ./scripts/
COPY monitoring-config.yaml ./
CMD ["python", "scripts/continuous-monitor.py"]
# docker-compose.yml
version: '3.8'
services:
website-monitor:
build: .
environment:
- MONITORING_INTERVAL=3600 # 1 hour
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK}
volumes:
- ./monitoring-data:/app/data
- ./config:/app/config
restart: unless-stopped
Structure Simplification:
Selective Section Generation:
Testing Workflow:
Page Analysis: