From 8c5367887e5ff0be5ea64dda5c8f3c80b9735636 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 30 Jul 2025 13:29:19 +0000 Subject: [PATCH 1/8] Initial plan From 2b3117ccd393cd603b9c93b4232d46cc4edb1919 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 30 Jul 2025 13:39:52 +0000 Subject: [PATCH 2/8] Implement comprehensive link checker with image focus and CI/CD integration Co-authored-by: dannystaple <426859+dannystaple@users.noreply.github.com> --- .github/workflows/nightly_linkcheck.yml | 54 ++++++++ .github/workflows/pr_linkcheck.yml | 175 ++++++++++++++++++++++++ README.md | 16 +++ docker-compose.yml | 19 ++- link_reports/.gitignore | 4 + linkchecker/Dockerfile | 12 ++ linkchecker/README.md | 138 +++++++++++++++++++ linkchecker/filter_csv.py | 78 +++++++++++ linkchecker/linkchecker.conf | 37 +++++ linkchecker/output_template.html | 131 ++++++++++++++++++ linkchecker/run_linkcheck.sh | 54 ++++++++ scripts/local_linkcheck.sh | 45 ++++++ serve.Dockerfile | 3 + 13 files changed, 762 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/nightly_linkcheck.yml create mode 100644 .github/workflows/pr_linkcheck.yml create mode 100644 link_reports/.gitignore create mode 100644 linkchecker/Dockerfile create mode 100644 linkchecker/README.md create mode 100644 linkchecker/filter_csv.py create mode 100644 linkchecker/linkchecker.conf create mode 100644 linkchecker/output_template.html create mode 100755 linkchecker/run_linkcheck.sh create mode 100755 scripts/local_linkcheck.sh diff --git a/.github/workflows/nightly_linkcheck.yml b/.github/workflows/nightly_linkcheck.yml new file mode 100644 index 00000000..a5226ba2 --- /dev/null +++ b/.github/workflows/nightly_linkcheck.yml @@ -0,0 +1,54 @@ +name: Nightly Link Check + +on: + schedule: + # Run every night at 2 AM UTC + - cron: '0 2 * * *' + workflow_dispatch: + # Allow manual trigger + +jobs: + linkcheck: + name: Check Links on Production Site + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run Link Checker on Production Site + run: | + docker run --rm \ + -v ${{ github.workspace }}/linkchecker:/linkchecker \ + -v ${{ github.workspace }}/link_reports:/tmp/reports \ + ubuntu:22.04 bash -c " + apt-get update && apt-get install -y ca-certificates linkchecker python3-pip curl + pip3 install jinja2 + cd /linkchecker + /linkchecker/run_linkcheck.sh https://orionrobots.co.uk /tmp/reports + " + + - name: Upload Link Check Report + uses: actions/upload-artifact@v4 + if: always() + with: + name: nightly-link-check-report-${{ github.run_number }} + path: link_reports/ + retention-days: 30 + + - name: Check for broken links + run: | + if [ -f "linkchecker/output.csv" ]; then + total_lines=$(wc -l < linkchecker/output.csv) + if [ "$total_lines" -gt 1 ]; then + broken_count=$((total_lines - 1)) + echo "โ Found $broken_count broken links" + echo "::warning::Found $broken_count broken links on production site" + # Create issue if many broken links + if [ "$broken_count" -gt 10 ]; then + echo "::error::Too many broken links ($broken_count) found on production site" + fi + else + echo "โ No broken links found!" + fi + fi \ No newline at end of file diff --git a/.github/workflows/pr_linkcheck.yml b/.github/workflows/pr_linkcheck.yml new file mode 100644 index 00000000..bee28a31 --- /dev/null +++ b/.github/workflows/pr_linkcheck.yml @@ -0,0 +1,175 @@ +name: PR Link Check + +on: + pull_request: + types: [labeled, synchronize, reopened] + +jobs: + check-label: + name: Check for link-check label + runs-on: ubuntu-latest + outputs: + should-run: ${{ steps.check.outputs.should-run }} + steps: + - name: Check for link-check label + id: check + run: | + if [[ "${{ contains(github.event.pull_request.labels.*.name, 'link-check') }}" == "true" ]]; then + echo "should-run=true" >> $GITHUB_OUTPUT + else + echo "should-run=false" >> $GITHUB_OUTPUT + fi + + deploy-staging: + name: Deploy Staging for Link Check + runs-on: ubuntu-latest + needs: check-label + if: needs.check-label.outputs.should-run == 'true' + outputs: + deployment-url: ${{ steps.deploy.outputs.deployment-url }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Build site + run: | + npm run dist + npm run 11ty + + - name: Deploy to staging + id: deploy + run: | + # Create a unique staging URL for this PR + STAGING_URL="https://pr-${{ github.event.number }}-orionrobots.surge.sh" + echo "deployment-url=$STAGING_URL" >> $GITHUB_OUTPUT + + # Install surge for deployment + npm install -g surge + + # Deploy to surge.sh with the PR-specific URL + surge _site $STAGING_URL --token ${{ secrets.SURGE_TOKEN }} + + echo "๐ Deployed to: $STAGING_URL" + + linkcheck: + name: Run Link Check on Staging + runs-on: ubuntu-latest + needs: [check-label, deploy-staging] + if: needs.check-label.outputs.should-run == 'true' + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Wait for deployment + run: | + echo "โณ Waiting for staging site to be available..." + STAGING_URL="${{ needs.deploy-staging.outputs.deployment-url }}" + + # Wait up to 5 minutes for the site to be available + timeout 300 bash -c "until curl -s '$STAGING_URL' > /dev/null; do sleep 10; done" || { + echo "โ Staging site not available at $STAGING_URL" + exit 1 + } + + echo "โ Staging site is available" + + - name: Run Link Checker on Staging + run: | + STAGING_URL="${{ needs.deploy-staging.outputs.deployment-url }}" + + docker run --rm \ + -v ${{ github.workspace }}/linkchecker:/linkchecker \ + -v ${{ github.workspace }}/link_reports:/tmp/reports \ + ubuntu:22.04 bash -c " + apt-get update && apt-get install -y ca-certificates linkchecker python3-pip curl + pip3 install jinja2 + cd /linkchecker + /linkchecker/run_linkcheck.sh '$STAGING_URL' /tmp/reports + " + + - name: Upload Link Check Report + uses: actions/upload-artifact@v4 + if: always() + with: + name: pr-link-check-report-${{ github.event.number }} + path: link_reports/ + retention-days: 14 + + - name: Comment on PR with results + uses: actions/github-script@v7 + if: always() + with: + script: | + const fs = require('fs'); + const path = './linkchecker/output.csv'; + + let message = '## ๐ Link Check Results\n\n'; + + if (fs.existsSync(path)) { + const lines = fs.readFileSync(path, 'utf8').split('\n').filter(line => line.trim()); + if (lines.length > 1) { + const brokenCount = lines.length - 1; // Subtract header + message += `โ **Found ${brokenCount} broken links**\n\n`; + message += `๐ [View detailed report](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})\n\n`; + message += `๐ **Staging URL:** ${{ needs.deploy-staging.outputs.deployment-url }}\n\n`; + + if (brokenCount <= 10) { + message += '### Broken Links:\n'; + const csvContent = fs.readFileSync(path, 'utf8'); + const rows = csvContent.split('\n').slice(1, 11); // Show first 10 + for (const row of rows) { + if (row.trim()) { + const cols = row.split(';'); + if (cols.length >= 3) { + message += `- **${cols[1]}** in ${cols[0]} - ${cols[2]}\n`; + } + } + } + if (brokenCount > 10) { + message += `\n... and ${brokenCount - 10} more. See full report above.\n`; + } + } + } else { + message += 'โ **No broken links found!**\n\n'; + message += `๐ **Staging URL:** ${{ needs.deploy-staging.outputs.deployment-url }}\n`; + } + } else { + message += 'โ ๏ธ **Link check could not be completed**\n\n'; + message += 'Please check the workflow logs for more information.\n'; + } + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: message + }); + + cleanup: + name: Cleanup Staging Deployment + runs-on: ubuntu-latest + needs: [check-label, deploy-staging, linkcheck] + if: always() && needs.check-label.outputs.should-run == 'true' && needs.deploy-staging.outputs.deployment-url + + steps: + - name: Teardown staging deployment + run: | + # Install surge for teardown + npm install -g surge + + # Teardown the staging deployment + STAGING_URL="${{ needs.deploy-staging.outputs.deployment-url }}" + surge teardown $STAGING_URL --token ${{ secrets.SURGE_TOKEN }} + + echo "๐งน Cleaned up staging deployment: $STAGING_URL" \ No newline at end of file diff --git a/README.md b/README.md index 740505fe..cd07e92a 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,22 @@ docker compose run shell **Note:** `node_modules` are managed inside the container. You do not need to run `npm install` on your host. +### Link Checking + +The project includes integrated link checking to detect broken links, with a focus on images: + +```bash +# Run link checker locally +./scripts/local_linkcheck.sh +``` + +For more details, see [linkchecker/README.md](linkchecker/README.md). + +**GitHub Actions Integration:** +- Nightly automated link checks on production +- PR-based link checks when labeled with `link-check` +- Detailed HTML reports with categorized results + ## Preparing to contribute This project uses the following tools for development: diff --git a/docker-compose.yml b/docker-compose.yml index 7e1e5d1b..6302d68c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -86,14 +86,25 @@ services: - ./htaccess:/usr/local/apache2/htdocs/.htaccess ports: - 8082:80 + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s profiles: - manual broken_links: build: - context: . - dockerfile: serve.Dockerfile - target: broken_link_checker - command: ["http://http_serve"] + context: ./linkchecker + dockerfile: Dockerfile + command: ["/linkchecker/run_linkcheck.sh", "http://http_serve", "/reports"] + volumes: + - ./linkchecker:/linkchecker + - ./link_reports:/reports + depends_on: + http_serve: + condition: service_healthy profiles: - manual diff --git a/link_reports/.gitignore b/link_reports/.gitignore new file mode 100644 index 00000000..9b7ad73e --- /dev/null +++ b/link_reports/.gitignore @@ -0,0 +1,4 @@ +# Link checker reports directory +# This directory contains HTML reports generated by the link checker +* +!.gitignore \ No newline at end of file diff --git a/linkchecker/Dockerfile b/linkchecker/Dockerfile new file mode 100644 index 00000000..00b79503 --- /dev/null +++ b/linkchecker/Dockerfile @@ -0,0 +1,12 @@ +FROM ubuntu:22.04 +RUN apt-get -y update && \ + apt-get install -y ca-certificates linkchecker python3-pip --no-install-recommends \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +RUN pip3 install jinja2 + +WORKDIR /linkchecker +COPY filter_csv.py output_template.html linkchecker.conf ./ + +# Default command to run linkchecker +ENTRYPOINT ["linkchecker", "--config=linkchecker.conf"] diff --git a/linkchecker/README.md b/linkchecker/README.md new file mode 100644 index 00000000..7844f170 --- /dev/null +++ b/linkchecker/README.md @@ -0,0 +1,138 @@ +# OrionRobots Link Checker + +This directory contains the link checking functionality for the OrionRobots website, designed to detect broken links with a focus on image links and internal broken links. + +## ๐ฏ Features + +- **Image-focused checking**: Prioritizes broken image links that affect visual content +- **Categorized results**: Separates internal, external, image, and email links +- **HTML reports**: Generates detailed, styled reports with priority indicators +- **Docker integration**: Runs in isolated containers for consistency +- **CI/CD integration**: Automated nightly checks and PR-based checks + +## ๐ Usage + +### Local Usage + +Run the link checker locally using the provided script: + +```bash +./scripts/local_linkcheck.sh +``` + +This will: +1. Build the site +2. Start a local HTTP server +3. Run the link checker +4. Generate a report in `./link_reports/` +5. Clean up containers + +### Manual Docker Compose + +You can also run individual services manually: + +```bash +# Build and serve the site +docker compose --profile manual up -d http_serve + +# Run link checker +docker compose --profile manual up broken_links + +# View logs +docker compose logs broken_links + +# Cleanup +docker compose down +``` + +### GitHub Actions Integration + +#### Nightly Checks +- Runs every night at 2 AM UTC +- Checks the production site (https://orionrobots.co.uk) +- Creates warnings for broken links +- Uploads detailed reports as artifacts + +#### PR-based Checks +- Triggered when a PR is labeled with `link-check` +- Deploys a staging version of the PR +- Runs link checker on the staging deployment +- Comments results on the PR +- Automatically cleans up staging deployment + +To run link checking on a PR: +1. Add the `link-check` label to the PR +2. The workflow will automatically deploy staging and run checks +3. Results will be commented on the PR + +## ๐ Files + +- `Dockerfile`: Container definition for the link checker +- `linkchecker.conf`: Configuration for linkchecker tool +- `filter_csv.py`: Python script to process and categorize results +- `output_template.html`: HTML template for generating reports +- `run_linkcheck.sh`: Main script that orchestrates the checking process + +## ๐ Report Categories + +The generated reports categorize broken links by priority: + +1. **๐ผ๏ธ Images** (High Priority): Broken image links that affect visual content +2. **๐ Internal Links** (High Priority): Broken internal links under our control +3. **๐ External Links** (Medium Priority): Broken external links (may be temporary) +4. **๐ง Email Links** (Low Priority): Broken email links (complex to validate) + +## โ๏ธ Configuration + +The link checker configuration in `linkchecker.conf` includes: + +- **Recursion**: Checks up to 10 levels deep +- **Output**: CSV format for easy processing +- **Filtering**: Ignores common social media sites that block crawlers +- **Anchor checking**: Validates internal page anchors +- **Warning handling**: Configurable warning levels + +## ๐ง Customization + +To modify the link checking behavior: + +1. **Change checking depth**: Edit `recursionlevel` in `linkchecker.conf` +2. **Add ignored URLs**: Add patterns to the `ignore` section in `linkchecker.conf` +3. **Modify report styling**: Edit `output_template.html` +4. **Change categorization**: Modify `filter_csv.py` + +## ๐ณ Docker Integration + +The link checker integrates with the existing Docker Compose setup: + +- Uses the `http_serve` service as the target +- Depends on health checks to ensure site availability +- Outputs reports to a mounted volume for persistence +- Runs in the `manual` profile to avoid automatic execution + +## ๐ Requirements + +- Docker and Docker Compose +- Python 3 with Jinja2 (handled in container) +- linkchecker tool (handled in container) +- curl for health checks (handled in container) + +## ๐ Troubleshooting + +### Site not available +If you get "Site not available" errors: +1. Ensure the site builds successfully first +2. Check that the HTTP server is running +3. Verify port 8082 is not in use + +### Permission errors +If you get permission errors with volumes: +1. Check Docker permissions +2. Ensure the link_reports directory exists +3. Try running with sudo (not recommended for production) + +### Missing dependencies +If linkchecker fails to run: +1. Check the Dockerfile builds successfully +2. Verify Python dependencies are installed +3. Check linkchecker configuration syntax \ No newline at end of file diff --git a/linkchecker/filter_csv.py b/linkchecker/filter_csv.py new file mode 100644 index 00000000..49bfe1b3 --- /dev/null +++ b/linkchecker/filter_csv.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +import csv +import sys +import os +from urllib.parse import urlparse + +from jinja2 import Environment, FileSystemLoader, select_autoescape + + +def is_image_url(url): + """Check if URL points to an image file""" + image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp'} + parsed = urlparse(url) + path = parsed.path.lower() + return any(path.endswith(ext) for ext in image_extensions) + + +def categorize_link(item): + """Categorize link by type""" + url = item['url'] + if is_image_url(url): + return 'image' + elif url.startswith('mailto:'): + return 'email' + elif url.startswith('http'): + return 'external' + else: + return 'internal' + + +def output_file(items): + env = Environment( + loader=FileSystemLoader('.'), + autoescape=select_autoescape(['html', 'xml']) + ) + template = env.get_template('output_template.html') + + # Categorize items + categorized = {} + for item in items: + category = categorize_link(item) + if category not in categorized: + categorized[category] = [] + categorized[category].append(item) + + print(template.render( + categorized=categorized, + total_count=len(items), + image_count=len(categorized.get('image', [])), + internal_count=len(categorized.get('internal', [])), + external_count=len(categorized.get('external', [])), + email_count=len(categorized.get('email', [])) + )) + + +def main(): + filename = sys.argv[1] if len(sys.argv) > 1 else '/linkchecker/output.csv' + + if not os.path.exists(filename): + print(f"Error: CSV file {filename} not found") + sys.exit(1) + + with open(filename, encoding='utf-8') as csv_file: + data = csv_file.readlines() + reader = csv.DictReader((row for row in data if not row.startswith('#')), delimiter=';') + + # Filter out successful links and redirects + non_200 = (item for item in reader if 'OK' not in item['result']) + non_redirect = (item for item in non_200 if '307' not in item['result'] and '301' not in item['result'] and '302' not in item['result']) + non_ssl = (item for item in non_redirect if 'ssl' not in item['result'].lower()) + + total_list = sorted(list(non_ssl), key=lambda item: (categorize_link(item), item['parentname'])) + + output_file(total_list) + + +if __name__ == '__main__': + main() diff --git a/linkchecker/linkchecker.conf b/linkchecker/linkchecker.conf new file mode 100644 index 00000000..913abafb --- /dev/null +++ b/linkchecker/linkchecker.conf @@ -0,0 +1,37 @@ +[checking] +# Check all links +recursionlevel=10 +# Focus on internal links +allowedschemes=http,https,file +# Check for broken images specifically +checkextern=1 + +[output] +# Output in CSV format for easier processing +log=csv +filename=/linkchecker/output.csv +# Also output to console +verbose=1 +warnings=1 + +[filtering] +# Ignore certain file types that might cause issues +ignorewarnings=url-whitespace,url-content-size-zero,url-content-too-large +# Skip external social media links that often block crawlers +ignore= + url:facebook\.com + url:twitter\.com + url:instagram\.com + url:linkedin\.com + url:youtube\.com + url:tiktok\.com + +[AnchorCheck] +# Check for broken internal anchors +add=1 + +[authentication] +# No authentication required for most checks + +[plugins] +# No additional plugins needed for basic checking \ No newline at end of file diff --git a/linkchecker/output_template.html b/linkchecker/output_template.html new file mode 100644 index 00000000..411466a4 --- /dev/null +++ b/linkchecker/output_template.html @@ -0,0 +1,131 @@ + + +
+Total Broken Links: {{ total_count }}
+Priority: High - These affect visual content
+Parent Page | +Image URL | +Error | +
---|---|---|
{{ item.parentname }} | +{{ item.urlname }} | +{{ item.result }} | +
Priority: High - These are under our control
+Parent Page | +Link URL | +Error | +
---|---|---|
{{ item.parentname }} | +{{ item.urlname }} | +{{ item.result }} | +
Priority: Medium - These may be temporary issues
+Parent Page | +Link URL | +Error | +
---|---|---|
{{ item.parentname }} | +{{ item.urlname }} | +{{ item.result }} | +
Priority: Low - Email validation is complex
+Parent Page | +Email URL | +Error | +
---|---|---|
{{ item.parentname }} | +{{ item.urlname }} | +{{ item.result }} | +
No broken links found. Great job!
+