Files
rust_browser/scripts/import_wpt_reftests.py
Zachary D. Rowitsch 16abbd78e7 Bulk-import 2899 WPT CSS reftests and add import tooling
Add scripts/import_wpt_reftests.py to sparse-clone the upstream WPT repo
and bulk-import qualifying CSS reftests (no JS, no external resources) as
known_fail entries. 23 tests already pass and are promoted. The import
script is idempotent and exposed via `just import-wpt`. CI now prints the
WPT summary (pass=36 known_fail=2877 skip=1) on every run.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 00:15:27 -05:00

550 lines
18 KiB
Python

#!/usr/bin/env python3
"""Bulk-import WPT CSS reftests into the project's wpt_manifest.toml.
Three phases:
1. Clone: sparse-checkout the upstream WPT repo (minimal download)
2. Scan: find qualifying reftests (no JS, no external resources)
3. Import: copy fixtures and append manifest entries as known_fail
Usage:
python3 scripts/import_wpt_reftests.py [--scan-only] [--dry-run] [--modules M] [--max-tests N]
"""
import argparse
import os
import re
import shutil
import subprocess
import sys
from html.parser import HTMLParser
from pathlib import Path
# Upstream repository that tests are sparse-cloned from.
WPT_REPO = "https://github.com/web-platform-tests/wpt.git"
# Repo root is two levels up from this script (scripts/ sits at the root).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Sparse checkout of upstream WPT lives here.
UPSTREAM_DIR = PROJECT_ROOT / "tests" / "external" / "wpt" / "upstream"
# Imported test/reference HTML files are copied here.
FIXTURES_DIR = PROJECT_ROOT / "tests" / "external" / "wpt" / "fixtures"
# Manifest that the test harness reads; import appends [[case]] entries to it.
MANIFEST_PATH = PROJECT_ROOT / "tests" / "external" / "wpt" / "wpt_manifest.toml"
# CSS directories to sparse-checkout
# (CLI module key / manifest flag -> directory inside the WPT repo)
MODULE_DIRS = {
    "CSS2-box-display": "css/CSS2/box-display",
    "CSS2-margin-padding-clear": "css/CSS2/margin-padding-clear",
    "CSS2-normal-flow": "css/CSS2/normal-flow",
    "CSS2-positioning": "css/CSS2/positioning",
    "CSS2-floats": "css/CSS2/floats",
    "CSS2-floats-clear": "css/CSS2/floats-clear",
    "css-box": "css/css-box",
    "css-display": "css/css-display",
    "css-backgrounds": "css/css-backgrounds",
    "css-text": "css/css-text",
    "css-flexbox": "css/css-flexbox",
    "css-inline": "css/css-inline",
    "css-tables": "css/css-tables",
    "css-position": "css/css-position",
}
# Tags whose presence disqualifies a test
# (<script> is tracked separately; the rest set has_disqualifying_tag)
DISQUALIFYING_TAGS = frozenset([
    "script", "img", "video", "canvas", "iframe", "object", "embed",
    "svg", "audio", "source", "picture", "math",
])
# CSS patterns that hint at unsupported features (for reason/flags, not disqualifying)
# Each entry is (regex searched case-insensitively, flag name recorded in the manifest).
UNSUPPORTED_CSS_PATTERNS = [
    (r"::before|::after", "pseudo-elements"),
    (r"::first-line|::first-letter", "pseudo-elements"),
    (r":hover|:focus|:active|:visited", "dynamic-pseudo-classes"),
    (r"@media", "media-queries"),
    (r"@keyframes|animation", "animations"),
    (r"@font-face", "font-face"),
    (r"transition\s*:", "transitions"),
    (r"transform\s*:", "transforms"),
    (r"text-decoration", "text-decoration"),
    (r"text-shadow", "text-shadow"),
    (r"box-shadow", "box-shadow"),
    (r"opacity\s*:", "opacity"),
    (r"counter-reset|counter-increment|content\s*:", "generated-content"),
    (r"writing-mode|direction\s*:", "writing-modes"),
    (r"column-count|column-width|columns\s*:", "multi-column"),
    (r"grid-template|grid-area|display\s*:\s*grid", "grid"),
    (r"filter\s*:", "filters"),
    (r"clip-path", "clip-path"),
    (r"outline\s*:", "outline"),
    (r"list-style", "list-style"),
    (r"word-spacing|letter-spacing", "text-spacing"),
    (r"white-space\s*:", "white-space"),
    (r"text-transform", "text-transform"),
    (r"vertical-align", "vertical-align"),
    (r"line-height", "line-height"),
    (r"text-indent", "text-indent"),
    (r"visibility\s*:", "visibility"),
    (r"cursor\s*:", "cursor"),
    (r"table-layout", "table-layout"),
    (r"border-collapse|border-spacing", "table-borders"),
    (r"min-width|max-width|min-height|max-height", "min-max-sizing"),
    (r"overflow\s*:", "overflow"),
    (r"z-index", "z-index"),
]
class ReftestScanner(HTMLParser):
    """Extract metadata from a WPT reftest HTML file.

    After feed(), exposes:
      match_refs            -- href values from <link rel="match">
      has_mismatch          -- saw <link rel="mismatch"> (unsupported)
      has_script            -- saw a <script> tag
      has_external_css      -- saw <link rel="stylesheet">
      has_disqualifying_tag -- saw a non-script tag from DISQUALIFYING_TAGS
      title                 -- concatenated <title> text
    """

    def __init__(self):
        super().__init__()
        self.match_refs = []  # href values from <link rel="match">
        self.has_mismatch = False
        self.has_script = False
        self.has_external_css = False
        self.has_disqualifying_tag = False
        self.style_content = []  # raw text chunks seen inside <style>
        self._in_style = False
        self.title = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        tag_lower = tag.lower()
        if tag_lower == "link":
            # html.parser reports valueless attributes (e.g. <link rel>)
            # with a None value; coerce to "" so .lower()/.startswith
            # cannot raise AttributeError on malformed test files.
            rel = (attrs_dict.get("rel") or "").lower()
            href = attrs_dict.get("href") or ""
            if rel == "match":
                self.match_refs.append(href)
            elif rel == "mismatch":
                self.has_mismatch = True
            elif rel == "stylesheet":
                self.has_external_css = True
        if tag_lower in DISQUALIFYING_TAGS:
            if tag_lower == "script":
                self.has_script = True
            else:
                self.has_disqualifying_tag = True
        if tag_lower == "style":
            self._in_style = True
        if tag_lower == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag.lower() == "style":
            self._in_style = False
        if tag.lower() == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_style:
            self.style_content.append(data)
        if self._in_title:
            self.title += data

    def get_css(self):
        """Return all inline <style> text joined with newlines."""
        return "\n".join(self.style_content)
def check_css_imports(css_text):
    """Return True if CSS has disqualifying @import or url() references.

    data: URLs are allowed (no external fetch needed); any other non-empty
    url() target, or any @import rule, disqualifies the stylesheet.
    """
    if re.search(r"@import\b", css_text) is not None:
        return True
    targets = (
        m.group(1).strip().strip("'\"")
        for m in re.finditer(r"url\s*\(([^)]*)\)", css_text)
    )
    return any(t and not t.startswith("data:") for t in targets)
def detect_css_features(css_text):
    """Return a sorted list of CSS feature flags detected in *css_text*.

    Each pattern in UNSUPPORTED_CSS_PATTERNS is searched case-insensitively;
    duplicate flags collapse via the set comprehension.
    """
    hits = {
        flag
        for pattern, flag in UNSUPPORTED_CSS_PATTERNS
        if re.search(pattern, css_text, re.IGNORECASE)
    }
    return sorted(hits)
def scan_html_file(filepath):
    """Scan an HTML file and return (scanner, error_or_None).

    On success the second element is None; on failure the first element is
    None and the second is a human-readable error string.
    """
    try:
        text = filepath.read_text(encoding="utf-8", errors="replace")
    except (OSError, UnicodeDecodeError) as e:
        return None, str(e)

    parser = ReftestScanner()
    try:
        parser.feed(text)
    except Exception as e:
        # html.parser can raise on pathological input; report, don't crash.
        return None, f"parse error: {e}"
    return parser, None
def qualifies(filepath, upstream_root):
    """Check if a file qualifies as an importable reftest.

    Returns (ok, ref_path_or_None, disqualify_reason). The test and its
    first match reference must both be self-contained: no scripts, no
    external stylesheets, no disqualifying tags, no external CSS resources.
    """
    test_scan, test_err = scan_html_file(filepath)
    if test_err:
        return False, None, f"cannot read: {test_err}"

    # Vet the test document itself.
    if not test_scan.match_refs:
        return False, None, "no <link rel='match'>"
    if test_scan.has_mismatch:
        return False, None, "has mismatch ref (unsupported)"
    if test_scan.has_script:
        return False, None, "contains <script>"
    if test_scan.has_external_css:
        return False, None, "has external stylesheet"
    if test_scan.has_disqualifying_tag:
        return False, None, "has disqualifying HTML tag"
    if check_css_imports(test_scan.get_css()):
        return False, None, "CSS has @import or external url()"

    # Resolve the first match reference: absolute WPT paths are rooted at
    # the upstream checkout, relative ones at the test's own directory.
    ref_href = test_scan.match_refs[0]
    if ref_href.startswith("/"):
        ref_path = (upstream_root / ref_href.lstrip("/")).resolve()
    else:
        ref_path = (filepath.parent / ref_href).resolve()
    if not ref_path.exists():
        return False, None, f"reference not found: {ref_href}"

    # The reference must satisfy the same purity constraints.
    ref_scan, ref_err = scan_html_file(ref_path)
    if ref_err:
        return False, None, f"cannot read reference: {ref_err}"
    if ref_scan.has_script:
        return False, None, "reference contains <script>"
    if ref_scan.has_external_css:
        return False, None, "reference has external stylesheet"
    if ref_scan.has_disqualifying_tag:
        return False, None, "reference has disqualifying tag"
    if check_css_imports(ref_scan.get_css()):
        return False, None, "reference CSS has @import or external url()"

    return True, ref_path, None
def wpt_path_to_id(wpt_relpath):
    """Convert a WPT relative path to a manifest ID.

    css/css-box/margin-001.html -> wpt-css-css-box-margin-001

    Every parent directory component is kept in the ID, then the result is
    sanitized to lowercase alphanumerics separated by single hyphens.
    """
    rel = Path(wpt_relpath)
    raw = "wpt-" + "-".join(rel.parent.parts) + "-" + rel.stem
    # Sanitize: only alphanumeric and hyphens
    cleaned = re.sub(r"[^a-zA-Z0-9-]", "-", raw)
    # Collapse multiple hyphens
    cleaned = re.sub(r"-{2,}", "-", cleaned)
    return cleaned.lower().strip("-")
def load_existing_ids(manifest_path=None):
    """Return the set of test IDs already present in the manifest.

    Args:
        manifest_path: manifest file to read; defaults to MANIFEST_PATH.

    Uses a line-based scan instead of a TOML parser so the script needs no
    third-party dependencies. The previous implementation pre-filtered with
    startswith("id = "), which contradicted the flexible-whitespace regex
    below it and silently skipped valid lines like `id="x"`; the regex alone
    is now authoritative.
    """
    path = MANIFEST_PATH if manifest_path is None else manifest_path
    ids = set()
    if not path.exists():
        return ids
    for line in path.read_text().splitlines():
        m = re.match(r'id\s*=\s*"([^"]+)"', line.strip())
        if m:
            ids.add(m.group(1))
    return ids
def convert_xht_to_html(content):
    """Minimal .xht -> .html cleanup.

    Strips xmlns declarations (prefixed or not) that browsers tolerate but
    our parser may not, and rewrites XHTML self-closing void tags into plain
    HTML open tags.
    """
    without_ns = re.sub(r'\s+xmlns(?::[a-z]+)?="[^"]*"', "", content)
    return re.sub(
        r"<(br|hr|img|input|meta|link)([^>]*)\s*/>", r"<\1\2>", without_ns
    )
def _sparse_dirs(modules):
    """Return the sparse-checkout directory list for *modules*, plus the
    shared reference directories that reftests commonly point match links at."""
    dirs = [MODULE_DIRS[m] for m in modules if m in MODULE_DIRS]
    # Also include common reference directories used by reftests
    dirs.append("css/reference")
    dirs.append("css/CSS2/reference")
    return dirs


def _apply_sparse_checkout(dirs):
    """Set *dirs* as the sparse-checkout patterns of UPSTREAM_DIR."""
    if dirs:
        subprocess.run(
            ["git", "sparse-checkout", "set", "--no-cone"] + dirs,
            cwd=UPSTREAM_DIR,
            check=True,
        )


def phase_clone(modules):
    """Phase 1: Sparse-checkout the WPT repo.

    Idempotent: an existing valid clone only has its sparse-checkout
    patterns refreshed; a non-git leftover directory is removed and
    re-cloned from scratch.
    """
    if UPSTREAM_DIR.exists():
        print(f" upstream dir exists: {UPSTREAM_DIR}")
        # Check if it's a valid git repo with our content
        if (UPSTREAM_DIR / ".git").exists():
            print(" updating sparse-checkout patterns...")
            _apply_sparse_checkout(_sparse_dirs(modules))
            print(" done.")
            return
        print(" not a git repo, removing and re-cloning...")
        shutil.rmtree(UPSTREAM_DIR)
    print(f" cloning WPT repo (sparse, depth=1)...")
    UPSTREAM_DIR.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "git", "clone",
            "--depth", "1",
            "--filter=blob:none",
            "--sparse",
            WPT_REPO,
            str(UPSTREAM_DIR),
        ],
        check=True,
    )
    _apply_sparse_checkout(_sparse_dirs(modules))
    print(" clone complete.")
def phase_scan(modules):
    """Phase 2: Scan for qualifying reftests. Returns list of candidates.

    Each candidate is a dict with keys:
      wpt_relpath -- path relative to UPSTREAM_DIR (string)
      test_path   -- absolute Path of the test file
      ref_path    -- absolute Path of its match reference
      module      -- the MODULE_DIRS key it came from
      features    -- sorted CSS feature flags found in test+reference CSS
      title       -- the test's <title> text, stripped
    Also prints a per-reason disqualification summary.
    """
    candidates = []
    stats = {"total_files": 0, "qualified": 0, "disqualified": {}}
    for module_key in modules:
        if module_key not in MODULE_DIRS:
            print(f" warning: unknown module '{module_key}', skipping")
            continue
        module_path = UPSTREAM_DIR / MODULE_DIRS[module_key]
        if not module_path.exists():
            print(f" warning: {MODULE_DIRS[module_key]} not found in upstream, skipping")
            continue
        # Walk for .html and .xht files
        for filepath in sorted(module_path.rglob("*")):
            if filepath.suffix.lower() not in (".html", ".xht", ".xhtml", ".htm"):
                continue
            # Skip reference files (commonly named *-ref.html or *-ref.xht)
            if "-ref" in filepath.stem or filepath.stem.endswith("ref"):
                continue
            # Skip support/helper files
            if "support" in filepath.parts or "reference" in filepath.parts:
                continue
            stats["total_files"] += 1
            wpt_relpath = filepath.relative_to(UPSTREAM_DIR)
            ok, ref_path, reason = qualifies(filepath, UPSTREAM_DIR)
            if not ok:
                stats["disqualified"][reason] = stats["disqualified"].get(reason, 0) + 1
                continue
            stats["qualified"] += 1
            # Detect CSS features for flags/reason
            # NOTE(review): this re-parses both files even though qualifies()
            # already scanned them -- acceptable for a one-shot import tool.
            scanner, _ = scan_html_file(filepath)
            css = scanner.get_css() if scanner else ""
            ref_scanner, _ = scan_html_file(ref_path)
            ref_css = ref_scanner.get_css() if ref_scanner else ""
            features = detect_css_features(css + "\n" + ref_css)
            candidates.append({
                "wpt_relpath": str(wpt_relpath),
                "test_path": filepath,
                "ref_path": ref_path,
                "module": module_key,
                "features": features,
                "title": scanner.title.strip() if scanner else "",
            })
    print(f"\n Scan results:")
    print(f" Files examined: {stats['total_files']}")
    print(f" Qualified: {stats['qualified']}")
    print(f" Disqualified: {stats['total_files'] - stats['qualified']}")
    if stats["disqualified"]:
        print(f" Disqualification reasons:")
        # Most-frequent reason first.
        for reason, count in sorted(stats["disqualified"].items(), key=lambda x: -x[1]):
            print(f" {count:4d} {reason}")
    return candidates
def phase_import(candidates, dry_run=False):
    """Phase 3: Copy fixtures and append to manifest.

    Args:
        candidates: dicts as produced by phase_scan().
        dry_run: when True, print what would be imported without writing
            any fixture files or manifest entries.

    Idempotent: candidates whose generated ID is already in the manifest
    are skipped, so re-running never duplicates entries.
    """
    existing_ids = load_existing_ids()
    imported = 0
    skipped_existing = 0
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    manifest_entries = []
    for c in candidates:
        test_id = wpt_path_to_id(c["wpt_relpath"])
        if test_id in existing_ids:
            skipped_existing += 1
            continue
        test_src = c["test_path"]
        ref_src = c["ref_path"]
        # Determine fixture filenames
        test_dest = FIXTURES_DIR / f"{test_id}-test.html"
        ref_dest = FIXTURES_DIR / f"{test_id}-ref.html"
        # Build reason string
        # NOTE(review): reason/flags are interpolated into TOML without
        # escaping; fine for current path/feature values, but would break
        # if a path ever contained a double quote.
        reason_parts = [f"upstream: {c['wpt_relpath']}"]
        if c["features"]:
            reason_parts.append(f"features: {', '.join(c['features'])}")
        reason = "; ".join(reason_parts)
        # Build flags
        flags = list(set([c["module"]] + c["features"]))
        if dry_run:
            print(f" [dry-run] would import: {test_id}")
            print(f" from: {c['wpt_relpath']}")
            print(f" flags: {flags}")
        else:
            # Copy test file
            test_content = test_src.read_text(encoding="utf-8", errors="replace")
            if test_src.suffix.lower() in (".xht", ".xhtml"):
                test_content = convert_xht_to_html(test_content)
            test_dest.write_text(test_content, encoding="utf-8")
            # Copy reference file
            ref_content = ref_src.read_text(encoding="utf-8", errors="replace")
            if ref_src.suffix.lower() in (".xht", ".xhtml"):
                ref_content = convert_xht_to_html(ref_content)
            ref_dest.write_text(ref_content, encoding="utf-8")
            # Build manifest entry
            flags_str = ", ".join(f'"{f}"' for f in sorted(flags))
            entry = (
                f"\n[[case]]\n"
                f'id = "{test_id}"\n'
                f'input = "fixtures/{test_dest.name}"\n'
                f'mode = "reftest"\n'
                f'reference = "fixtures/{ref_dest.name}"\n'
                f'status = "known_fail"\n'
                f'reason = "{reason}"\n'
                f"flags = [{flags_str}]\n"
            )
            manifest_entries.append(entry)
            existing_ids.add(test_id)
            imported += 1
    if manifest_entries and not dry_run:
        # Append-only: hand-edited entries earlier in the file stay untouched.
        with open(MANIFEST_PATH, "a") as f:
            for entry in manifest_entries:
                f.write(entry)
    print(f"\n Import results:")
    print(f" Imported: {imported}")
    print(f" Skipped (exist): {skipped_existing}")
    if not dry_run and imported > 0:
        print(f" Manifest updated: {MANIFEST_PATH}")
def main():
    """CLI entry point: clone, scan, and (optionally) import WPT reftests."""
    parser = argparse.ArgumentParser(description="Bulk-import WPT CSS reftests")
    parser.add_argument(
        "--scan-only",
        action="store_true",
        help="Only scan and report; do not import",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be imported without writing files",
    )
    parser.add_argument(
        "--modules",
        type=str,
        default=None,
        help="Comma-separated list of module keys to import (default: all)",
    )
    parser.add_argument(
        "--max-tests",
        type=int,
        default=None,
        help="Maximum number of tests to import",
    )
    args = parser.parse_args()

    # Default to every known module; validate any user-supplied subset.
    modules = list(MODULE_DIRS.keys())
    if args.modules:
        modules = [m.strip() for m in args.modules.split(",")]
        unknown = [m for m in modules if m not in MODULE_DIRS]
        if unknown:
            print(f"Error: unknown modules: {unknown}")
            print(f"Available: {list(MODULE_DIRS.keys())}")
            sys.exit(1)

    print("Phase 1: Clone/update upstream WPT repo")
    phase_clone(modules)

    print("\nPhase 2: Scan for qualifying reftests")
    candidates = phase_scan(modules)
    # Compare against None explicitly so `--max-tests 0` is honored
    # (a bare truthiness check would silently ignore a zero limit).
    if args.max_tests is not None and len(candidates) > args.max_tests:
        print(f"\n Limiting to {args.max_tests} tests (of {len(candidates)} found)")
        candidates = candidates[: args.max_tests]

    if args.scan_only:
        print("\n --scan-only: stopping before import")
        print(f"\n Qualified tests by module:")
        by_module = {}
        for c in candidates:
            by_module.setdefault(c["module"], []).append(c)
        for module, tests in sorted(by_module.items()):
            print(f" {module}: {len(tests)}")
        return

    print("\nPhase 3: Import reftests")
    phase_import(candidates, dry_run=args.dry_run)
    print("\nDone.")


if __name__ == "__main__":
    main()