Files
rust_browser/scripts/import_wpt_reftests.py
Zachary D. Rowitsch 16abbd78e7 Bulk-import 2899 WPT CSS reftests and add import tooling
Add scripts/import_wpt_reftests.py to sparse-clone the upstream WPT repo
and bulk-import qualifying CSS reftests (no JS, no external resources) as
known_fail entries. 23 tests already pass and are promoted. The import
script is idempotent and exposed via `just import-wpt`. CI now prints the
WPT summary (pass=36 known_fail=2877 skip=1) on every run.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 00:15:27 -05:00

550 lines
18 KiB
Python

#!/usr/bin/env python3
"""Bulk-import WPT CSS reftests into the project's wpt_manifest.toml.
Three phases:
1. Clone: sparse-checkout the upstream WPT repo (minimal download)
2. Scan: find qualifying reftests (no JS, no external resources)
3. Import: copy fixtures and append manifest entries as known_fail
Usage:
python3 scripts/import_wpt_reftests.py [--scan-only] [--dry-run] [--modules M] [--max-tests N]
"""
import argparse
import os
import re
import shutil
import subprocess
import sys
from html.parser import HTMLParser
from pathlib import Path
# Upstream repository that tests are sparse-cloned from.
WPT_REPO = "https://github.com/web-platform-tests/wpt.git"
# Repo root is two levels up from this script (scripts/ sits at the root).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Sparse checkout of upstream WPT lives here.
UPSTREAM_DIR = PROJECT_ROOT / "tests" / "external" / "wpt" / "upstream"
# Imported test/reference HTML files are copied here.
FIXTURES_DIR = PROJECT_ROOT / "tests" / "external" / "wpt" / "fixtures"
# Manifest that the test harness reads; import appends [[case]] entries to it.
MANIFEST_PATH = PROJECT_ROOT / "tests" / "external" / "wpt" / "wpt_manifest.toml"
# CSS directories to sparse-checkout
# (CLI module key / manifest flag -> directory inside the WPT repo)
MODULE_DIRS = {
    "CSS2-box-display": "css/CSS2/box-display",
    "CSS2-margin-padding-clear": "css/CSS2/margin-padding-clear",
    "CSS2-normal-flow": "css/CSS2/normal-flow",
    "CSS2-positioning": "css/CSS2/positioning",
    "CSS2-floats": "css/CSS2/floats",
    "CSS2-floats-clear": "css/CSS2/floats-clear",
    "css-box": "css/css-box",
    "css-display": "css/css-display",
    "css-backgrounds": "css/css-backgrounds",
    "css-text": "css/css-text",
    "css-flexbox": "css/css-flexbox",
    "css-inline": "css/css-inline",
    "css-tables": "css/css-tables",
    "css-position": "css/css-position",
}
# Tags whose presence disqualifies a test
# (<script> is tracked separately; the rest set has_disqualifying_tag)
DISQUALIFYING_TAGS = frozenset([
    "script", "img", "video", "canvas", "iframe", "object", "embed",
    "svg", "audio", "source", "picture", "math",
])
# CSS patterns that hint at unsupported features (for reason/flags, not disqualifying)
# Each entry is (regex searched case-insensitively, flag name recorded in the manifest).
UNSUPPORTED_CSS_PATTERNS = [
    (r"::before|::after", "pseudo-elements"),
    (r"::first-line|::first-letter", "pseudo-elements"),
    (r":hover|:focus|:active|:visited", "dynamic-pseudo-classes"),
    (r"@media", "media-queries"),
    (r"@keyframes|animation", "animations"),
    (r"@font-face", "font-face"),
    (r"transition\s*:", "transitions"),
    (r"transform\s*:", "transforms"),
    (r"text-decoration", "text-decoration"),
    (r"text-shadow", "text-shadow"),
    (r"box-shadow", "box-shadow"),
    (r"opacity\s*:", "opacity"),
    (r"counter-reset|counter-increment|content\s*:", "generated-content"),
    (r"writing-mode|direction\s*:", "writing-modes"),
    (r"column-count|column-width|columns\s*:", "multi-column"),
    (r"grid-template|grid-area|display\s*:\s*grid", "grid"),
    (r"filter\s*:", "filters"),
    (r"clip-path", "clip-path"),
    (r"outline\s*:", "outline"),
    (r"list-style", "list-style"),
    (r"word-spacing|letter-spacing", "text-spacing"),
    (r"white-space\s*:", "white-space"),
    (r"text-transform", "text-transform"),
    (r"vertical-align", "vertical-align"),
    (r"line-height", "line-height"),
    (r"text-indent", "text-indent"),
    (r"visibility\s*:", "visibility"),
    (r"cursor\s*:", "cursor"),
    (r"table-layout", "table-layout"),
    (r"border-collapse|border-spacing", "table-borders"),
    (r"min-width|max-width|min-height|max-height", "min-max-sizing"),
    (r"overflow\s*:", "overflow"),
    (r"z-index", "z-index"),
]
class ReftestScanner(HTMLParser):
    """Extract metadata from a WPT reftest HTML file.

    After feed(), exposes:
      match_refs            -- href values from <link rel="match">
      has_mismatch          -- saw <link rel="mismatch"> (unsupported)
      has_script            -- saw a <script> tag
      has_external_css      -- saw <link rel="stylesheet">
      has_disqualifying_tag -- saw a non-script tag from DISQUALIFYING_TAGS
      title                 -- concatenated <title> text
    """

    def __init__(self):
        super().__init__()
        self.match_refs = []  # href values from <link rel="match">
        self.has_mismatch = False
        self.has_script = False
        self.has_external_css = False
        self.has_disqualifying_tag = False
        self.style_content = []  # raw text chunks seen inside <style>
        self._in_style = False
        self.title = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        tag_lower = tag.lower()
        if tag_lower == "link":
            # html.parser reports valueless attributes (e.g. <link rel>)
            # with a None value; coerce to "" so .lower()/.startswith
            # cannot raise AttributeError on malformed test files.
            rel = (attrs_dict.get("rel") or "").lower()
            href = attrs_dict.get("href") or ""
            if rel == "match":
                self.match_refs.append(href)
            elif rel == "mismatch":
                self.has_mismatch = True
            elif rel == "stylesheet":
                self.has_external_css = True
        if tag_lower in DISQUALIFYING_TAGS:
            if tag_lower == "script":
                self.has_script = True
            else:
                self.has_disqualifying_tag = True
        if tag_lower == "style":
            self._in_style = True
        if tag_lower == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag.lower() == "style":
            self._in_style = False
        if tag.lower() == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_style:
            self.style_content.append(data)
        if self._in_title:
            self.title += data

    def get_css(self):
        """Return all inline <style> text joined with newlines."""
        return "\n".join(self.style_content)
def check_css_imports(css_text):
    """Return True if CSS has disqualifying @import or url() references.

    data: URLs are allowed (no external fetch needed); any other non-empty
    url() target, or any @import rule, disqualifies the stylesheet.
    """
    if re.search(r"@import\b", css_text) is not None:
        return True
    targets = (
        m.group(1).strip().strip("'\"")
        for m in re.finditer(r"url\s*\(([^)]*)\)", css_text)
    )
    return any(t and not t.startswith("data:") for t in targets)
def detect_css_features(css_text):
    """Return a sorted list of CSS feature flags detected in *css_text*.

    Each pattern in UNSUPPORTED_CSS_PATTERNS is searched case-insensitively;
    duplicate flags collapse via the set comprehension.
    """
    hits = {
        flag
        for pattern, flag in UNSUPPORTED_CSS_PATTERNS
        if re.search(pattern, css_text, re.IGNORECASE)
    }
    return sorted(hits)
def scan_html_file(filepath):
    """Scan an HTML file and return (scanner, error_or_None).

    On success the second element is None; on failure the first element is
    None and the second is a human-readable error string.
    """
    try:
        text = filepath.read_text(encoding="utf-8", errors="replace")
    except (OSError, UnicodeDecodeError) as e:
        return None, str(e)

    parser = ReftestScanner()
    try:
        parser.feed(text)
    except Exception as e:
        # html.parser can raise on pathological input; report, don't crash.
        return None, f"parse error: {e}"
    return parser, None
def qualifies(filepath, upstream_root):
    """Check if a file qualifies as an importable reftest.

    Returns (ok, ref_path_or_None, disqualify_reason). The test and its
    first match reference must both be self-contained: no scripts, no
    external stylesheets, no disqualifying tags, no external CSS resources.
    """
    test_scan, test_err = scan_html_file(filepath)
    if test_err:
        return False, None, f"cannot read: {test_err}"

    # Vet the test document itself.
    if not test_scan.match_refs:
        return False, None, "no <link rel='match'>"
    if test_scan.has_mismatch:
        return False, None, "has mismatch ref (unsupported)"
    if test_scan.has_script:
        return False, None, "contains <script>"
    if test_scan.has_external_css:
        return False, None, "has external stylesheet"
    if test_scan.has_disqualifying_tag:
        return False, None, "has disqualifying HTML tag"
    if check_css_imports(test_scan.get_css()):
        return False, None, "CSS has @import or external url()"

    # Resolve the first match reference: absolute WPT paths are rooted at
    # the upstream checkout, relative ones at the test's own directory.
    ref_href = test_scan.match_refs[0]
    if ref_href.startswith("/"):
        ref_path = (upstream_root / ref_href.lstrip("/")).resolve()
    else:
        ref_path = (filepath.parent / ref_href).resolve()
    if not ref_path.exists():
        return False, None, f"reference not found: {ref_href}"

    # The reference must satisfy the same purity constraints.
    ref_scan, ref_err = scan_html_file(ref_path)
    if ref_err:
        return False, None, f"cannot read reference: {ref_err}"
    if ref_scan.has_script:
        return False, None, "reference contains <script>"
    if ref_scan.has_external_css:
        return False, None, "reference has external stylesheet"
    if ref_scan.has_disqualifying_tag:
        return False, None, "reference has disqualifying tag"
    if check_css_imports(ref_scan.get_css()):
        return False, None, "reference CSS has @import or external url()"

    return True, ref_path, None
def wpt_path_to_id(wpt_relpath):
    """Convert a WPT relative path to a manifest ID.

    css/css-box/margin-001.html -> wpt-css-css-box-margin-001

    Every parent directory component is kept in the ID, then the result is
    sanitized to lowercase alphanumerics separated by single hyphens.
    """
    rel = Path(wpt_relpath)
    raw = "wpt-" + "-".join(rel.parent.parts) + "-" + rel.stem
    # Sanitize: only alphanumeric and hyphens
    cleaned = re.sub(r"[^a-zA-Z0-9-]", "-", raw)
    # Collapse multiple hyphens
    cleaned = re.sub(r"-{2,}", "-", cleaned)
    return cleaned.lower().strip("-")
def load_existing_ids(manifest_path=None):
    """Return the set of test IDs already present in the manifest.

    Args:
        manifest_path: manifest file to read; defaults to MANIFEST_PATH.

    Uses a line-based scan instead of a TOML parser so the script needs no
    third-party dependencies. The previous implementation pre-filtered with
    startswith("id = "), which contradicted the flexible-whitespace regex
    below it and silently skipped valid lines like `id="x"`; the regex alone
    is now authoritative.
    """
    path = MANIFEST_PATH if manifest_path is None else manifest_path
    ids = set()
    if not path.exists():
        return ids
    for line in path.read_text().splitlines():
        m = re.match(r'id\s*=\s*"([^"]+)"', line.strip())
        if m:
            ids.add(m.group(1))
    return ids
def convert_xht_to_html(content):
    """Minimal .xht -> .html cleanup.

    Strips xmlns declarations (prefixed or not) that browsers tolerate but
    our parser may not, and rewrites XHTML self-closing void tags into plain
    HTML open tags.
    """
    without_ns = re.sub(r'\s+xmlns(?::[a-z]+)?="[^"]*"', "", content)
    return re.sub(
        r"<(br|hr|img|input|meta|link)([^>]*)\s*/>", r"<\1\2>", without_ns
    )
def _sparse_dirs(modules):
    """Return the sparse-checkout directory list for *modules*, plus the
    shared reference directories that reftests commonly point match links at."""
    dirs = [MODULE_DIRS[m] for m in modules if m in MODULE_DIRS]
    # Also include common reference directories used by reftests
    dirs.append("css/reference")
    dirs.append("css/CSS2/reference")
    return dirs


def _apply_sparse_checkout(dirs):
    """Set *dirs* as the sparse-checkout patterns of UPSTREAM_DIR."""
    if dirs:
        subprocess.run(
            ["git", "sparse-checkout", "set", "--no-cone"] + dirs,
            cwd=UPSTREAM_DIR,
            check=True,
        )


def phase_clone(modules):
    """Phase 1: Sparse-checkout the WPT repo.

    Idempotent: an existing valid clone only has its sparse-checkout
    patterns refreshed; a non-git leftover directory is removed and
    re-cloned from scratch.
    """
    if UPSTREAM_DIR.exists():
        print(f" upstream dir exists: {UPSTREAM_DIR}")
        # Check if it's a valid git repo with our content
        if (UPSTREAM_DIR / ".git").exists():
            print(" updating sparse-checkout patterns...")
            _apply_sparse_checkout(_sparse_dirs(modules))
            print(" done.")
            return
        print(" not a git repo, removing and re-cloning...")
        shutil.rmtree(UPSTREAM_DIR)
    print(f" cloning WPT repo (sparse, depth=1)...")
    UPSTREAM_DIR.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "git", "clone",
            "--depth", "1",
            "--filter=blob:none",
            "--sparse",
            WPT_REPO,
            str(UPSTREAM_DIR),
        ],
        check=True,
    )
    _apply_sparse_checkout(_sparse_dirs(modules))
    print(" clone complete.")
def phase_scan(modules):
    """Phase 2: Scan for qualifying reftests. Returns list of candidates.

    Each candidate is a dict with keys:
      wpt_relpath -- path relative to UPSTREAM_DIR (string)
      test_path   -- absolute Path of the test file
      ref_path    -- absolute Path of its match reference
      module      -- the MODULE_DIRS key it came from
      features    -- sorted CSS feature flags found in test+reference CSS
      title       -- the test's <title> text, stripped
    Also prints a per-reason disqualification summary.
    """
    candidates = []
    stats = {"total_files": 0, "qualified": 0, "disqualified": {}}
    for module_key in modules:
        if module_key not in MODULE_DIRS:
            print(f" warning: unknown module '{module_key}', skipping")
            continue
        module_path = UPSTREAM_DIR / MODULE_DIRS[module_key]
        if not module_path.exists():
            print(f" warning: {MODULE_DIRS[module_key]} not found in upstream, skipping")
            continue
        # Walk for .html and .xht files
        for filepath in sorted(module_path.rglob("*")):
            if filepath.suffix.lower() not in (".html", ".xht", ".xhtml", ".htm"):
                continue
            # Skip reference files (commonly named *-ref.html or *-ref.xht)
            if "-ref" in filepath.stem or filepath.stem.endswith("ref"):
                continue
            # Skip support/helper files
            if "support" in filepath.parts or "reference" in filepath.parts:
                continue
            stats["total_files"] += 1
            wpt_relpath = filepath.relative_to(UPSTREAM_DIR)
            ok, ref_path, reason = qualifies(filepath, UPSTREAM_DIR)
            if not ok:
                stats["disqualified"][reason] = stats["disqualified"].get(reason, 0) + 1
                continue
            stats["qualified"] += 1
            # Detect CSS features for flags/reason
            # NOTE(review): this re-parses both files even though qualifies()
            # already scanned them -- acceptable for a one-shot import tool.
            scanner, _ = scan_html_file(filepath)
            css = scanner.get_css() if scanner else ""
            ref_scanner, _ = scan_html_file(ref_path)
            ref_css = ref_scanner.get_css() if ref_scanner else ""
            features = detect_css_features(css + "\n" + ref_css)
            candidates.append({
                "wpt_relpath": str(wpt_relpath),
                "test_path": filepath,
                "ref_path": ref_path,
                "module": module_key,
                "features": features,
                "title": scanner.title.strip() if scanner else "",
            })
    print(f"\n Scan results:")
    print(f" Files examined: {stats['total_files']}")
    print(f" Qualified: {stats['qualified']}")
    print(f" Disqualified: {stats['total_files'] - stats['qualified']}")
    if stats["disqualified"]:
        print(f" Disqualification reasons:")
        # Most-frequent reason first.
        for reason, count in sorted(stats["disqualified"].items(), key=lambda x: -x[1]):
            print(f" {count:4d} {reason}")
    return candidates
def phase_import(candidates, dry_run=False):
    """Phase 3: Copy fixtures and append to manifest.

    Args:
        candidates: dicts as produced by phase_scan().
        dry_run: when True, print what would be imported without writing
            any fixture files or manifest entries.

    Idempotent: candidates whose generated ID is already in the manifest
    are skipped, so re-running never duplicates entries.
    """
    existing_ids = load_existing_ids()
    imported = 0
    skipped_existing = 0
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    manifest_entries = []
    for c in candidates:
        test_id = wpt_path_to_id(c["wpt_relpath"])
        if test_id in existing_ids:
            skipped_existing += 1
            continue
        test_src = c["test_path"]
        ref_src = c["ref_path"]
        # Determine fixture filenames
        test_dest = FIXTURES_DIR / f"{test_id}-test.html"
        ref_dest = FIXTURES_DIR / f"{test_id}-ref.html"
        # Build reason string
        # NOTE(review): reason/flags are interpolated into TOML without
        # escaping; fine for current path/feature values, but would break
        # if a path ever contained a double quote.
        reason_parts = [f"upstream: {c['wpt_relpath']}"]
        if c["features"]:
            reason_parts.append(f"features: {', '.join(c['features'])}")
        reason = "; ".join(reason_parts)
        # Build flags
        flags = list(set([c["module"]] + c["features"]))
        if dry_run:
            print(f" [dry-run] would import: {test_id}")
            print(f" from: {c['wpt_relpath']}")
            print(f" flags: {flags}")
        else:
            # Copy test file
            test_content = test_src.read_text(encoding="utf-8", errors="replace")
            if test_src.suffix.lower() in (".xht", ".xhtml"):
                test_content = convert_xht_to_html(test_content)
            test_dest.write_text(test_content, encoding="utf-8")
            # Copy reference file
            ref_content = ref_src.read_text(encoding="utf-8", errors="replace")
            if ref_src.suffix.lower() in (".xht", ".xhtml"):
                ref_content = convert_xht_to_html(ref_content)
            ref_dest.write_text(ref_content, encoding="utf-8")
            # Build manifest entry
            flags_str = ", ".join(f'"{f}"' for f in sorted(flags))
            entry = (
                f"\n[[case]]\n"
                f'id = "{test_id}"\n'
                f'input = "fixtures/{test_dest.name}"\n'
                f'mode = "reftest"\n'
                f'reference = "fixtures/{ref_dest.name}"\n'
                f'status = "known_fail"\n'
                f'reason = "{reason}"\n'
                f"flags = [{flags_str}]\n"
            )
            manifest_entries.append(entry)
            existing_ids.add(test_id)
            imported += 1
    if manifest_entries and not dry_run:
        # Append-only: hand-edited entries earlier in the file stay untouched.
        with open(MANIFEST_PATH, "a") as f:
            for entry in manifest_entries:
                f.write(entry)
    print(f"\n Import results:")
    print(f" Imported: {imported}")
    print(f" Skipped (exist): {skipped_existing}")
    if not dry_run and imported > 0:
        print(f" Manifest updated: {MANIFEST_PATH}")
def main():
    """CLI entry point: clone, scan, and (optionally) import WPT reftests."""
    parser = argparse.ArgumentParser(description="Bulk-import WPT CSS reftests")
    parser.add_argument(
        "--scan-only",
        action="store_true",
        help="Only scan and report; do not import",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be imported without writing files",
    )
    parser.add_argument(
        "--modules",
        type=str,
        default=None,
        help="Comma-separated list of module keys to import (default: all)",
    )
    parser.add_argument(
        "--max-tests",
        type=int,
        default=None,
        help="Maximum number of tests to import",
    )
    args = parser.parse_args()

    # Default to every known module; validate any user-supplied subset.
    modules = list(MODULE_DIRS.keys())
    if args.modules:
        modules = [m.strip() for m in args.modules.split(",")]
        unknown = [m for m in modules if m not in MODULE_DIRS]
        if unknown:
            print(f"Error: unknown modules: {unknown}")
            print(f"Available: {list(MODULE_DIRS.keys())}")
            sys.exit(1)

    print("Phase 1: Clone/update upstream WPT repo")
    phase_clone(modules)

    print("\nPhase 2: Scan for qualifying reftests")
    candidates = phase_scan(modules)
    # Compare against None explicitly so `--max-tests 0` is honored
    # (a bare truthiness check would silently ignore a zero limit).
    if args.max_tests is not None and len(candidates) > args.max_tests:
        print(f"\n Limiting to {args.max_tests} tests (of {len(candidates)} found)")
        candidates = candidates[: args.max_tests]

    if args.scan_only:
        print("\n --scan-only: stopping before import")
        print(f"\n Qualified tests by module:")
        by_module = {}
        for c in candidates:
            by_module.setdefault(c["module"], []).append(c)
        for module, tests in sorted(by_module.items()):
            print(f" {module}: {len(tests)}")
        return

    print("\nPhase 3: Import reftests")
    phase_import(candidates, dry_run=args.dry_run)
    print("\nDone.")


if __name__ == "__main__":
    main()