Add scripts/import_wpt_reftests.py to sparse-clone the upstream WPT repo and bulk-import qualifying CSS reftests (no JS, no external resources) as known_fail entries. 23 tests already pass and are promoted. The import script is idempotent and exposed via `just import-wpt`. CI now prints the WPT summary (pass=36 known_fail=2877 skip=1) on every run. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
550 lines
18 KiB
Python
550 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""Bulk-import WPT CSS reftests into the project's wpt_manifest.toml.
|
|
|
|
Three phases:
|
|
1. Clone: sparse-checkout the upstream WPT repo (minimal download)
|
|
2. Scan: find qualifying reftests (no JS, no external resources)
|
|
3. Import: copy fixtures and append manifest entries as known_fail
|
|
|
|
Usage:
|
|
python3 scripts/import_wpt_reftests.py [--scan-only] [--dry-run] [--modules M] [--max-tests N]
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
|
|
# Upstream repository that phase 1 sparse-clones.
WPT_REPO = "https://github.com/web-platform-tests/wpt.git"

# This script lives in <project>/scripts/, so parent.parent is the project root.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Sparse checkout of the upstream WPT tree (phase 1 target).
UPSTREAM_DIR = PROJECT_ROOT / "tests" / "external" / "wpt" / "upstream"

# Destination directory for imported test/reference fixture pairs (phase 3).
FIXTURES_DIR = PROJECT_ROOT / "tests" / "external" / "wpt" / "fixtures"

# TOML manifest that phase 3 appends [[case]] entries to.
MANIFEST_PATH = PROJECT_ROOT / "tests" / "external" / "wpt" / "wpt_manifest.toml"
|
|
|
|
# CSS directories to sparse-checkout.
# Keys are the module names accepted by the --modules CLI option (and used as
# manifest flags); values are paths relative to the upstream WPT repo root.
MODULE_DIRS = {
    "CSS2-box-display": "css/CSS2/box-display",
    "CSS2-margin-padding-clear": "css/CSS2/margin-padding-clear",
    "CSS2-normal-flow": "css/CSS2/normal-flow",
    "CSS2-positioning": "css/CSS2/positioning",
    "CSS2-floats": "css/CSS2/floats",
    "CSS2-floats-clear": "css/CSS2/floats-clear",
    "css-box": "css/css-box",
    "css-display": "css/css-display",
    "css-backgrounds": "css/css-backgrounds",
    "css-text": "css/css-text",
    "css-flexbox": "css/css-flexbox",
    "css-inline": "css/css-inline",
    "css-tables": "css/css-tables",
    "css-position": "css/css-position",
}
|
|
|
|
# Tags whose presence disqualifies a test.
# <script> is tracked separately by ReftestScanner for a more precise
# disqualification reason; all others set has_disqualifying_tag.
DISQUALIFYING_TAGS = frozenset([
    "script", "img", "video", "canvas", "iframe", "object", "embed",
    "svg", "audio", "source", "picture", "math",
])
|
|
|
|
# CSS patterns that hint at unsupported features (for reason/flags, not disqualifying).
# Each (regex, flag) pair is applied case-insensitively by detect_css_features();
# the flag names feed the manifest `flags` list and the human-readable reason.
UNSUPPORTED_CSS_PATTERNS = [
    (r"::before|::after", "pseudo-elements"),
    (r"::first-line|::first-letter", "pseudo-elements"),
    (r":hover|:focus|:active|:visited", "dynamic-pseudo-classes"),
    (r"@media", "media-queries"),
    (r"@keyframes|animation", "animations"),
    (r"@font-face", "font-face"),
    (r"transition\s*:", "transitions"),
    (r"transform\s*:", "transforms"),
    (r"text-decoration", "text-decoration"),
    (r"text-shadow", "text-shadow"),
    (r"box-shadow", "box-shadow"),
    (r"opacity\s*:", "opacity"),
    (r"counter-reset|counter-increment|content\s*:", "generated-content"),
    (r"writing-mode|direction\s*:", "writing-modes"),
    (r"column-count|column-width|columns\s*:", "multi-column"),
    (r"grid-template|grid-area|display\s*:\s*grid", "grid"),
    (r"filter\s*:", "filters"),
    (r"clip-path", "clip-path"),
    (r"outline\s*:", "outline"),
    (r"list-style", "list-style"),
    (r"word-spacing|letter-spacing", "text-spacing"),
    (r"white-space\s*:", "white-space"),
    (r"text-transform", "text-transform"),
    (r"vertical-align", "vertical-align"),
    (r"line-height", "line-height"),
    (r"text-indent", "text-indent"),
    (r"visibility\s*:", "visibility"),
    (r"cursor\s*:", "cursor"),
    (r"table-layout", "table-layout"),
    (r"border-collapse|border-spacing", "table-borders"),
    (r"min-width|max-width|min-height|max-height", "min-max-sizing"),
    (r"overflow\s*:", "overflow"),
    (r"z-index", "z-index"),
]
|
|
|
|
|
|
class ReftestScanner(HTMLParser):
    """Extract reftest metadata from a WPT test (or reference) HTML file.

    After ``feed()``-ing a document, these attributes describe it:

    match_refs            -- href values from <link rel="match"> (may be empty)
    has_mismatch          -- a <link rel="mismatch"> was seen
    has_script            -- a <script> tag was seen
    has_external_css      -- a <link rel="stylesheet"> was seen
    has_disqualifying_tag -- a non-script tag from DISQUALIFYING_TAGS was seen
    style_content         -- raw text chunks collected inside <style> elements
    title                 -- concatenated text of the <title> element
    """

    def __init__(self):
        super().__init__()
        self.match_refs = []  # href values from <link rel="match">
        self.has_mismatch = False
        self.has_script = False
        self.has_external_css = False
        self.has_disqualifying_tag = False
        self.style_content = []  # text chunks inside <style>...</style>
        self._in_style = False
        self.title = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        # dict(attrs) keeps the LAST occurrence of a duplicated attribute, and
        # stores a value-less attribute (e.g. "<link rel>") as None -- the
        # `or ""` guards below normalize that to "" instead of crashing on
        # .lower()/.append(None).
        attrs_dict = dict(attrs)
        tag_lower = tag.lower()

        if tag_lower == "link":
            # rel is a space-separated token list per HTML, so compare tokens
            # rather than the raw string (handles rel=" match " etc.).
            rel_tokens = (attrs_dict.get("rel") or "").lower().split()
            href = attrs_dict.get("href") or ""
            if "match" in rel_tokens:
                self.match_refs.append(href)
            elif "mismatch" in rel_tokens:
                self.has_mismatch = True
            elif "stylesheet" in rel_tokens:
                self.has_external_css = True

        if tag_lower in DISQUALIFYING_TAGS:
            # <script> is reported separately so qualifies() can give a more
            # precise disqualification reason.
            if tag_lower == "script":
                self.has_script = True
            else:
                self.has_disqualifying_tag = True

        if tag_lower == "style":
            self._in_style = True

        if tag_lower == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag.lower() == "style":
            self._in_style = False
        if tag.lower() == "title":
            self._in_title = False

    def handle_data(self, data):
        # Text may arrive in multiple chunks; accumulate rather than assign.
        if self._in_style:
            self.style_content.append(data)
        if self._in_title:
            self.title += data

    def get_css(self):
        """Return all inline <style> content joined into one string."""
        return "\n".join(self.style_content)
|
|
|
|
|
|
def check_css_imports(css_text):
    """Report whether *css_text* pulls in external resources.

    True when the CSS contains an @import rule, or any url(...) reference
    whose target is neither empty nor a data: URL.
    """
    if re.search(r"@import\b", css_text) is not None:
        return True
    # Collect every url(...) target, stripped of whitespace and quotes;
    # any non-data:, non-empty target disqualifies the stylesheet.
    targets = (
        m.group(1).strip().strip("'\"")
        for m in re.finditer(r"url\s*\(([^)]*)\)", css_text)
    )
    return any(t and not t.startswith("data:") for t in targets)
|
|
|
|
|
|
def detect_css_features(css_text):
    """Return the sorted, de-duplicated feature flags matched in *css_text*.

    Each entry of UNSUPPORTED_CSS_PATTERNS is tested case-insensitively;
    a flag appears at most once in the result.
    """
    return sorted({
        flag
        for pattern, flag in UNSUPPORTED_CSS_PATTERNS
        if re.search(pattern, css_text, re.IGNORECASE)
    })
|
|
|
|
|
|
def scan_html_file(filepath):
    """Parse *filepath* with ReftestScanner.

    Returns (scanner, None) on success, or (None, error_message) when the
    file cannot be read or the parser raises.
    """
    try:
        markup = filepath.read_text(encoding="utf-8", errors="replace")
    except (OSError, UnicodeDecodeError) as exc:
        return None, str(exc)

    parser = ReftestScanner()
    try:
        parser.feed(markup)
    except Exception as exc:
        return None, f"parse error: {exc}"
    return parser, None
|
|
|
|
|
|
def qualifies(filepath, upstream_root):
    """Decide whether *filepath* is an importable reftest.

    Returns a (ok, ref_path_or_None, disqualify_reason) triple.  Both the
    test and its first <link rel="match"> reference must be readable,
    script-free, self-contained (no external CSS or resources) and free of
    disqualifying tags.
    """
    test_scan, read_err = scan_html_file(filepath)
    if read_err:
        return False, None, f"cannot read: {read_err}"

    # The test must declare a match reference and contain nothing we
    # cannot render deterministically.
    if not test_scan.match_refs:
        return False, None, "no <link rel='match'>"
    if test_scan.has_mismatch:
        return False, None, "has mismatch ref (unsupported)"
    if test_scan.has_script:
        return False, None, "contains <script>"
    if test_scan.has_external_css:
        return False, None, "has external stylesheet"
    if test_scan.has_disqualifying_tag:
        return False, None, "has disqualifying HTML tag"
    if check_css_imports(test_scan.get_css()):
        return False, None, "CSS has @import or external url()"

    # Resolve the first match reference: absolute hrefs are rooted at the
    # upstream checkout, relative ones at the test's own directory.
    first_ref = test_scan.match_refs[0]
    if first_ref.startswith("/"):
        ref_path = (upstream_root / first_ref.lstrip("/")).resolve()
    else:
        ref_path = (filepath.parent / first_ref).resolve()

    if not ref_path.exists():
        return False, None, f"reference not found: {first_ref}"

    # The reference must satisfy the same self-containment rules.
    ref_scan, ref_read_err = scan_html_file(ref_path)
    if ref_read_err:
        return False, None, f"cannot read reference: {ref_read_err}"
    if ref_scan.has_script:
        return False, None, "reference contains <script>"
    if ref_scan.has_external_css:
        return False, None, "reference has external stylesheet"
    if ref_scan.has_disqualifying_tag:
        return False, None, "reference has disqualifying tag"
    if check_css_imports(ref_scan.get_css()):
        return False, None, "reference CSS has @import or external url()"

    return True, ref_path, None
|
|
|
|
|
|
def wpt_path_to_id(wpt_relpath):
    """Convert a WPT-relative path to a manifest test ID.

    Every directory component is kept (the leading "css/" is NOT stripped):

        css/css-box/margin-001.html -> wpt-css-css-box-margin-001

    The result is lowercase, uses only [a-z0-9-], and has no leading,
    trailing, or repeated hyphens.
    """
    path = Path(wpt_relpath)
    # "wpt-" prefix + every parent directory + extension-less filename.
    name = "-".join(("wpt",) + path.parent.parts + (path.stem,))
    # Sanitize: anything outside alphanumerics/hyphen becomes a hyphen, ...
    name = re.sub(r"[^a-zA-Z0-9-]", "-", name)
    # ... then hyphen runs collapse and edge hyphens are trimmed.
    name = re.sub(r"-{2,}", "-", name)
    return name.lower().strip("-")
|
|
|
|
|
|
def load_existing_ids():
    """Return the set of test IDs already present in the manifest.

    A missing manifest yields an empty set.  IDs are recognised by matching
    each stripped line against `id = "..."`; whitespace around `=` is
    tolerated, so `id="x"` is found too (the previous startswith("id = ")
    pre-filter silently missed that spelling).
    """
    ids = set()
    if not MANIFEST_PATH.exists():
        return ids
    id_re = re.compile(r'id\s*=\s*"([^"]+)"')
    # Manifest entries are written as UTF-8 by phase_import; read the same way.
    for line in MANIFEST_PATH.read_text(encoding="utf-8").splitlines():
        m = id_re.match(line.strip())
        if m:
            ids.add(m.group(1))
    return ids
|
|
|
|
|
|
def convert_xht_to_html(content):
    """Lightly rewrite XHTML markup so the HTML pipeline can consume it."""
    # Strip xmlns / xmlns:prefix declarations, which a plain HTML parser
    # would otherwise treat as ordinary attributes.
    without_ns = re.sub(r'\s+xmlns(?::[a-z]+)?="[^"]*"', "", content)
    # Turn XHTML self-closing void elements (<br/>, <hr/>, ...) into their
    # HTML forms.
    return re.sub(
        r"<(br|hr|img|input|meta|link)([^>]*)\s*/>",
        r"<\1\2>",
        without_ns,
    )
|
|
|
|
|
|
def phase_clone(modules):
    """Phase 1: Sparse-checkout (or update) the upstream WPT repo.

    Idempotent: an existing valid clone only gets its sparse-checkout
    patterns refreshed; a leftover non-git directory is removed and
    re-cloned.  The previous version duplicated the pattern-building and
    `git sparse-checkout set` logic in both paths; it now lives in the
    _sparse_dirs/_set_sparse_checkout helpers.
    """
    if UPSTREAM_DIR.exists():
        print(f" upstream dir exists: {UPSTREAM_DIR}")
        # Check if it's a valid git repo with our content.
        if (UPSTREAM_DIR / ".git").exists():
            print(" updating sparse-checkout patterns...")
            _set_sparse_checkout(modules)
            print(" done.")
            return
        print(" not a git repo, removing and re-cloning...")
        shutil.rmtree(UPSTREAM_DIR)

    print(" cloning WPT repo (sparse, depth=1)...")
    UPSTREAM_DIR.mkdir(parents=True, exist_ok=True)

    # Blobless + sparse keeps the initial download minimal.
    subprocess.run(
        [
            "git", "clone",
            "--depth", "1",
            "--filter=blob:none",
            "--sparse",
            WPT_REPO,
            str(UPSTREAM_DIR),
        ],
        check=True,
    )

    _set_sparse_checkout(modules)
    print(" clone complete.")


def _sparse_dirs(modules):
    """Return the repo-relative directories to sparse-checkout for *modules*."""
    dirs = [MODULE_DIRS[m] for m in modules if m in MODULE_DIRS]
    # Also include common reference directories used by reftests.
    dirs.append("css/reference")
    dirs.append("css/CSS2/reference")
    return dirs


def _set_sparse_checkout(modules):
    """Apply sparse-checkout patterns for *modules* inside UPSTREAM_DIR."""
    dirs = _sparse_dirs(modules)
    if dirs:
        subprocess.run(
            ["git", "sparse-checkout", "set", "--no-cone"] + dirs,
            cwd=UPSTREAM_DIR,
            check=True,
        )
|
|
|
|
|
|
def phase_scan(modules):
    """Phase 2: Scan for qualifying reftests. Returns list of candidates.

    Walks every requested module directory under UPSTREAM_DIR, filters out
    reference/support files by name, and runs qualifies() on each remaining
    HTML/XHT file.  Each returned candidate dict carries the test/reference
    paths, the module key, detected CSS feature flags, and the document
    title.  A per-reason disqualification summary is printed at the end.
    """
    candidates = []
    # total_files counts only files surviving the name-based filters below;
    # "disqualified" maps reason string -> occurrence count.
    stats = {"total_files": 0, "qualified": 0, "disqualified": {}}

    for module_key in modules:
        if module_key not in MODULE_DIRS:
            print(f" warning: unknown module '{module_key}', skipping")
            continue

        module_path = UPSTREAM_DIR / MODULE_DIRS[module_key]
        if not module_path.exists():
            # The sparse checkout may not contain this directory (e.g. the
            # clone was made with a different --modules selection).
            print(f" warning: {MODULE_DIRS[module_key]} not found in upstream, skipping")
            continue

        # Walk for .html and .xht files (sorted for deterministic order)
        for filepath in sorted(module_path.rglob("*")):
            if filepath.suffix.lower() not in (".html", ".xht", ".xhtml", ".htm"):
                continue

            # Skip reference files (commonly named *-ref.html or *-ref.xht)
            if "-ref" in filepath.stem or filepath.stem.endswith("ref"):
                continue

            # Skip support/helper files
            if "support" in filepath.parts or "reference" in filepath.parts:
                continue

            stats["total_files"] += 1

            wpt_relpath = filepath.relative_to(UPSTREAM_DIR)
            ok, ref_path, reason = qualifies(filepath, UPSTREAM_DIR)

            if not ok:
                stats["disqualified"][reason] = stats["disqualified"].get(reason, 0) + 1
                continue

            stats["qualified"] += 1

            # Detect CSS features for flags/reason.  NOTE: this re-parses both
            # files that qualifies() already scanned internally; acceptable for
            # a one-shot import script.
            scanner, _ = scan_html_file(filepath)
            css = scanner.get_css() if scanner else ""
            ref_scanner, _ = scan_html_file(ref_path)
            ref_css = ref_scanner.get_css() if ref_scanner else ""
            features = detect_css_features(css + "\n" + ref_css)

            candidates.append({
                "wpt_relpath": str(wpt_relpath),
                "test_path": filepath,
                "ref_path": ref_path,
                "module": module_key,
                "features": features,
                "title": scanner.title.strip() if scanner else "",
            })

    print(f"\n Scan results:")
    print(f" Files examined: {stats['total_files']}")
    print(f" Qualified: {stats['qualified']}")
    print(f" Disqualified: {stats['total_files'] - stats['qualified']}")
    if stats["disqualified"]:
        print(f" Disqualification reasons:")
        # Most frequent reasons first.
        for reason, count in sorted(stats["disqualified"].items(), key=lambda x: -x[1]):
            print(f" {count:4d} {reason}")

    return candidates
|
|
|
|
|
|
def _toml_str(value):
    """Render *value* as a double-quoted TOML basic string (escaped)."""
    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def phase_import(candidates, dry_run=False):
    """Phase 3: Copy fixtures and append [[case]] entries to the manifest.

    Idempotent: candidates whose derived ID already exists in the manifest
    are skipped.  With dry_run=True nothing is written; the would-be imports
    are printed instead.
    """
    existing_ids = load_existing_ids()
    imported = 0
    skipped_existing = 0

    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)

    manifest_entries = []

    for c in candidates:
        test_id = wpt_path_to_id(c["wpt_relpath"])

        if test_id in existing_ids:
            skipped_existing += 1
            continue

        test_src = c["test_path"]
        ref_src = c["ref_path"]

        # Fixture filenames derive from the manifest ID so a test/reference
        # pair is easy to associate on disk.
        test_dest = FIXTURES_DIR / f"{test_id}-test.html"
        ref_dest = FIXTURES_DIR / f"{test_id}-ref.html"

        # Build reason string (records upstream origin + detected features)
        reason_parts = [f"upstream: {c['wpt_relpath']}"]
        if c["features"]:
            reason_parts.append(f"features: {', '.join(c['features'])}")
        reason = "; ".join(reason_parts)

        # Flags: the module key plus every detected CSS feature, de-duplicated
        flags = list(set([c["module"]] + c["features"]))

        if dry_run:
            print(f" [dry-run] would import: {test_id}")
            print(f" from: {c['wpt_relpath']}")
            print(f" flags: {flags}")
        else:
            # Copy test file, converting XHTML sources to plain HTML
            test_content = test_src.read_text(encoding="utf-8", errors="replace")
            if test_src.suffix.lower() in (".xht", ".xhtml"):
                test_content = convert_xht_to_html(test_content)
            test_dest.write_text(test_content, encoding="utf-8")

            # Copy reference file the same way
            ref_content = ref_src.read_text(encoding="utf-8", errors="replace")
            if ref_src.suffix.lower() in (".xht", ".xhtml"):
                ref_content = convert_xht_to_html(ref_content)
            ref_dest.write_text(ref_content, encoding="utf-8")

        # Build manifest entry.  Free-form string values go through
        # _toml_str so a quote or backslash in an upstream path cannot
        # corrupt the TOML manifest.
        flags_str = ", ".join(_toml_str(f) for f in sorted(flags))
        entry = (
            f"\n[[case]]\n"
            f'id = "{test_id}"\n'
            f'input = "fixtures/{test_dest.name}"\n'
            f'mode = "reftest"\n'
            f'reference = "fixtures/{ref_dest.name}"\n'
            f'status = "known_fail"\n'
            f"reason = {_toml_str(reason)}\n"
            f"flags = [{flags_str}]\n"
        )
        manifest_entries.append(entry)

        # Track within this run too, so duplicate candidates collapse.
        existing_ids.add(test_id)
        imported += 1

    if manifest_entries and not dry_run:
        with open(MANIFEST_PATH, "a", encoding="utf-8") as f:
            f.writelines(manifest_entries)

    print(f"\n Import results:")
    print(f" Imported: {imported}")
    print(f" Skipped (exist): {skipped_existing}")
    if not dry_run and imported > 0:
        print(f" Manifest updated: {MANIFEST_PATH}")
|
|
|
|
|
|
def main():
    """CLI entry point: parse options, then run the three import phases."""
    parser = argparse.ArgumentParser(description="Bulk-import WPT CSS reftests")
    parser.add_argument(
        "--scan-only",
        action="store_true",
        help="Only scan and report; do not import",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be imported without writing files",
    )
    parser.add_argument(
        "--modules",
        type=str,
        default=None,
        help="Comma-separated list of module keys to import (default: all)",
    )
    parser.add_argument(
        "--max-tests",
        type=int,
        default=None,
        help="Maximum number of tests to import",
    )
    opts = parser.parse_args()

    # Default to every known module; --modules narrows (and validates) the set.
    if opts.modules:
        modules = [token.strip() for token in opts.modules.split(",")]
        unknown = [m for m in modules if m not in MODULE_DIRS]
        if unknown:
            print(f"Error: unknown modules: {unknown}")
            print(f"Available: {list(MODULE_DIRS.keys())}")
            sys.exit(1)
    else:
        modules = list(MODULE_DIRS.keys())

    print("Phase 1: Clone/update upstream WPT repo")
    phase_clone(modules)

    print("\nPhase 2: Scan for qualifying reftests")
    candidates = phase_scan(modules)

    # Apply the --max-tests cap after scanning so the report stays complete.
    if opts.max_tests and len(candidates) > opts.max_tests:
        print(f"\n Limiting to {opts.max_tests} tests (of {len(candidates)} found)")
        candidates = candidates[: opts.max_tests]

    if opts.scan_only:
        print("\n --scan-only: stopping before import")
        print(f"\n Qualified tests by module:")
        per_module = {}
        for cand in candidates:
            per_module.setdefault(cand["module"], []).append(cand)
        for module, tests in sorted(per_module.items()):
            print(f" {module}: {len(tests)}")
        return

    print("\nPhase 3: Import reftests")
    phase_import(candidates, dry_run=opts.dry_run)

    print("\nDone.")
|
|
|
|
|
|
# Run the importer when executed as a script.
if __name__ == "__main__":
    main()
|