v2: automated BIOS platform with full pipeline

Reorganized 6 branches into bios/Manufacturer/Console/. Scrapers for RetroArch, Batocera, Recalbox, and libretro core-info. Platform-aware verification replicating native logic per platform. Pack generation with dedup, alias resolution, variant support. CI/CD: weekly auto-scrape, auto-release, PR validation. Large files (>50MB) stored as GitHub Release assets, auto-fetched at build time.
2026-06-29 22:02:48 +00:00 · 2026-03-17 10:54:39 +01:00 · 2026-03-17 10:54:39 +01:00 · 13c561888d
commit 13c561888d
parent 5f96368f6d
7038 changed files with 3243612 additions and 29617 deletions
--- a/scripts/scraper/init.py
+++ b/scripts/scraper/init.py
@ -0,0 +1,46 @@
+"""Scraper plugin discovery module.
+
+Auto-detects *_scraper.py files and exposes their scrapers.
+Each scraper module must define:
+    PLATFORM_NAME: str
+    Scraper: class inheriting BaseScraper
+"""
+
+from __future__ import annotations
+
+import importlib
+import pkgutil
+from pathlib import Path
+
+from .base_scraper import BaseScraper
+
+# Module-level cache of discovered scrapers, keyed by each module's PLATFORM_NAME.
+# Filled on the first discover_scrapers() call and returned as-is afterwards.
+_scrapers: dict[str, type] = {}
+
+
+def discover_scrapers() -> dict[str, type]:
+    """Auto-discover all *_scraper.py modules and return {platform_name: ScraperClass}.
+
+    Every module in this package whose name ends with ``_scraper`` is imported.
+    A module is registered only if it defines a truthy ``PLATFORM_NAME`` and a
+    ``Scraper`` class deriving from ``BaseScraper``; anything else is skipped.
+    The result is cached in the module-level ``_scrapers`` dict, so the scan is
+    repeated only while that cache is still empty.
+
+    Returns:
+        The shared cache dict mapping platform name to scraper class.
+    """
+    if _scrapers:
+        return _scrapers
+
+    package_dir = Path(__file__).parent
+
+    for _finder, name, _ispkg in pkgutil.iter_modules([str(package_dir)]):
+        if not name.endswith("_scraper"):
+            continue
+
+        module = importlib.import_module(f".{name}", package=__package__)
+
+        platform_name = getattr(module, "PLATFORM_NAME", None)
+        scraper_class = getattr(module, "Scraper", None)
+
+        # issubclass() raises TypeError for non-class objects, so make sure
+        # "Scraper" really is a class before checking its ancestry.
+        if (
+            platform_name
+            and isinstance(scraper_class, type)
+            and issubclass(scraper_class, BaseScraper)
+        ):
+            _scrapers[platform_name] = scraper_class
+
+    return _scrapers
+
+
+def get_scraper(platform_name: str) -> BaseScraper | None:
+    """Return a freshly constructed scraper for *platform_name*, or None if unknown."""
+    scraper_cls = discover_scrapers().get(platform_name)
+    if not scraper_cls:
+        return None
+    return scraper_cls()
--- a/scripts/scraper/base_scraper.py
+++ b/scripts/scraper/base_scraper.py
@ -0,0 +1,155 @@
+"""Base scraper interface for platform BIOS requirement sources."""
+
+from __future__ import annotations
+
+import json
+import urllib.request
+import urllib.error
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+
+@dataclass
+class BiosRequirement:
+    """A single BIOS file requirement from a platform source."""
+    name: str  # File name used, together with `system`, as the comparison key
+    system: str  # System identifier the file belongs to
+    sha1: str | None = None  # Hex digests; None when the source does not provide one
+    md5: str | None = None
+    crc32: str | None = None
+    size: int | None = None  # presumably the size in bytes - TODO confirm against producers
+    destination: str = ""  # Path of the file as reported by the source
+    required: bool = True  # False for files the source marks as optional
+    zipped_file: str | None = None  # If set, md5 is for this ROM inside the ZIP
+
+
+@dataclass
+class ChangeSet:
+    """Result of diffing scraped requirements against an existing config.
+
+    ``modified`` holds ``(old, new)`` pairs.
+    """
+    added: list[BiosRequirement] = field(default_factory=list)
+    removed: list[BiosRequirement] = field(default_factory=list)
+    modified: list[tuple[BiosRequirement, BiosRequirement]] = field(default_factory=list)
+
+    @property
+    def has_changes(self) -> bool:
+        """True when at least one of the three lists is non-empty."""
+        return any((self.added, self.removed, self.modified))
+
+    def summary(self) -> str:
+        """Return a one-line, comma-separated count of each non-empty category."""
+        categories = (
+            ("+", self.added, "added"),
+            ("-", self.removed, "removed"),
+            ("~", self.modified, "modified"),
+        )
+        text = ", ".join(
+            f"{sign}{len(items)} {label}"
+            for sign, items, label in categories
+            if items
+        )
+        return text or "no changes"
+
+
+class BaseScraper(ABC):
+    """Abstract base class for platform BIOS requirement scrapers.
+
+    Subclasses implement ``fetch_requirements`` and ``validate_format``; the
+    comparison and connectivity helpers are built on top of those.
+    """
+
+    @abstractmethod
+    def fetch_requirements(self) -> list[BiosRequirement]:
+        """Fetch current BIOS requirements from the platform source."""
+        ...
+
+    @staticmethod
+    def _requirement_from_config(system: str, entry: dict) -> BiosRequirement:
+        """Build a BiosRequirement from one ``files`` entry of a platform config."""
+        return BiosRequirement(
+            name=entry["name"],
+            system=system,
+            sha1=entry.get("sha1"),
+            md5=entry.get("md5"),
+        )
+
+    @staticmethod
+    def _hash_differs(scraped: str | None, configured: object) -> bool:
+        """Return True only when both digests are present and differ.
+
+        Hex digests are compared case-insensitively so that "ABCD" and "abcd"
+        are not reported as a modification.
+        """
+        if not scraped or not configured:
+            return False
+        return scraped.lower() != str(configured).lower()
+
+    def compare_with_config(self, config: dict) -> ChangeSet:
+        """Compare scraped requirements against existing platform config.
+
+        Args:
+            config: Platform config. Its optional ``"systems"`` mapping goes from
+                system id to a dict whose optional ``"files"`` list holds entries
+                with a ``"name"`` and optional ``"sha1"`` / ``"md5"``.
+
+        Returns:
+            A ChangeSet keyed on ``(system, name)``: entries only scraped are
+            ``added``, entries only in the config are ``removed``, and entries
+            present in both whose SHA-1 or MD5 differ are ``modified`` as
+            ``(config_entry, scraped_entry)`` pairs.
+        """
+        scraped = self.fetch_requirements()
+        changes = ChangeSet()
+
+        existing = {
+            (sys_id, entry["name"]): entry
+            for sys_id, system in config.get("systems", {}).items()
+            for entry in system.get("files", [])
+        }
+        # Later duplicates of the same (system, name) win, as with the config.
+        scraped_map = {(req.system, req.name): req for req in scraped}
+
+        for key, req in scraped_map.items():
+            if key not in existing:
+                changes.added.append(req)
+                continue
+
+            entry = existing[key]
+            if (
+                self._hash_differs(req.sha1, entry.get("sha1"))
+                or self._hash_differs(req.md5, entry.get("md5"))
+            ):
+                changes.modified.append(
+                    (self._requirement_from_config(key[0], entry), req)
+                )
+
+        for key, entry in existing.items():
+            if key not in scraped_map:
+                changes.removed.append(self._requirement_from_config(key[0], entry))
+
+        return changes
+
+    def test_connection(self) -> bool:
+        """Test if the source URL is reachable.
+
+        Deliberately best-effort: any failure of ``fetch_requirements`` is
+        reported as ``False`` rather than raised.
+        """
+        try:
+            self.fetch_requirements()
+            return True
+        except Exception:
+            return False
+
+    @abstractmethod
+    def validate_format(self, raw_data: str) -> bool:
+        """Validate source data format. Returns False if format has changed unexpectedly."""
+        ...
+
+
+def fetch_github_latest_version(repo: str) -> str | None:
+    """Fetch the latest release version tag from a GitHub repo.
+
+    Args:
+        repo: Repository in ``owner/name`` form.
+
+    Returns:
+        The ``tag_name`` of the latest release, or None when the request fails,
+        the response is not a JSON object, or no tag name is present.
+    """
+    url = f"https://api.github.com/repos/{repo}/releases/latest"
+    try:
+        req = urllib.request.Request(url, headers={
+            "User-Agent": "retrobios-scraper/1.0",
+            "Accept": "application/vnd.github.v3+json",
+        })
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            data = json.loads(resp.read())
+    except (OSError, ValueError):
+        # URLError/HTTPError and socket timeouts raised while reading the body
+        # are all OSError; JSON and Unicode decoding errors are ValueError.
+        return None
+
+    if not isinstance(data, dict):
+        return None
+    # Normalise a missing or empty tag to None, as the return type promises.
+    return data.get("tag_name") or None
+
+
+def fetch_github_latest_tag(repo: str, prefix: str = "") -> str | None:
+    """Fetch the most recent matching tag from a GitHub repo.
+
+    Args:
+        repo: Repository in ``owner/name`` form.
+        prefix: If non-empty, prefer the first listed tag starting with it.
+
+    Returns:
+        The first tag (in API order, first 50 tags only) whose name starts with
+        *prefix*. If none matches, the first listed tag is returned as a
+        fallback. None when the request fails or no usable tag is listed.
+    """
+    url = f"https://api.github.com/repos/{repo}/tags?per_page=50"
+    try:
+        req = urllib.request.Request(url, headers={
+            "User-Agent": "retrobios-scraper/1.0",
+            "Accept": "application/vnd.github.v3+json",
+        })
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            tags = json.loads(resp.read())
+    except (OSError, ValueError):
+        # OSError covers URLError/HTTPError and read timeouts; ValueError
+        # covers JSON and Unicode decoding failures.
+        return None
+
+    # Tolerate unexpected payloads (e.g. an error object instead of a list).
+    if not isinstance(tags, list):
+        return None
+    names = [
+        tag["name"] for tag in tags
+        if isinstance(tag, dict) and isinstance(tag.get("name"), str)
+    ]
+    if not names:
+        return None
+
+    for name in names:
+        if name.startswith(prefix):  # an empty prefix matches every name
+            return name
+    return names[0]
--- a/scripts/scraper/batocera_scraper.py
+++ b/scripts/scraper/batocera_scraper.py
@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""Scraper for Batocera batocera-systems.
+
+Source: https://github.com/batocera-linux/batocera.linux/.../batocera-systems
+Format: Python dict with systems -> biosFiles
+Hash: MD5 primary
+"""
+
+from __future__ import annotations
+
+import ast
+import re
+import sys
+import urllib.request
+import urllib.error
+
+from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_tag
+
+# Name under which scraper discovery registers this module's Scraper class.
+PLATFORM_NAME = "batocera"
+
+# Raw download URL of the batocera-systems script (master branch); the default
+# source for Scraper and the "source" value of the generated platform config.
+SOURCE_URL = (
+    "https://raw.githubusercontent.com/batocera-linux/batocera.linux/"
+    "master/package/batocera/core/batocera-scripts/scripts/batocera-systems"
+)
+
+# Maps keys of the batocera-systems `systems` dict to the system slug stored on
+# each BiosRequirement. Several Batocera systems may share one slug. Keys that
+# are not listed here are used unchanged by Scraper.fetch_requirements().
+SYSTEM_SLUG_MAP = {
+    "atari800": "atari-400-800",
+    "atari5200": "atari-5200",
+    "atarist": "atari-st",
+    "lynx": "atari-lynx",
+    "3do": "3do",
+    "amiga": "commodore-amiga",
+    "amiga600": "commodore-amiga",
+    "amiga1200": "commodore-amiga",
+    "amigacd32": "commodore-amiga",
+    "amigacdtv": "commodore-amiga",
+    "c128": "commodore-c128",
+    "colecovision": "coleco-colecovision",
+    "dreamcast": "sega-dreamcast",
+    "naomi": "sega-dreamcast-arcade",
+    "naomi2": "sega-dreamcast-arcade",
+    "atomiswave": "sega-dreamcast-arcade",
+    "fds": "nintendo-fds",
+    "gamecube": "nintendo-gamecube",
+    "gb": "nintendo-gb",
+    "gba": "nintendo-gba",
+    "gbc": "nintendo-gbc",
+    "nds": "nintendo-ds",
+    "n64dd": "nintendo-64dd",
+    "satellaview": "nintendo-satellaview",
+    "sgb": "nintendo-sgb",
+    "snes": "nintendo-snes",
+    "channelf": "fairchild-channel-f",
+    "intellivision": "mattel-intellivision",
+    "msx": "microsoft-msx",
+    "msx1": "microsoft-msx",
+    "msx2": "microsoft-msx",
+    "msxturbor": "microsoft-msx",
+    "neogeo": "snk-neogeo",
+    "neogeocd": "snk-neogeo-cd",
+    "odyssey2": "magnavox-odyssey2",
+    "pcengine": "nec-pc-engine",
+    "pcenginecd": "nec-pc-engine",
+    "supergrafx": "nec-pc-engine",
+    "pc88": "nec-pc-88",
+    "pc98": "nec-pc-98",
+    "pcfx": "nec-pc-fx",
+    "psx": "sony-playstation",
+    "ps2": "sony-playstation-2",
+    "psp": "sony-psp",
+    "saturn": "sega-saturn",
+    "segacd": "sega-mega-cd",
+    "mastersystem": "sega-master-system",
+    "megadrive": "sega-mega-drive",
+    "gamegear": "sega-game-gear",
+    "x1": "sharp-x1",
+    "x68000": "sharp-x68000",
+    "zxspectrum": "sinclair-zx-spectrum",
+    "scummvm": "scummvm",
+    "doom": "doom",
+    "macintosh": "apple-macintosh-ii",
+    "dos": "dos",
+    "videopac": "philips-videopac",
+    "pokemini": "nintendo-pokemon-mini",
+}
+
+
+class Scraper(BaseScraper):
+    """Scraper for the batocera-systems Python dict.
+
+    The source script is downloaded once per instance and cached. The
+    ``systems`` dict literal is cut out of it and parsed with
+    ``ast.literal_eval``; the downloaded script is never executed.
+    """
+
+    def __init__(self, url: str = SOURCE_URL):
+        self.url = url
+        self._raw_data: str | None = None
+
+    def _fetch_raw(self) -> str:
+        """Download the batocera-systems source, caching it on the instance.
+
+        Raises:
+            ConnectionError: If the request or the read of the response fails.
+        """
+        if self._raw_data is not None:
+            return self._raw_data
+
+        try:
+            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                self._raw_data = resp.read().decode("utf-8")
+                return self._raw_data
+        except OSError as e:
+            # URLError is an OSError, and so are socket timeouts raised while
+            # reading the body, which urllib does not wrap in URLError.
+            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
+
+    @staticmethod
+    def _find_closing_brace(text: str, start: int) -> int:
+        """Return the index of the ``}`` matching the ``{`` at *start*.
+
+        Braces and ``#`` characters inside quoted strings are ignored, and
+        ``#`` comments outside strings are skipped to the end of their line.
+
+        Raises:
+            ValueError: If the braces never balance.
+        """
+        depth = 0
+        quote = None  # the quote character of the string being scanned, if any
+        i = start
+        length = len(text)
+        while i < length:
+            ch = text[i]
+            if quote is not None:
+                if ch == "\\":
+                    i += 2  # skip the escaped character
+                    continue
+                if ch == quote:
+                    quote = None
+            elif ch in ('"', "'"):
+                quote = ch
+            elif ch == "#":
+                newline = text.find("\n", i)
+                if newline == -1:
+                    break
+                i = newline
+            elif ch == "{":
+                depth += 1
+            elif ch == "}":
+                depth -= 1
+                if depth == 0:
+                    return i
+            i += 1
+        raise ValueError("Failed to parse systems dict: unbalanced braces")
+
+    def _extract_systems_dict(self, raw: str) -> dict:
+        """Extract and parse the 'systems' dict from the Python source via ast.literal_eval.
+
+        Only literal syntax is supported. Comments inside the dict need no
+        special treatment because the Python parser discards them.
+
+        Raises:
+            ValueError: If the assignment is missing or the dict cannot be parsed.
+        """
+        match = re.search(r'^systems\s*=\s*\{', raw, re.MULTILINE)
+        if not match:
+            raise ValueError("Could not find 'systems = {' in batocera-systems")
+
+        start = match.end() - 1  # the pattern ends on the opening brace
+        end = self._find_closing_brace(raw, start)
+
+        try:
+            return ast.literal_eval(raw[start:end + 1])
+        except (SyntaxError, ValueError) as e:
+            raise ValueError(f"Failed to parse systems dict: {e}") from e
+
+    def fetch_requirements(self) -> list[BiosRequirement]:
+        """Parse batocera-systems and return BIOS requirements.
+
+        Raises:
+            ConnectionError: If the source cannot be downloaded.
+            ValueError: If the source fails validation or parsing.
+        """
+        raw = self._fetch_raw()
+
+        if not self.validate_format(raw):
+            raise ValueError("batocera-systems format validation failed")
+
+        systems = self._extract_systems_dict(raw)
+        requirements = []
+
+        for sys_key, sys_data in systems.items():
+            system_slug = SYSTEM_SLUG_MAP.get(sys_key, sys_key)
+
+            for bios in sys_data.get("biosFiles", []):
+                file_path = bios.get("file", "")
+                md5 = bios.get("md5", "")
+                zipped_file = bios.get("zippedFile", "")
+
+                # Destinations are stored relative to the bios directory.
+                if file_path.startswith("bios/"):
+                    file_path = file_path[5:]
+
+                requirements.append(BiosRequirement(
+                    name=file_path.rsplit("/", 1)[-1],
+                    system=system_slug,
+                    md5=md5 or None,
+                    destination=file_path,
+                    required=True,
+                    zipped_file=zipped_file or None,
+                ))
+
+        return requirements
+
+    def validate_format(self, raw_data: str) -> bool:
+        """Validate batocera-systems format.
+
+        Checks for the ``systems = {`` assignment and the ``biosFiles``,
+        ``"md5"`` and ``"file"`` markers the parser relies on.
+        """
+        has_systems = "systems" in raw_data and "biosFiles" in raw_data
+        has_dict = re.search(r'^systems\s*=\s*\{', raw_data, re.MULTILINE) is not None
+        has_md5 = '"md5"' in raw_data
+        has_file = '"file"' in raw_data
+        return has_systems and has_dict and has_md5 and has_file
+
+    @staticmethod
+    def _fetch_latest_version() -> str:
+        """Return the highest numeric ``batocera-<N>`` tag as a string, or "".
+
+        Tags are compared numerically because the API does not list them in
+        version order. The lookup is best-effort: any failure yields "".
+        """
+        import json as _json
+
+        prefix = "batocera-"
+        url = "https://api.github.com/repos/batocera-linux/batocera.linux/tags?per_page=50"
+        try:
+            req = urllib.request.Request(url, headers={
+                "User-Agent": "retrobios-scraper/1.0",
+                "Accept": "application/vnd.github.v3+json",
+            })
+            with urllib.request.urlopen(req, timeout=15) as resp:
+                tags = _json.loads(resp.read())
+            versions = []
+            for tag in tags:
+                tag_name = tag["name"]
+                if tag_name.startswith(prefix):
+                    number = tag_name[len(prefix):]
+                    if number.isdigit():
+                        versions.append(int(number))
+        except Exception:
+            # The version is informational only; never fail generation over it.
+            return ""
+        return str(max(versions)) if versions else ""
+
+    def generate_platform_yaml(self) -> dict:
+        """Generate a platform YAML config dict from scraped data."""
+        systems = {}
+        for req in self.fetch_requirements():
+            entry = {
+                "name": req.name,
+                "destination": req.destination,
+                "required": req.required,
+            }
+            if req.md5:
+                entry["md5"] = req.md5
+            if req.zipped_file:
+                entry["zipped_file"] = req.zipped_file
+
+            systems.setdefault(req.system, {"files": []})["files"].append(entry)
+
+        return {
+            "platform": "Batocera",
+            "version": self._fetch_latest_version(),
+            "homepage": "https://batocera.org",
+            "source": SOURCE_URL,
+            "base_destination": "bios",
+            "hash_type": "md5",
+            "verification_mode": "md5",
+            "systems": systems,
+        }
+
+
+def main():
+    """CLI entry point for testing.
+
+    ``--dry-run`` lists the scraped files per system, ``--json`` prints the
+    generated platform config as JSON, and ``--output`` writes it as YAML
+    (requires PyYAML). With no option only a one-line summary is printed.
+    Exits with status 1 if scraping fails.
+    """
+    import argparse
+    import json
+
+    parser = argparse.ArgumentParser(description="Scrape batocera-systems")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--json", action="store_true")
+    parser.add_argument("--output", "-o")
+    args = parser.parse_args()
+
+    scraper = Scraper()
+
+    try:
+        reqs = scraper.fetch_requirements()
+    except (ConnectionError, ValueError) as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Group once; both the dry-run listing and the summary need it.
+    by_system = {}
+    for req in reqs:
+        by_system.setdefault(req.system, []).append(req)
+
+    if args.dry_run:
+        for system, files in sorted(by_system.items()):
+            print(f"\n{system} ({len(files)} files):")
+            for f in files:
+                hash_info = f.md5[:12] if f.md5 else "no-hash"
+                print(f"  {f.name} ({hash_info}...)")
+
+        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
+        return
+
+    if args.json:
+        config = scraper.generate_platform_yaml()
+        print(json.dumps(config, indent=2))
+        return
+
+    if args.output:
+        try:
+            import yaml
+        except ImportError:
+            print("Error: PyYAML required", file=sys.stderr)
+            sys.exit(1)
+
+        config = scraper.generate_platform_yaml()
+        # allow_unicode=True emits non-ASCII characters verbatim, so the file
+        # must not depend on the locale's default encoding.
+        with open(args.output, "w", encoding="utf-8") as f:
+            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
+        print(f"Written to {args.output}")
+    else:
+        print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/scraper/coreinfo_scraper.py
+++ b/scripts/scraper/coreinfo_scraper.py
@ -0,0 +1,334 @@
+#!/usr/bin/env python3
+"""Scraper for libretro-core-info firmware declarations.
+
+Source: https://github.com/libretro/libretro-core-info
+Format: .info files with firmware0_path, firmware0_desc, firmware0_opt patterns
+Hash: From notes field (MD5) or cross-referenced with System.dat
+
+Complements libretro_scraper (System.dat) with:
+- Exact firmware paths per core
+- Required vs optional status
+- Firmware for cores not covered by System.dat
+"""
+
+from __future__ import annotations
+
+import re
+import sys
+import urllib.request
+import urllib.error
+import json
+
+try:
+    from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_version
+except ImportError:
+    # Allow running directly: python scripts/scraper/coreinfo_scraper.py
+    import os
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+    from scraper.base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_version
+
+# Name under which scraper discovery registers this module's Scraper class.
+PLATFORM_NAME = "libretro_coreinfo"
+
+# GitHub REST endpoint of the repository (used to list its file tree) and the
+# raw-content base URL (used to download individual .info files).
+GITHUB_API = "https://api.github.com/repos/libretro/libretro-core-info"
+RAW_BASE = "https://raw.githubusercontent.com/libretro/libretro-core-info/master"
+
+# Maps a core name (the .info file name without its "_libretro.info" suffix) to
+# the system slug stored on each BiosRequirement. Cores that are not listed
+# fall back to the core name itself in Scraper.fetch_requirements().
+CORE_SYSTEM_MAP = {
+    "pcsx_rearmed": "sony-playstation",
+    "mednafen_psx": "sony-playstation",
+    "mednafen_psx_hw": "sony-playstation",
+    "swanstation": "sony-playstation",
+    "duckstation": "sony-playstation",
+    "pcsx1": "sony-playstation",
+    "lrps2": "sony-playstation-2",
+    "play": "sony-playstation-2",
+    "ppsspp": "sony-psp",
+    "fbneo": "arcade",
+    "mame": "arcade",
+    "mame2003": "arcade",
+    "mame2003_plus": "arcade",
+    "dolphin": "nintendo-gamecube",
+    "melonds": "nintendo-ds",
+    "melonds_ds": "nintendo-ds",
+    "desmume": "nintendo-ds",
+    "mgba": "nintendo-gba",
+    "vba_next": "nintendo-gba",
+    "gpsp": "nintendo-gba",
+    "gambatte": "nintendo-gb",
+    "sameboy": "nintendo-gb",
+    "gearboy": "nintendo-gb",
+    "bsnes": "nintendo-snes",
+    "snes9x": "nintendo-snes",
+    "higan_sfc": "nintendo-snes",
+    "mesen-s": "nintendo-snes",
+    "nestopia": "nintendo-nes",
+    "fceumm": "nintendo-nes",
+    "mesen": "nintendo-nes",
+    "mupen64plus_next": "nintendo-64",
+    "parallel_n64": "nintendo-64",
+    "flycast": "sega-dreamcast",
+    "reicast": "sega-dreamcast",
+    "kronos": "sega-saturn",
+    "mednafen_saturn": "sega-saturn",
+    "yabause": "sega-saturn",
+    "genesis_plus_gx": "sega-mega-drive",
+    "picodrive": "sega-mega-drive",
+    "mednafen_pce": "nec-pc-engine",
+    "mednafen_pce_fast": "nec-pc-engine",
+    "mednafen_pcfx": "nec-pc-fx",
+    "mednafen_ngp": "snk-neogeo-pocket",
+    "mednafen_lynx": "atari-lynx",
+    "handy": "atari-lynx",
+    "hatari": "atari-st",
+    "puae": "commodore-amiga",
+    "fuse": "sinclair-zx-spectrum",
+    "dosbox_pure": "dos",
+    "dosbox_svn": "dos",
+    "scummvm": "scummvm",
+    "opera": "3do",
+    "4do": "3do",
+    "ep128emu": "enterprise-64-128",
+    "freej2me": "j2me",
+    "squirreljme": "j2me",
+    "numero": "ti-83",
+    "neocd": "snk-neogeo-cd",
+    "vice_x64": "commodore-c64",
+    "vice_x128": "commodore-c128",
+    "cap32": "amstrad-cpc",
+    "o2em": "magnavox-odyssey2",
+    "vecx": "vectrex",
+    "virtualjaguar": "atari-jaguar",
+    "prosystem": "atari-7800",
+    "stella": "atari-2600",
+    "a5200": "atari-5200",
+    "bluemsx": "microsoft-msx",
+    "fmsx": "microsoft-msx",
+    "px68k": "sharp-x68000",
+    "x1": "sharp-x1",
+    "quasi88": "nec-pc-88",
+    "np2kai": "nec-pc-98",
+    "theodore": "thomson",
+    "81": "sinclair-zx81",
+    "crocods": "amstrad-cpc",
+    "dinothawr": "dinothawr",
+}
+
+
+def _parse_info_file(content: str) -> dict:
+    """Turn the text of a .info file into a ``{key: value}`` dict.
+
+    Blank lines, ``#`` comment lines and lines that are not ``key = value``
+    are ignored. One pair of surrounding double quotes is dropped from the
+    value; a key that appears twice keeps its last value.
+    """
+    assignment = re.compile(r'^(\w+)\s*=\s*"?(.*?)"?\s*$')
+    entries = {}
+    for raw_line in content.split("\n"):
+        text = raw_line.strip()
+        if text == "" or text[0] == "#":
+            continue
+        found = assignment.match(text)
+        if found is None:
+            continue
+        entries[found.group(1)] = found.group(2)
+    return entries
+
+
+# File extensions (lower-case, with leading dot) that _is_native_lib() treats
+# as libraries or executables rather than BIOS files.
+_SKIP_EXTENSIONS = {".dll", ".so", ".dylib", ".exe", ".bat", ".sh"}
+# Words that, when found in a firmware description, make _is_directory_ref()
+# treat the entry as a directory.
+_DIRECTORY_MARKERS = {"folder", "directory", "dir"}
+
+
+def _is_directory_ref(path: str, desc: str) -> bool:
+    """Tell whether a firmware entry points at a directory instead of a file.
+
+    An entry counts as a directory when the last path component has no dot,
+    or when the description (case-insensitively) contains a directory marker.
+    """
+    last_component = path.split("/")[-1]
+    if "." not in last_component:
+        return True
+
+    lowered = desc.lower()
+    for marker in _DIRECTORY_MARKERS:
+        if marker in lowered:
+            return True
+    return False
+
+
+def _is_native_lib(path: str) -> bool:
+    """Tell whether *path* ends in one of the skipped extensions.
+
+    The text after the last dot of the whole path is compared,
+    case-insensitively, against ``_SKIP_EXTENSIONS``; such files are not BIOS.
+    """
+    if "." not in path:
+        suffix = ""
+    else:
+        _, _, tail = path.rpartition(".")
+        suffix = "." + tail
+    return suffix.lower() in _SKIP_EXTENSIONS
+
+
+def _extract_firmware(info: dict) -> list[dict]:
+    """Collect the firmware entries declared in a parsed .info dict.
+
+    Reads ``firmware_count`` and then ``firmware<i>_path`` / ``_desc`` /
+    ``_opt`` for each index. Entries with no path, directory references and
+    native libraries are left out. An unparsable count yields an empty list.
+
+    Returns:
+        Dicts with ``path``, ``desc`` and a boolean ``optional``.
+    """
+    try:
+        total = int(info.get("firmware_count", "0"))
+    except ValueError:
+        return []
+
+    entries = []
+    for idx in range(total):
+        path = info.get(f"firmware{idx}_path", "")
+        desc = info.get(f"firmware{idx}_desc", "")
+        opt = info.get(f"firmware{idx}_opt", "false")
+
+        if not path or _is_directory_ref(path, desc) or _is_native_lib(path):
+            continue
+
+        entries.append({
+            "path": path,
+            "desc": desc,
+            "optional": opt.lower() == "true",
+        })
+
+    return entries
+
+
+def _extract_md5_from_notes(info: dict) -> dict[str, str]:
+    """Extract MD5 hashes from the notes field.
+
+    Looks for ``(!) <filename> (md5): <32 hex digits>`` entries. The filename
+    may not contain ``|``, which separates entries in the notes field, so an
+    entry without an MD5 cannot swallow the one after it. Digits are accepted
+    in either case and returned lower-cased.
+
+    Returns:
+        Mapping of filename to lower-case MD5; empty if there are no notes.
+    """
+    notes = info.get("notes", "")
+    pattern = r'\(!\)\s+([^|]+?)\s+\(md5\):\s+([a-fA-F0-9]{32})'
+    return {
+        match.group(1).strip(): match.group(2).lower()
+        for match in re.finditer(pattern, notes)
+    }
+
+
+class Scraper(BaseScraper):
+    """Scraper for libretro-core-info firmware declarations."""
+
+    # Basenames used by more than one firmware set. Entries with these names
+    # keep their full path as name to avoid SGB1.sfc/program.rom vs
+    # SGB2.sfc/program.rom collisions.
+    _GENERIC_NAMES = frozenset(
+        {"program.rom", "data.rom", "boot.rom", "bios.bin", "firmware.bin"}
+    )
+
+    def __init__(self):
+        self._info_files: dict[str, dict] | None = None
+
+    def _fetch_info_list(self) -> list[str]:
+        """Fetch list of all .info files from GitHub API.
+
+        Raises:
+            ConnectionError: If the listing cannot be downloaded or decoded.
+        """
+        # Use the tree API to get all files at once
+        url = f"{GITHUB_API}/git/trees/master?recursive=1"
+        try:
+            req = urllib.request.Request(url, headers={
+                "User-Agent": "retrobios-scraper/1.0",
+                "Accept": "application/vnd.github.v3+json",
+            })
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                data = json.loads(resp.read())
+
+            return [
+                item["path"] for item in data.get("tree", [])
+                if item["path"].endswith("_libretro.info")
+            ]
+        except (OSError, ValueError) as e:
+            # OSError covers URLError and read timeouts; ValueError covers
+            # JSON and Unicode decoding failures.
+            raise ConnectionError(f"Failed to list core-info files: {e}") from e
+
+    def _fetch_info_file(self, filename: str) -> dict:
+        """Fetch and parse a single .info file.
+
+        Best-effort: a file that cannot be downloaded or decoded yields an
+        empty dict so that one bad file does not abort the whole scrape.
+        """
+        url = f"{RAW_BASE}/{filename}"
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": "retrobios-scraper/1.0"})
+            with urllib.request.urlopen(req, timeout=15) as resp:
+                content = resp.read().decode("utf-8")
+        except (OSError, UnicodeDecodeError):
+            return {}
+        return _parse_info_file(content)
+
+    def fetch_requirements(self) -> list[BiosRequirement]:
+        """Fetch firmware requirements from all core .info files.
+
+        Each firmware path is reported once, attributed to the first core
+        (in listing order) that declares it.
+
+        Raises:
+            ConnectionError: If the list of .info files cannot be fetched.
+        """
+        requirements = []
+        seen = set()
+
+        for filename in self._fetch_info_list():
+            info = self._fetch_info_file(filename)
+            firmware_list = _extract_firmware(info)
+
+            if not firmware_list:
+                continue
+
+            core_name = filename.replace("_libretro.info", "")
+            system = CORE_SYSTEM_MAP.get(core_name, core_name)
+
+            md5_map = _extract_md5_from_notes(info)
+
+            for fw in firmware_list:
+                path = fw["path"]
+                if path in seen:
+                    continue
+                seen.add(path)
+
+                basename = path.rsplit("/", 1)[-1]
+                generic = basename.lower() in self._GENERIC_NAMES
+
+                requirements.append(BiosRequirement(
+                    name=path if generic else basename,
+                    system=system,
+                    md5=md5_map.get(basename),
+                    destination=path,
+                    required=not fw["optional"],
+                ))
+
+        return requirements
+
+    def validate_format(self, raw_data: str) -> bool:
+        """Validate .info file format."""
+        return "firmware_count" in raw_data or "display_name" in raw_data
+
+    def fetch_metadata(self) -> dict:
+        """Fetch version info from GitHub.
+
+        Returns:
+            ``{"version": <latest release tag>}``, with "" when unavailable.
+        """
+        version = fetch_github_latest_version("libretro/libretro-core-info")
+        return {"version": version or ""}
+
+
+def main():
+    """CLI entry point.
+
+    Scrapes all firmware requirements, then either compares them against a
+    database JSON file (``--compare-db``) or prints a per-system summary.
+    Exits with status 1 if the scrape fails with a ConnectionError.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Scrape libretro-core-info firmware requirements")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--compare-db", help="Compare against database.json")
+    args = parser.parse_args()
+
+    scraper = Scraper()
+
+    try:
+        reqs = scraper.fetch_requirements()
+    except ConnectionError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    if args.compare_db:
+        import json as _json
+        # JSON text is UTF-8; do not rely on the locale's default encoding.
+        with open(args.compare_db, encoding="utf-8") as f:
+            db = _json.load(f)
+
+        found = 0
+        missing = []
+        for r in reqs:
+            # A requirement counts as present when the database knows either
+            # its name or its MD5.
+            if r.name in db["indexes"]["by_name"]:
+                found += 1
+            elif r.md5 and r.md5 in db["indexes"]["by_md5"]:
+                found += 1
+            else:
+                missing.append(r)
+
+        print(f"Core-info: {len(reqs)} unique firmware paths")
+        print(f"Found in DB: {found}")
+        print(f"Missing: {len(missing)}")
+        if missing:
+            print("\nMissing files:")
+            for r in sorted(missing, key=lambda x: x.system):
+                opt = "(optional)" if not r.required else "(REQUIRED)"
+                print(f"  {r.system}: {r.destination} {opt}")
+        return
+
+    from collections import defaultdict
+    by_system = defaultdict(list)
+    for r in reqs:
+        by_system[r.system].append(r)
+
+    print(f"Total: {len(reqs)} unique firmware paths across {len(by_system)} systems")
+    for sys_name, files in sorted(by_system.items()):
+        req_count = sum(1 for f in files if f.required)
+        opt_count = len(files) - req_count
+        print(f"  {sys_name}: {req_count} required, {opt_count} optional")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/scraper/dat_parser.py
+++ b/scripts/scraper/dat_parser.py
@ -0,0 +1,167 @@
+"""Parser for clrmamepro DAT format.
+
+Parses files like libretro's System.dat which uses the format:
+    game (
+        name "System"
+        comment "Platform Name"
+        rom ( name filename size 12345 crc ABCD1234 md5 ... sha1 ... )
+    )
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+
+@dataclass
+class DatRom:
+    """A ROM entry from a DAT file.
+
+    Instances are created by ``_parse_rom_line`` from one ``rom ( ... )`` line.
+    """
+    name: str  # Value of the "name" field; entries without one are never created
+    size: int  # Value of the "size" field; 0 when absent or not an integer
+    crc32: str  # Value of the "crc" field, lower-cased; "" when absent
+    md5: str  # Value of the "md5" field; "" when absent
+    sha1: str  # Value of the "sha1" field; "" when absent
+    system: str  # From the preceding comment line
+
+
+@dataclass
+class DatMetadata:
+    """Metadata from a DAT file header.
+
+    ``parse_dat_metadata`` fills each attribute from the header line that
+    starts with the same word; attributes with no such line keep "".
+    """
+    name: str = ""
+    version: str = ""
+    description: str = ""
+    author: str = ""
+    homepage: str = ""
+    url: str = ""
+
+
+def parse_dat(content: str) -> list[DatRom]:
+    """Collect every ROM entry found in clrmamepro DAT text.
+
+    The text is scanned line by line. A ``comment`` line sets the system
+    label attached to the ``rom`` lines that follow it, except for the two
+    generic header comments, which are ignored. Each ``rom (`` / ``rom(``
+    line is handed to ``_parse_rom_line``; lines it rejects are dropped.
+
+    Supported name forms:
+    - Quoted filenames with spaces: name "7800 BIOS (U).rom"
+    - Path filenames: name "pcsx2/bios/file.bin"
+    - Unquoted filenames: name cpc464.rom
+    - Leading tabs or spaces (every line is stripped first)
+    """
+    comment_prefix = "comment "
+    generic_comments = ("System", "System, firmware, and BIOS files used by libretro cores.")
+    entries = []
+    system_label = ""
+
+    for raw_line in content.split("\n"):
+        text = raw_line.strip()
+
+        if text.startswith(comment_prefix):
+            label = text[len(comment_prefix):].strip().strip('"')
+            if label not in generic_comments:
+                system_label = label
+            continue
+
+        if not text.startswith(("rom (", "rom(")):
+            continue
+
+        parsed = _parse_rom_line(text, system_label)
+        if parsed:
+            entries.append(parsed)
+
+    return entries
+
+
+def parse_dat_metadata(content: str) -> DatMetadata:
+    """Read the ``clrmamepro`` header block into a DatMetadata.
+
+    Scanning starts after the first line beginning with ``clrmamepro`` and
+    stops at the first line that is exactly ``)``. Inside that range, a line
+    starting with one of the known field names followed by a space sets the
+    attribute of the same name (surrounding quotes removed). Without a
+    header, every attribute keeps its default.
+    """
+    known_fields = ("name", "version", "description", "author", "homepage", "url")
+    result = DatMetadata()
+    remaining = iter(content.split("\n"))
+
+    # Consume lines up to and including the one that opens the header.
+    for raw_line in remaining:
+        if raw_line.strip().startswith("clrmamepro"):
+            break
+
+    # Whatever is left begins inside the header (or is empty if none was found).
+    for raw_line in remaining:
+        text = raw_line.strip()
+        if text == ")":
+            break
+        for key in known_fields:
+            if text.startswith(f"{key} "):
+                setattr(result, key, text[len(key) + 1:].strip().strip('"'))
+
+    return result
+
+
+def _parse_rom_line(line: str, system: str) -> DatRom | None:
+    """Parse a single ``rom ( ... )`` line into a DatRom.
+
+    Args:
+        line: The stripped DAT line, e.g.
+            ``rom ( name "a (E).rom" size 16 crc ABCD1234 md5 ... sha1 ... )``.
+        system: System label to attach to the entry.
+
+    Returns:
+        The parsed entry, or None when the line has no ``( ... )`` body or
+        no non-empty ``name`` field. A missing or non-integer ``size``
+        becomes 0; missing hashes become "".
+
+    All three hashes are lower-cased. Hex digests are case-insensitive, and
+    normalising them here keeps upper-case digests in a DAT comparable with
+    lower-case ones (``crc`` was already handled this way).
+    """
+    # rfind because filenames may contain parentheses like "(E).rom"
+    start = line.find("(")
+    end = line.rfind(")")
+    # Also covers end == -1, since start is at least 0 at that point.
+    if start == -1 or end <= start:
+        return None
+
+    tokens = _tokenize(line[start + 1:end].strip())
+    # Tokens alternate key, value; a trailing key without a value is ignored
+    # and a repeated key keeps its last value.
+    fields = dict(zip(tokens[::2], tokens[1::2]))
+
+    name = fields.get("name", "")
+    if not name:
+        return None
+
+    try:
+        size = int(fields.get("size", "0"))
+    except ValueError:
+        size = 0
+
+    return DatRom(
+        name=name,
+        size=size,
+        crc32=fields.get("crc", "").lower(),
+        md5=fields.get("md5", "").lower(),
+        sha1=fields.get("sha1", "").lower(),
+        system=system,
+    )
+
+
+def _tokenize(content: str) -> list[str]:
+    """Split DAT field text into tokens.
+
+    Tokens are separated by spaces and tabs. A token that starts with a
+    double quote runs to the next double quote (or to the end of the text if
+    the quote is never closed) and is returned without the quotes, so it may
+    contain spaces or be empty. Any other token runs to the next space or tab.
+    """
+    separators = " \t"
+    length = len(content)
+    found = []
+    pos = 0
+
+    while True:
+        # Step over the separators in front of the next token.
+        while pos < length and content[pos] in separators:
+            pos += 1
+        if pos >= length:
+            return found
+
+        if content[pos] == '"':
+            closing = content.find('"', pos + 1)
+            if closing == -1:
+                # Unterminated quote: take everything that is left.
+                closing = length
+            found.append(content[pos + 1:closing])
+            pos = closing + 1
+        else:
+            stop = pos
+            while stop < length and content[stop] not in separators:
+                stop += 1
+            found.append(content[pos:stop])
+            pos = stop
+
+
+def validate_dat_format(content: str) -> bool:
+    """Tell whether text looks like a clrmamepro DAT file.
+
+    All of the following must hold:
+    - ``clrmamepro`` occurs within the first 500 characters
+    - a ``game (`` block opener occurs somewhere
+    - a ``rom (`` or ``rom(`` entry occurs somewhere
+    - a quoted ``comment "`` occurs somewhere
+    """
+    if "clrmamepro" not in content[:500]:
+        return False
+    if "game (" not in content:
+        return False
+    if "rom (" not in content and "rom(" not in content:
+        return False
+    return 'comment "' in content
--- a/scripts/scraper/libretro_scraper.py
+++ b/scripts/scraper/libretro_scraper.py
@ -0,0 +1,323 @@
+#!/usr/bin/env python3
+"""Scraper for libretro System.dat (RetroArch, Lakka).
+
+Source: https://github.com/libretro/libretro-database/blob/master/dat/System.dat
+Format: clrmamepro DAT
+Hash: SHA1 primary
+"""
+
+from __future__ import annotations
+
+import sys
+import urllib.request
+import urllib.error
+
+from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_version
+from .dat_parser import parse_dat, parse_dat_metadata, validate_dat_format
+
+# Key under which the plugin discovery registers this module's Scraper class.
+PLATFORM_NAME = "libretro"
+
+# Raw download location of System.dat; used as the default Scraper URL.
+SOURCE_URL = (
+    "https://raw.githubusercontent.com/libretro/libretro-database/"
+    "master/dat/System.dat"
+)
+
+# Maps the system label found on System.dat comment lines to the system slug
+# stored in each requirement. Labels missing from this table fall back to the
+# label lower-cased with spaces turned into hyphens (see fetch_requirements).
+SYSTEM_SLUG_MAP = {
+    "3DO Company, The - 3DO": "3do",
+    "Amstrad - CPC": "amstrad-cpc",
+    "Arcade": "arcade",
+    "Atari - 400-800": "atari-400-800",
+    "Atari - 5200": "atari-5200",
+    "Atari - 7800": "atari-7800",
+    "Atari - Lynx": "atari-lynx",
+    "Atari - ST": "atari-st",
+    "Coleco - ColecoVision": "coleco-colecovision",
+    "Commodore - Amiga": "commodore-amiga",
+    "Commodore - C128": "commodore-c128",
+    "Dinothawr": "dinothawr",
+    "DOS": "dos",
+    "EPOCH/YENO Super Cassette Vision": "epoch-scv",
+    "Elektronika - BK-0010/BK-0011(M)": "elektronika-bk",
+    "Enterprise - 64/128": "enterprise-64-128",
+    "Fairchild Channel F": "fairchild-channel-f",
+    "Id Software - Doom": "doom",
+    "J2ME": "j2me",
+    "MacII": "apple-macintosh-ii",
+    "Magnavox - Odyssey2": "magnavox-odyssey2",
+    "Mattel - Intellivision": "mattel-intellivision",
+    "Microsoft - MSX": "microsoft-msx",
+    "NEC - PC Engine - TurboGrafx 16 - SuperGrafx": "nec-pc-engine",
+    "NEC - PC-98": "nec-pc-98",
+    "NEC - PC-FX": "nec-pc-fx",
+    "Nintendo - Famicom Disk System": "nintendo-fds",
+    "Nintendo - Game Boy Advance": "nintendo-gba",
+    "Nintendo - GameCube": "nintendo-gamecube",
+    "Nintendo - Gameboy": "nintendo-gb",
+    "Nintendo - Gameboy Color": "nintendo-gbc",
+    "Nintendo - Nintendo 64DD": "nintendo-64dd",
+    "Nintendo - Nintendo DS": "nintendo-ds",
+    "Nintendo - Nintendo Entertainment System": "nintendo-nes",
+    "Nintendo - Pokemon Mini": "nintendo-pokemon-mini",
+    "Nintendo - Satellaview": "nintendo-satellaview",
+    "Nintendo - SuFami Turbo": "nintendo-sufami-turbo",
+    "Nintendo - Super Game Boy": "nintendo-sgb",
+    "Nintendo - Super Nintendo Entertainment System": "nintendo-snes",
+    # Spelling of the key mirrors the label text it has to match.
+    "Phillips - Videopac+": "philips-videopac",
+    "SNK - NeoGeo CD": "snk-neogeo-cd",
+    "ScummVM": "scummvm",
+    "Sega - Dreamcast": "sega-dreamcast",
+    "Sega - Dreamcast-based Arcade": "sega-dreamcast-arcade",
+    "Sega - Game Gear": "sega-game-gear",
+    "Sega - Master System - Mark III": "sega-master-system",
+    "Sega - Mega CD - Sega CD": "sega-mega-cd",
+    "Sega - Mega Drive - Genesis": "sega-mega-drive",
+    "Sega - Saturn": "sega-saturn",
+    "Sharp - X1": "sharp-x1",
+    "Sharp - X68000": "sharp-x68000",
+    "Sinclair - ZX Spectrum": "sinclair-zx-spectrum",
+    "Sony - PlayStation": "sony-playstation",
+    "Sony - PlayStation 2": "sony-playstation-2",
+    "Sony - PlayStation Portable": "sony-psp",
+    "Texas Instruments TI-83": "ti-83",
+    "Videoton - TV Computer": "videoton-tvc",
+    "Wolfenstein 3D": "wolfenstein-3d",
+}
+
+
+class Scraper(BaseScraper):
+    """Scraper for libretro System.dat.
+
+    The DAT text is downloaded at most once per instance and cached. It is
+    turned into ``BiosRequirement`` objects by ``fetch_requirements`` and
+    into a platform config dict by ``generate_platform_yaml``.
+    """
+
+    def __init__(self, url: str = SOURCE_URL):
+        """Remember the DAT URL; nothing is downloaded until first use."""
+        self.url = url
+        self._raw_data: str | None = None
+
+    def _fetch_raw(self) -> str:
+        """Fetch raw DAT content from source URL.
+
+        Repeated calls return the text cached by the first successful one.
+
+        Raises:
+            ConnectionError: if the request fails with ``URLError``.
+        """
+        if self._raw_data is not None:
+            return self._raw_data
+
+        try:
+            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                self._raw_data = resp.read().decode("utf-8")
+                return self._raw_data
+        except urllib.error.URLError as e:
+            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
+
+    def fetch_requirements(self) -> list[BiosRequirement]:
+        """Parse System.dat and return BIOS requirements.
+
+        Every entry is reported with ``required=True``. ``destination``
+        keeps the path exactly as written in the DAT, while ``name`` is its
+        last path component. Empty hashes and a zero size become None.
+
+        Raises:
+            ConnectionError: if the DAT cannot be downloaded.
+            ValueError: if the content fails ``validate_format``.
+        """
+        raw = self._fetch_raw()
+
+        if not self.validate_format(raw):
+            raise ValueError("System.dat format validation failed")
+
+        requirements = []
+        for rom in parse_dat(raw):
+            # Labels without an explicit mapping become a lower-cased,
+            # hyphenated slug.
+            system_slug = SYSTEM_SLUG_MAP.get(rom.system, rom.system.lower().replace(" ", "-"))
+
+            requirements.append(BiosRequirement(
+                name=rom.name.rsplit("/", 1)[-1],
+                system=system_slug,
+                sha1=rom.sha1 or None,
+                md5=rom.md5 or None,
+                crc32=rom.crc32 or None,
+                size=rom.size or None,
+                destination=rom.name,
+                required=True,
+            ))
+
+        return requirements
+
+    def validate_format(self, raw_data: str) -> bool:
+        """Validate System.dat format."""
+        return validate_dat_format(raw_data)
+
+    def fetch_metadata(self) -> dict:
+        """Fetch version info from System.dat header and GitHub API.
+
+        Returns a dict with ``dat_version`` (the header's version field) and
+        the values reported by ``fetch_github_latest_version`` for RetroArch
+        and libretro-database.
+        """
+        raw = self._fetch_raw()
+        meta = parse_dat_metadata(raw)
+
+        retroarch_version = fetch_github_latest_version("libretro/RetroArch")
+        db_version = fetch_github_latest_version("libretro/libretro-database")
+
+        return {
+            "dat_version": meta.version,
+            "retroarch_version": retroarch_version,
+            "db_version": db_version,
+        }
+
+    def _fetch_core_metadata(self) -> dict[str, dict]:
+        """Fetch per-core metadata from libretro-core-info .info files.
+
+        Returns a dict keyed by system slug. For each slug the entry comes
+        from the first listed core that is mapped to it in
+        ``CORE_SYSTEM_MAP`` and declares a non-zero ``firmware_count``.
+
+        This is best effort and never raises. A single ``.info`` file that
+        cannot be downloaded, decoded or parsed is skipped on its own; any
+        other failure ends the scan, is reported on stderr, and whatever was
+        collected so far is returned.
+        """
+        import json
+
+        metadata: dict[str, dict] = {}
+        try:
+            # Imported here rather than at module level, as before.
+            from .coreinfo_scraper import CORE_SYSTEM_MAP
+
+            url = "https://api.github.com/repos/libretro/libretro-core-info/git/trees/master?recursive=1"
+            req = urllib.request.Request(url, headers={
+                "User-Agent": "retrobios-scraper/1.0",
+                "Accept": "application/vnd.github.v3+json",
+            })
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                tree = json.loads(resp.read())
+
+            info_files = [
+                item["path"] for item in tree.get("tree", [])
+                if item["path"].endswith("_libretro.info")
+            ]
+
+            for filename in info_files:
+                core_name = filename.replace("_libretro.info", "")
+
+                # Decide before downloading: cores without a mapping are
+                # never recorded, and the first entry kept for a system is
+                # never replaced, so fetching those files is wasted traffic.
+                system_slug = CORE_SYSTEM_MAP.get(core_name)
+                if not system_slug or system_slug in metadata:
+                    continue
+
+                try:
+                    info_url = f"https://raw.githubusercontent.com/libretro/libretro-core-info/master/{filename}"
+                    req = urllib.request.Request(info_url, headers={"User-Agent": "retrobios-scraper/1.0"})
+                    with urllib.request.urlopen(req, timeout=10) as resp:
+                        content = resp.read().decode("utf-8")
+
+                    info = {}
+                    for line in content.split("\n"):
+                        line = line.strip()
+                        if " = " in line:
+                            key, _, value = line.partition(" = ")
+                            info[key.strip()] = value.strip().strip('"')
+
+                    fw_count = int(info.get("firmware_count", "0"))
+                except (OSError, ValueError):
+                    # OSError covers URLError/HTTPError and timeouts;
+                    # ValueError covers undecodable bytes and a non-integer
+                    # firmware_count. Only this core is skipped.
+                    continue
+
+                if fw_count == 0:
+                    continue
+
+                metadata[system_slug] = {
+                    "core": core_name,
+                    "manufacturer": info.get("manufacturer", ""),
+                    "display_name": info.get("display_name", "") or info.get("systemname", ""),
+                    "docs": f"https://docs.libretro.com/library/{core_name}/",
+                }
+        except Exception as e:
+            # Deliberately broad: this metadata is optional, so a failure
+            # here must not break config generation. Report it instead of
+            # hiding it.
+            print(f"Warning: core-info metadata incomplete: {e}", file=sys.stderr)
+
+        return metadata
+
+    def generate_platform_yaml(self) -> dict:
+        """Generate a platform YAML config dict, merging System.dat with core-info metadata.
+
+        Files are grouped per system slug. When core-info metadata exists
+        for a slug, its non-empty ``core``, ``manufacturer`` and ``docs``
+        values are copied onto the system entry.
+
+        Raises:
+            ConnectionError, ValueError: propagated from ``fetch_requirements``.
+        """
+        requirements = self.fetch_requirements()
+        metadata = self.fetch_metadata()
+        core_meta = self._fetch_core_metadata()
+
+        systems = {}
+        for req in requirements:
+            if req.system not in systems:
+                system_entry = {"files": []}
+                if req.system in core_meta:
+                    cm = core_meta[req.system]
+                    if cm.get("core"):
+                        system_entry["core"] = cm["core"]
+                    if cm.get("manufacturer"):
+                        system_entry["manufacturer"] = cm["manufacturer"]
+                    if cm.get("docs"):
+                        system_entry["docs"] = cm["docs"]
+                systems[req.system] = system_entry
+
+            entry = {
+                "name": req.name,
+                "destination": req.destination,
+                "required": req.required,
+            }
+            # Optional fields are written only when they carry a value.
+            if req.sha1:
+                entry["sha1"] = req.sha1
+            if req.md5:
+                entry["md5"] = req.md5
+            if req.crc32:
+                entry["crc32"] = req.crc32
+            if req.size:
+                entry["size"] = req.size
+
+            systems[req.system]["files"].append(entry)
+
+        return {
+            "platform": "RetroArch",
+            "version": metadata["retroarch_version"] or "",
+            "dat_version": metadata["dat_version"] or "",
+            "homepage": "https://www.retroarch.com",
+            "source": "https://github.com/libretro/libretro-database/blob/master/dat/System.dat",
+            "base_destination": "system",
+            "hash_type": "sha1",
+            "verification_mode": "existence",
+            "systems": systems,
+        }
+
+
+def main():
+    """CLI entry point for testing.
+
+    Modes, checked in this order:
+    - ``--dry-run``: list the scraped files per system.
+    - ``--json``: print the platform config as JSON.
+    - ``--output FILE``: write the platform config as YAML (needs PyYAML).
+    - otherwise: print a one-line summary.
+
+    Exits with status 1 when scraping fails or PyYAML is missing.
+    """
+    import argparse
+    import json
+
+    parser = argparse.ArgumentParser(description="Scrape libretro System.dat")
+    parser.add_argument("--dry-run", action="store_true", help="Just show what would be scraped")
+    parser.add_argument("--output", "-o", help="Output YAML file")
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
+    args = parser.parse_args()
+
+    scraper = Scraper()
+
+    try:
+        reqs = scraper.fetch_requirements()
+    except (ConnectionError, ValueError) as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Grouped once; both the dry-run listing and the summary need it.
+    by_system = {}
+    for req in reqs:
+        by_system.setdefault(req.system, []).append(req)
+
+    if args.dry_run:
+        for system, files in sorted(by_system.items()):
+            print(f"\n{system} ({len(files)} files):")
+            for f in files:
+                hash_info = f.sha1[:12] if f.sha1 else f.md5[:12] if f.md5 else "no-hash"
+                print(f"  {f.name} ({f.size or '?'} bytes, {hash_info}...)")
+
+        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
+        return
+
+    if args.json:
+        config = scraper.generate_platform_yaml()
+        print(json.dumps(config, indent=2))
+        return
+
+    if args.output:
+        try:
+            import yaml
+        except ImportError:
+            print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
+            sys.exit(1)
+
+        config = scraper.generate_platform_yaml()
+        # allow_unicode=True emits non-ASCII characters verbatim, so the
+        # file must not be opened with a locale-dependent encoding.
+        with open(args.output, "w", encoding="utf-8") as f:
+            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
+        print(f"Written to {args.output}")
+    else:
+        # The requirements fetched above are reused; no second fetch.
+        print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
+
+
+# Invoke the CLI when this module is run as the main program.
+if __name__ == "__main__":
+    main()
--- a/scripts/scraper/recalbox_scraper.py
+++ b/scripts/scraper/recalbox_scraper.py
@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""Scraper for Recalbox BIOS requirements.
+
+Source: https://gitlab.com/recalbox/recalbox/-/raw/master/board/recalbox/fsoverlay/recalbox/share_init/system/.emulationstation/es_bios.xml
+Format: XML (es_bios.xml)
+Hash: MD5 (multiple valid hashes per entry, comma-separated)
+
+Recalbox verification logic:
+- Checks MD5 of file on disk against list of valid hashes
+- Multiple MD5s accepted per BIOS (different ROM revisions)
+- Alternate file paths (pipe-separated)
+- hashMatchMandatory flag: if false, wrong hash = warning (YELLOW) not error (RED)
+- ZIP files get composite MD5 calculation
+"""
+
+from __future__ import annotations
+
+import sys
+import urllib.request
+import urllib.error
+import xml.etree.ElementTree as ET
+
+from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_tag
+
+# Key under which the plugin discovery registers this module's Scraper class.
+PLATFORM_NAME = "recalbox"
+
+# Raw download location of es_bios.xml; used as the default Scraper URL and
+# reported as the "source" of the generated platform config.
+SOURCE_URL = (
+    "https://gitlab.com/recalbox/recalbox/-/raw/master/"
+    "board/recalbox/fsoverlay/recalbox/share_init/system/"
+    ".emulationstation/es_bios.xml"
+)
+
+# Maps the "platform" attribute of <system> elements in es_bios.xml to the
+# system slug stored in each requirement. Several platforms may share one
+# slug; platforms missing from this table are used unchanged.
+SYSTEM_SLUG_MAP = {
+    "3do": "3do",
+    "amiga600": "commodore-amiga",
+    "amiga1200": "commodore-amiga",
+    "amigacd32": "commodore-amiga",
+    "amigacdtv": "commodore-amiga",
+    "amstradcpc": "amstrad-cpc",
+    "atari800": "atari-400-800",
+    "atari5200": "atari-5200",
+    "atari7800": "atari-7800",
+    "atarilynx": "atari-lynx",
+    "atarist": "atari-st",
+    "c64": "commodore-c64",
+    "channelf": "fairchild-channel-f",
+    "colecovision": "coleco-colecovision",
+    "dreamcast": "sega-dreamcast",
+    "fds": "nintendo-fds",
+    "gamecube": "nintendo-gamecube",
+    "gamegear": "sega-game-gear",
+    "gb": "nintendo-gb",
+    "gba": "nintendo-gba",
+    "gbc": "nintendo-gbc",
+    "intellivision": "mattel-intellivision",
+    "jaguar": "atari-jaguar",
+    "mastersystem": "sega-master-system",
+    "megadrive": "sega-mega-drive",
+    "msx": "microsoft-msx",
+    "msx1": "microsoft-msx",
+    "msx2": "microsoft-msx",
+    "n64": "nintendo-64",
+    "naomi": "sega-dreamcast-arcade",
+    "naomigd": "sega-dreamcast-arcade",
+    "atomiswave": "sega-dreamcast-arcade",
+    "nds": "nintendo-ds",
+    "neogeo": "snk-neogeo",
+    "neogeocd": "snk-neogeo-cd",
+    "o2em": "magnavox-odyssey2",
+    "pcengine": "nec-pc-engine",
+    "pcenginecd": "nec-pc-engine",
+    "pcfx": "nec-pc-fx",
+    "ps2": "sony-playstation-2",
+    "psx": "sony-playstation",
+    "saturn": "sega-saturn",
+    "scummvm": "scummvm",
+    "segacd": "sega-mega-cd",
+    "snes": "nintendo-snes",
+    "supergrafx": "nec-pc-engine",
+    "x68000": "sharp-x68000",
+    "zxspectrum": "sinclair-zx-spectrum",
+}
+
+
+class Scraper(BaseScraper):
+    """Scraper for Recalbox es_bios.xml.
+
+    The XML text is downloaded at most once per instance and cached.
+    ``fetch_requirements`` returns one ``BiosRequirement`` per distinct
+    primary path; ``fetch_full_requirements`` returns every entry with all
+    Recalbox-specific fields.
+    """
+
+    def __init__(self, url: str = SOURCE_URL):
+        """Remember the XML URL; nothing is downloaded until first use."""
+        self.url = url
+        self._raw_data: str | None = None
+
+    def _fetch_raw(self) -> str:
+        """Download es_bios.xml, or return the text cached by an earlier call.
+
+        Raises:
+            ConnectionError: if the request fails with ``URLError``.
+        """
+        if self._raw_data is not None:
+            return self._raw_data
+
+        try:
+            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                self._raw_data = resp.read().decode("utf-8")
+                return self._raw_data
+        except urllib.error.URLError as e:
+            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
+
+    @staticmethod
+    def _parse_xml(raw: str) -> ET.Element:
+        """Parse XML text, reporting malformed input as ValueError.
+
+        ``ET.ParseError`` is not a ``ValueError``, so without this
+        conversion a broken download would bypass callers that handle
+        ``(ConnectionError, ValueError)``.
+        """
+        try:
+            return ET.fromstring(raw)
+        except ET.ParseError as e:
+            raise ValueError(f"es_bios.xml is not well-formed XML: {e}") from e
+
+    @staticmethod
+    def _iter_bios_entries(root: ET.Element):
+        """Yield one dict per ``<bios>`` element that has at least one path.
+
+        ``path`` is split on ``|`` and ``md5`` on ``,``; blank pieces are
+        dropped. ``mandatory`` and ``hashMatchMandatory`` are true unless
+        the attribute is exactly ``"false"``.
+        """
+        for system_elem in root.findall(".//system"):
+            platform = system_elem.get("platform", "")
+            system_name = system_elem.get("name", platform)
+            system_slug = SYSTEM_SLUG_MAP.get(platform, platform)
+
+            for bios_elem in system_elem.findall("bios"):
+                paths = [p.strip() for p in bios_elem.get("path", "").split("|") if p.strip()]
+                if not paths:
+                    continue
+
+                yield {
+                    "name": paths[0].rsplit("/", 1)[-1],
+                    "system": system_slug,
+                    "system_name": system_name,
+                    "paths": paths,
+                    "md5_list": [m.strip() for m in bios_elem.get("md5", "").split(",") if m.strip()],
+                    "core": bios_elem.get("core", ""),
+                    "mandatory": bios_elem.get("mandatory", "true") != "false",
+                    "hash_match_mandatory": bios_elem.get("hashMatchMandatory", "true") != "false",
+                    "note": bios_elem.get("note", ""),
+                }
+
+    def fetch_requirements(self) -> list[BiosRequirement]:
+        """Parse es_bios.xml and return BIOS requirements.
+
+        Only the first alternate path of an entry is used, as both
+        ``destination`` and (its last component) ``name``. All accepted MD5
+        values are joined with commas, or None when there are none. An
+        entry whose primary path was already seen, in any system, is
+        skipped.
+
+        Raises:
+            ConnectionError: if the XML cannot be downloaded.
+            ValueError: if the content fails ``validate_format`` or is not
+                well-formed XML.
+        """
+        raw = self._fetch_raw()
+
+        if not self.validate_format(raw):
+            raise ValueError("es_bios.xml format validation failed")
+
+        requirements = []
+        seen = set()
+
+        for item in self._iter_bios_entries(self._parse_xml(raw)):
+            primary_path = item["paths"][0]
+            if primary_path in seen:
+                continue
+            seen.add(primary_path)
+
+            requirements.append(BiosRequirement(
+                name=item["name"],
+                system=item["system"],
+                md5=",".join(item["md5_list"]) or None,
+                destination=primary_path,
+                required=item["mandatory"],
+            ))
+
+        return requirements
+
+    def fetch_full_requirements(self) -> list[dict]:
+        """Parse es_bios.xml preserving all Recalbox-specific fields.
+
+        Unlike ``fetch_requirements`` this keeps every entry (no
+        de-duplication) and every alternate path. Each dict has the keys
+        ``name``, ``system``, ``system_name``, ``paths``, ``md5_list``,
+        ``core``, ``mandatory``, ``hash_match_mandatory`` and ``note``.
+
+        Raises:
+            ConnectionError: if the XML cannot be downloaded.
+            ValueError: if the content is not well-formed XML.
+        """
+        raw = self._fetch_raw()
+        return list(self._iter_bios_entries(self._parse_xml(raw)))
+
+    def validate_format(self, raw_data: str) -> bool:
+        """Validate es_bios.xml format."""
+        return "<biosList" in raw_data and "<system" in raw_data and "<bios" in raw_data
+
+    def generate_platform_yaml(self) -> dict:
+        """Generate a platform YAML config dict from scraped data.
+
+        Files are grouped per system slug. The version comes from
+        ``fetch_github_latest_tag`` and falls back to ``"10.0"`` when that
+        yields nothing.
+
+        Raises:
+            ConnectionError, ValueError: propagated from ``fetch_requirements``.
+        """
+        requirements = self.fetch_requirements()
+
+        systems = {}
+        for req in requirements:
+            entry = {
+                "name": req.name,
+                "destination": req.destination,
+                "required": req.required,
+            }
+            if req.md5:
+                entry["md5"] = req.md5
+
+            systems.setdefault(req.system, {"files": []})["files"].append(entry)
+
+        version = fetch_github_latest_tag("recalbox/recalbox", prefix="") or ""
+        # Recalbox uses GitLab - GitHub API may not resolve
+        if not version:
+            version = "10.0"
+
+        return {
+            "platform": "Recalbox",
+            "version": version,
+            "homepage": "https://www.recalbox.com",
+            "source": SOURCE_URL,
+            "base_destination": "bios",
+            "hash_type": "md5",
+            "verification_mode": "md5",
+            "systems": systems,
+        }
+
+
+def main():
+    """CLI entry point.
+
+    Modes, checked in this order:
+    - ``--full``: print the first five full entries as JSON plus a total.
+    - ``--dry-run``: list up to five files per system.
+    - ``--json``: print the platform config as JSON.
+    - ``--output FILE``: write the platform config as YAML (needs PyYAML).
+    - otherwise: print a one-line summary.
+
+    Exits with status 1 when scraping fails or PyYAML is missing.
+    """
+    import argparse
+    import json
+
+    parser = argparse.ArgumentParser(description="Scrape Recalbox es_bios.xml")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--json", action="store_true")
+    parser.add_argument("--full", action="store_true", help="Show full Recalbox-specific fields")
+    parser.add_argument("--output", "-o")
+    args = parser.parse_args()
+
+    scraper = Scraper()
+
+    try:
+        if args.full:
+            reqs = scraper.fetch_full_requirements()
+            print(json.dumps(reqs[:5], indent=2))
+            print(f"\nTotal: {len(reqs)} BIOS entries")
+            return
+        reqs = scraper.fetch_requirements()
+    except (ConnectionError, ValueError) as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    if args.dry_run:
+        from collections import defaultdict
+        by_system = defaultdict(list)
+        for r in reqs:
+            by_system[r.system].append(r)
+        for sys_name, files in sorted(by_system.items()):
+            print(f"\n{sys_name} ({len(files)} files):")
+            for f in files[:5]:
+                print(f"  {f.name} (md5={f.md5[:12] if f.md5 else 'N/A'}...)")
+            if len(files) > 5:
+                print(f"  ... +{len(files)-5} more")
+        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
+        return
+
+    if args.json:
+        config = scraper.generate_platform_yaml()
+        print(json.dumps(config, indent=2))
+        return
+
+    if args.output:
+        # The option used to be parsed and then ignored; honour it by
+        # writing the platform config as YAML.
+        try:
+            import yaml
+        except ImportError:
+            print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
+            sys.exit(1)
+
+        config = scraper.generate_platform_yaml()
+        with open(args.output, "w", encoding="utf-8") as f:
+            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
+        print(f"Written to {args.output}")
+        return
+
+    # The requirements fetched above are reused; no second fetch.
+    system_count = len({r.system for r in reqs})
+    print(f"Scraped {len(reqs)} BIOS files across {system_count} systems")
+
+
+# Invoke the CLI when this module is run as the main program.
+if __name__ == "__main__":
+    main()