mirror of
https://github.com/Abdess/retrobios.git
synced 2026-06-28 13:22:48 +00:00
v2: automated BIOS platform with full pipeline
Reorganized 6 branches into bios/Manufacturer/Console/. Scrapers for RetroArch, Batocera, Recalbox, and libretro core-info. Platform-aware verification replicating native logic per platform. Pack generation with dedup, alias resolution, variant support. CI/CD: weekly auto-scrape, auto-release, PR validation. Large files (>50MB) stored as GitHub Release assets, auto-fetched at build time.
This commit is contained in:
parent
5f96368f6d
commit
13c561888d
7038 changed files with 3243612 additions and 29617 deletions
46
scripts/scraper/__init__.py
Normal file
46
scripts/scraper/__init__.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
"""Scraper plugin discovery module.
|
||||
|
||||
Auto-detects *_scraper.py files and exposes their scrapers.
|
||||
Each scraper module must define:
|
||||
PLATFORM_NAME: str
|
||||
Scraper: class inheriting BaseScraper
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import pkgutil
|
||||
from pathlib import Path
|
||||
|
||||
from .base_scraper import BaseScraper
|
||||
|
||||
_scrapers: dict[str, type] = {}
|
||||
|
||||
|
||||
def discover_scrapers() -> dict[str, type]:
    """Auto-discover all *_scraper.py modules and return {platform_name: ScraperClass}.

    Every module in this package whose name ends with ``_scraper`` is imported.
    A module is registered when it exposes a truthy ``PLATFORM_NAME`` and a
    ``Scraper`` class deriving from ``BaseScraper``; anything else is skipped.

    The result is cached in the module-level ``_scrapers`` dict: once it is
    non-empty, later calls return it without scanning again.
    """
    if _scrapers:
        return _scrapers

    package_dir = Path(__file__).parent

    for _finder, name, _ispkg in pkgutil.iter_modules([str(package_dir)]):
        if not name.endswith("_scraper"):
            continue

        module = importlib.import_module(f".{name}", package=__package__)

        platform_name = getattr(module, "PLATFORM_NAME", None)
        scraper_class = getattr(module, "Scraper", None)

        # issubclass() raises TypeError when its first argument is not a class,
        # so make sure ``Scraper`` really is one before asking.
        if (
            platform_name
            and isinstance(scraper_class, type)
            and issubclass(scraper_class, BaseScraper)
        ):
            _scrapers[platform_name] = scraper_class

    return _scrapers
|
||||
|
||||
|
||||
def get_scraper(platform_name: str) -> BaseScraper | None:
    """Return a freshly instantiated scraper for *platform_name*.

    Returns None when discovery found no scraper registered under that name.
    """
    scraper_cls = discover_scrapers().get(platform_name)
    if not scraper_cls:
        return None
    return scraper_cls()
|
||||
155
scripts/scraper/base_scraper.py
Normal file
155
scripts/scraper/base_scraper.py
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
"""Base scraper interface for platform BIOS requirement sources."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class BiosRequirement:
    """One BIOS file that a platform source says it needs."""

    # File name of the BIOS file.
    name: str
    # Identifier of the system the file belongs to.
    system: str
    # Hashes and size reported by the source; None when the source gives none.
    sha1: str | None = None
    md5: str | None = None
    crc32: str | None = None
    size: int | None = None
    # Destination path for the file; empty when the scraper sets none.
    destination: str = ""
    # Whether the source treats the file as required (defaults to True).
    required: bool = True
    # If set, md5 is for this ROM inside the ZIP
    zipped_file: str | None = None
|
||||
|
||||
|
||||
@dataclass
class ChangeSet:
    """Result of diffing scraped requirements against the current config."""

    # Requirements present in the scrape but not in the config.
    added: list[BiosRequirement] = field(default_factory=list)
    # Requirements present in the config but not in the scrape.
    removed: list[BiosRequirement] = field(default_factory=list)
    # (config version, scraped version) pairs whose hashes differ.
    modified: list[tuple[BiosRequirement, BiosRequirement]] = field(default_factory=list)

    @property
    def has_changes(self) -> bool:
        """True when at least one of the three lists is non-empty."""
        return any((self.added, self.removed, self.modified))

    def summary(self) -> str:
        """Return a short description such as ``+2 added, ~1 modified``."""
        candidates = (
            (self.added, f"+{len(self.added)} added"),
            (self.removed, f"-{len(self.removed)} removed"),
            (self.modified, f"~{len(self.modified)} modified"),
        )
        parts = [text for bucket, text in candidates if bucket]
        if not parts:
            return "no changes"
        return ", ".join(parts)
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """Abstract base class for platform BIOS requirement scrapers.

    Subclasses supply ``fetch_requirements`` and ``validate_format``; the
    config comparison and the connectivity probe are implemented here.
    """

    @abstractmethod
    def fetch_requirements(self) -> list[BiosRequirement]:
        """Fetch current BIOS requirements from the platform source."""
        ...

    def compare_with_config(self, config: dict) -> ChangeSet:
        """Diff freshly scraped requirements against an existing platform config.

        Entries are matched on ``(system id, file name)``. A matched entry is
        reported as modified when both sides carry a sha1 and they differ, or
        otherwise when both sides carry an md5 and they differ.
        """
        scraped = self.fetch_requirements()
        changes = ChangeSet()

        # Index the config's file entries by (system id, file name).
        existing = {}
        for sys_id, system in config.get("systems", {}).items():
            for entry in system.get("files", []):
                existing[(sys_id, entry["name"])] = entry

        scraped_map = {(req.system, req.name): req for req in scraped}

        for key, req in scraped_map.items():
            if key not in existing:
                changes.added.append(req)
                continue

            current = existing[key]
            old_sha1 = current.get("sha1")
            old_md5 = current.get("md5")

            if req.sha1 and old_sha1 and req.sha1 != old_sha1:
                previous = BiosRequirement(
                    name=current["name"],
                    system=key[0],
                    sha1=old_sha1,
                    md5=old_md5,
                )
                changes.modified.append((previous, req))
            elif req.md5 and old_md5 and req.md5 != old_md5:
                previous = BiosRequirement(
                    name=current["name"],
                    system=key[0],
                    md5=old_md5,
                )
                changes.modified.append((previous, req))

        # Whatever the config lists but the scrape no longer does was removed.
        changes.removed.extend(
            BiosRequirement(
                name=entry["name"],
                system=key[0],
                sha1=entry.get("sha1"),
                md5=entry.get("md5"),
            )
            for key, entry in existing.items()
            if key not in scraped_map
        )

        return changes

    def test_connection(self) -> bool:
        """Return True when ``fetch_requirements`` completes without raising."""
        try:
            self.fetch_requirements()
        except Exception:
            return False
        return True

    @abstractmethod
    def validate_format(self, raw_data: str) -> bool:
        """Validate source data format. Returns False if format has changed unexpectedly."""
        ...
|
||||
|
||||
|
||||
def fetch_github_latest_version(repo: str) -> str | None:
    """Fetch the latest release version tag from a GitHub repo.

    Args:
        repo: Repository in ``owner/name`` form.

    Returns:
        The ``tag_name`` of the latest release ("" when the response carries
        no ``tag_name``), or None when the request fails or the response is
        not a JSON object.
    """
    url = f"https://api.github.com/repos/{repo}/releases/latest"
    request = urllib.request.Request(url, headers={
        "User-Agent": "retrobios-scraper/1.0",
        "Accept": "application/vnd.github.v3+json",
    })
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            data = json.loads(resp.read())
    except (OSError, ValueError):
        # OSError covers URLError/HTTPError as well as socket timeouts raised
        # while reading the body; ValueError covers JSON and decoding errors.
        return None

    if not isinstance(data, dict):
        return None
    return data.get("tag_name", "")
|
||||
|
||||
|
||||
def fetch_github_latest_tag(repo: str, prefix: str = "") -> str | None:
    """Fetch the most recent matching tag from a GitHub repo.

    Looks at the first 50 tags returned by the GitHub API, in API order.

    Args:
        repo: Repository in ``owner/name`` form.
        prefix: When non-empty, the first tag starting with it is preferred.

    Returns:
        The first tag name starting with *prefix*. If no tag matches, the
        first tag in the list is returned as a fallback. None when there are
        no tags, the request fails, or the response is not a list of tags.
    """
    url = f"https://api.github.com/repos/{repo}/tags?per_page=50"
    request = urllib.request.Request(url, headers={
        "User-Agent": "retrobios-scraper/1.0",
        "Accept": "application/vnd.github.v3+json",
    })
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            tags = json.loads(resp.read())
    except (OSError, ValueError):
        # OSError covers URLError/HTTPError as well as socket timeouts raised
        # while reading the body; ValueError covers JSON and decoding errors.
        return None

    # An error payload is a JSON object rather than a list of tags.
    if not isinstance(tags, list):
        return None

    try:
        for tag in tags:
            name = tag["name"]
            if not prefix or name.startswith(prefix):
                return name
        return tags[0]["name"] if tags else None
    except (KeyError, TypeError, AttributeError):
        # Entries that are not {"name": str, ...} mean an unexpected format.
        return None
|
||||
315
scripts/scraper/batocera_scraper.py
Normal file
315
scripts/scraper/batocera_scraper.py
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Scraper for Batocera batocera-systems.
|
||||
|
||||
Source: https://github.com/batocera-linux/batocera.linux/.../batocera-systems
|
||||
Format: Python dict with systems -> biosFiles
|
||||
Hash: MD5 primary
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_tag
|
||||
|
||||
PLATFORM_NAME = "batocera"
|
||||
|
||||
SOURCE_URL = (
|
||||
"https://raw.githubusercontent.com/batocera-linux/batocera.linux/"
|
||||
"master/package/batocera/core/batocera-scripts/scripts/batocera-systems"
|
||||
)
|
||||
|
||||
SYSTEM_SLUG_MAP = {
|
||||
"atari800": "atari-400-800",
|
||||
"atari5200": "atari-5200",
|
||||
"atarist": "atari-st",
|
||||
"lynx": "atari-lynx",
|
||||
"3do": "3do",
|
||||
"amiga": "commodore-amiga",
|
||||
"amiga600": "commodore-amiga",
|
||||
"amiga1200": "commodore-amiga",
|
||||
"amigacd32": "commodore-amiga",
|
||||
"amigacdtv": "commodore-amiga",
|
||||
"c128": "commodore-c128",
|
||||
"colecovision": "coleco-colecovision",
|
||||
"dreamcast": "sega-dreamcast",
|
||||
"naomi": "sega-dreamcast-arcade",
|
||||
"naomi2": "sega-dreamcast-arcade",
|
||||
"atomiswave": "sega-dreamcast-arcade",
|
||||
"fds": "nintendo-fds",
|
||||
"gamecube": "nintendo-gamecube",
|
||||
"gb": "nintendo-gb",
|
||||
"gba": "nintendo-gba",
|
||||
"gbc": "nintendo-gbc",
|
||||
"nds": "nintendo-ds",
|
||||
"n64dd": "nintendo-64dd",
|
||||
"satellaview": "nintendo-satellaview",
|
||||
"sgb": "nintendo-sgb",
|
||||
"snes": "nintendo-snes",
|
||||
"channelf": "fairchild-channel-f",
|
||||
"intellivision": "mattel-intellivision",
|
||||
"msx": "microsoft-msx",
|
||||
"msx1": "microsoft-msx",
|
||||
"msx2": "microsoft-msx",
|
||||
"msxturbor": "microsoft-msx",
|
||||
"neogeo": "snk-neogeo",
|
||||
"neogeocd": "snk-neogeo-cd",
|
||||
"odyssey2": "magnavox-odyssey2",
|
||||
"pcengine": "nec-pc-engine",
|
||||
"pcenginecd": "nec-pc-engine",
|
||||
"supergrafx": "nec-pc-engine",
|
||||
"pc88": "nec-pc-88",
|
||||
"pc98": "nec-pc-98",
|
||||
"pcfx": "nec-pc-fx",
|
||||
"psx": "sony-playstation",
|
||||
"ps2": "sony-playstation-2",
|
||||
"psp": "sony-psp",
|
||||
"saturn": "sega-saturn",
|
||||
"segacd": "sega-mega-cd",
|
||||
"mastersystem": "sega-master-system",
|
||||
"megadrive": "sega-mega-drive",
|
||||
"gamegear": "sega-game-gear",
|
||||
"x1": "sharp-x1",
|
||||
"x68000": "sharp-x68000",
|
||||
"zxspectrum": "sinclair-zx-spectrum",
|
||||
"scummvm": "scummvm",
|
||||
"doom": "doom",
|
||||
"macintosh": "apple-macintosh-ii",
|
||||
"dos": "dos",
|
||||
"videopac": "philips-videopac",
|
||||
"pokemini": "nintendo-pokemon-mini",
|
||||
}
|
||||
|
||||
|
||||
class Scraper(BaseScraper):
    """Scraper for batocera-systems Python dict.

    Downloads the batocera-systems script, extracts its top-level
    ``systems = {...}`` literal and turns each ``biosFiles`` entry into a
    BiosRequirement.
    """

    def __init__(self, url: str = SOURCE_URL):
        self.url = url
        # Decoded source text, cached by the first successful _fetch_raw().
        self._raw_data: str | None = None

    def _fetch_raw(self) -> str:
        """Return the source text, downloading it on first use.

        Raises:
            ConnectionError: if the URL cannot be fetched.
        """
        if self._raw_data is not None:
            return self._raw_data

        try:
            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
            with urllib.request.urlopen(req, timeout=30) as resp:
                self._raw_data = resp.read().decode("utf-8")
            return self._raw_data
        except urllib.error.URLError as e:
            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e

    @staticmethod
    def _scan_dict_literal(raw: str, start: int) -> str:
        """Return the brace-balanced literal starting at ``raw[start]``, comments removed.

        A single pass tracks single- and double-quoted strings (including
        backslash escapes), so ``#``, ``{`` and ``}`` inside a string are kept
        as data rather than treated as syntax. Triple-quoted strings are not
        supported.

        Raises:
            ValueError: if the braces never balance.
        """
        out = []
        depth = 0
        quote = None
        i = start
        n = len(raw)
        while i < n:
            ch = raw[i]
            if quote:
                out.append(ch)
                if ch == "\\" and i + 1 < n:
                    # Copy the escaped character verbatim so an escaped quote
                    # does not end the string.
                    out.append(raw[i + 1])
                    i += 1
                elif ch == quote:
                    quote = None
            elif ch in ('"', "'"):
                quote = ch
                out.append(ch)
            elif ch == "#":
                # Drop the comment but keep the newline that ends it.
                while i < n and raw[i] != "\n":
                    i += 1
                continue
            else:
                out.append(ch)
                if ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                    if depth == 0:
                        return "".join(out)
            i += 1
        raise ValueError("Failed to parse systems dict: unbalanced braces")

    def _extract_systems_dict(self, raw: str) -> dict:
        """Extract and parse the 'systems' dict from the Python source via ast.literal_eval.

        Only literal content is supported: ast.literal_eval rejects function
        calls, so the dict must be built purely from Python literals.

        Raises:
            ValueError: if the assignment is missing or the dict cannot be parsed.
        """
        match = re.search(r'^systems\s*=\s*\{', raw, re.MULTILINE)
        if not match:
            raise ValueError("Could not find 'systems = {' in batocera-systems")

        # The pattern ends on the opening brace, so it sits just before match.end().
        literal = self._scan_dict_literal(raw, match.end() - 1)

        try:
            return ast.literal_eval(literal)
        except (SyntaxError, ValueError) as e:
            raise ValueError(f"Failed to parse systems dict: {e}") from e

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Parse batocera-systems and return BIOS requirements.

        Raises:
            ConnectionError: if the source cannot be downloaded.
            ValueError: if the source fails validation or cannot be parsed.
        """
        raw = self._fetch_raw()

        if not self.validate_format(raw):
            raise ValueError("batocera-systems format validation failed")

        systems = self._extract_systems_dict(raw)
        requirements = []

        for sys_key, sys_data in systems.items():
            # Unknown system keys are passed through unchanged.
            system_slug = SYSTEM_SLUG_MAP.get(sys_key, sys_key)

            for bios in sys_data.get("biosFiles", []):
                file_path = bios.get("file", "")
                if file_path.startswith("bios/"):
                    file_path = file_path[len("bios/"):]

                requirements.append(BiosRequirement(
                    name=file_path.rsplit("/", 1)[-1],
                    system=system_slug,
                    md5=bios.get("md5", "") or None,
                    destination=file_path,
                    required=True,
                    zipped_file=bios.get("zippedFile", "") or None,
                ))

        return requirements

    def validate_format(self, raw_data: str) -> bool:
        """Validate batocera-systems format.

        Checks for the ``systems = {`` assignment and for the ``biosFiles``,
        ``"md5"`` and ``"file"`` markers the parser relies on.
        """
        has_systems = "systems" in raw_data and "biosFiles" in raw_data
        has_dict = re.search(r'^systems\s*=\s*\{', raw_data, re.MULTILINE) is not None
        has_md5 = '"md5"' in raw_data
        has_file = '"file"' in raw_data
        return has_systems and has_dict and has_md5 and has_file

    @staticmethod
    def _fetch_latest_version() -> str:
        """Return the highest numeric ``batocera-<N>`` tag as a string, or "" if unavailable."""
        # Kept local: the visible top-level imports of this module do not include json.
        import json as _json

        url = "https://api.github.com/repos/batocera-linux/batocera.linux/tags?per_page=50"
        request = urllib.request.Request(url, headers={
            "User-Agent": "retrobios-scraper/1.0",
            "Accept": "application/vnd.github.v3+json",
        })
        try:
            with urllib.request.urlopen(request, timeout=15) as resp:
                tags = _json.loads(resp.read())
            versions = []
            for tag in tags:
                name = tag["name"]
                if name.startswith("batocera-"):
                    number = name.replace("batocera-", "")
                    if number.isdigit():
                        versions.append(int(number))
        except Exception:
            # Deliberately best-effort: the version is informational only, so
            # any failure here must not abort config generation.
            return ""
        # Sort numerically since API returns by commit date, not version
        return str(max(versions)) if versions else ""

    def generate_platform_yaml(self) -> dict:
        """Generate a platform YAML config dict from scraped data.

        Requirements are grouped per system under ``systems[<slug>]["files"]``;
        ``md5`` and ``zipped_file`` keys are only written when set.
        """
        requirements = self.fetch_requirements()

        systems = {}
        for req in requirements:
            entry = {
                "name": req.name,
                "destination": req.destination,
                "required": req.required,
            }
            if req.md5:
                entry["md5"] = req.md5
            if req.zipped_file:
                entry["zipped_file"] = req.zipped_file

            systems.setdefault(req.system, {"files": []})["files"].append(entry)

        return {
            "platform": "Batocera",
            "version": self._fetch_latest_version(),
            "homepage": "https://batocera.org",
            "source": SOURCE_URL,
            "base_destination": "bios",
            "hash_type": "md5",
            "verification_mode": "md5",
            "systems": systems,
        }
|
||||
|
||||
|
||||
def main():
    """CLI entry point for testing.

    Options:
        --dry-run  print every scraped file grouped by system
        --json     print the generated platform config as JSON
        --output   write the generated platform config as YAML (needs PyYAML)

    With no option, a one-line summary is printed. Exits with status 1 when
    scraping fails or PyYAML is missing for --output.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Scrape batocera-systems")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--output", "-o")
    args = parser.parse_args()

    scraper = Scraper()

    try:
        reqs = scraper.fetch_requirements()
    except (ConnectionError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    by_system = {}
    for req in reqs:
        by_system.setdefault(req.system, []).append(req)

    if args.dry_run:
        for system, files in sorted(by_system.items()):
            print(f"\n{system} ({len(files)} files):")
            for f in files:
                hash_info = f.md5[:12] if f.md5 else "no-hash"
                print(f"  {f.name} ({hash_info}...)")

        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
        return

    if args.json:
        config = scraper.generate_platform_yaml()
        print(json.dumps(config, indent=2))
        return

    if args.output:
        try:
            import yaml
        except ImportError:
            print("Error: PyYAML required", file=sys.stderr)
            sys.exit(1)

        config = scraper.generate_platform_yaml()
        # allow_unicode=True emits non-ASCII characters as-is, so the file must
        # be opened as UTF-8 instead of the locale's default encoding.
        with open(args.output, "w", encoding="utf-8") as f:
            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f"Written to {args.output}")
    else:
        # The requirements fetched above are reused; no second scrape is needed.
        print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
334
scripts/scraper/coreinfo_scraper.py
Normal file
334
scripts/scraper/coreinfo_scraper.py
Normal file
|
|
@ -0,0 +1,334 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Scraper for libretro-core-info firmware declarations.
|
||||
|
||||
Source: https://github.com/libretro/libretro-core-info
|
||||
Format: .info files with firmware0_path, firmware0_desc, firmware0_opt patterns
|
||||
Hash: From notes field (MD5) or cross-referenced with System.dat
|
||||
|
||||
Complements libretro_scraper (System.dat) with:
|
||||
- Exact firmware paths per core
|
||||
- Required vs optional status
|
||||
- Firmware for cores not covered by System.dat
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import json
|
||||
|
||||
try:
|
||||
from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_version
|
||||
except ImportError:
|
||||
# Allow running directly: python scripts/scraper/coreinfo_scraper.py
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from scraper.base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_version
|
||||
|
||||
PLATFORM_NAME = "libretro_coreinfo"
|
||||
|
||||
GITHUB_API = "https://api.github.com/repos/libretro/libretro-core-info"
|
||||
RAW_BASE = "https://raw.githubusercontent.com/libretro/libretro-core-info/master"
|
||||
|
||||
CORE_SYSTEM_MAP = {
|
||||
"pcsx_rearmed": "sony-playstation",
|
||||
"mednafen_psx": "sony-playstation",
|
||||
"mednafen_psx_hw": "sony-playstation",
|
||||
"swanstation": "sony-playstation",
|
||||
"duckstation": "sony-playstation",
|
||||
"pcsx1": "sony-playstation",
|
||||
"lrps2": "sony-playstation-2",
|
||||
"play": "sony-playstation-2",
|
||||
"ppsspp": "sony-psp",
|
||||
"fbneo": "arcade",
|
||||
"mame": "arcade",
|
||||
"mame2003": "arcade",
|
||||
"mame2003_plus": "arcade",
|
||||
"dolphin": "nintendo-gamecube",
|
||||
"melonds": "nintendo-ds",
|
||||
"melonds_ds": "nintendo-ds",
|
||||
"desmume": "nintendo-ds",
|
||||
"mgba": "nintendo-gba",
|
||||
"vba_next": "nintendo-gba",
|
||||
"gpsp": "nintendo-gba",
|
||||
"gambatte": "nintendo-gb",
|
||||
"sameboy": "nintendo-gb",
|
||||
"gearboy": "nintendo-gb",
|
||||
"bsnes": "nintendo-snes",
|
||||
"snes9x": "nintendo-snes",
|
||||
"higan_sfc": "nintendo-snes",
|
||||
"mesen-s": "nintendo-snes",
|
||||
"nestopia": "nintendo-nes",
|
||||
"fceumm": "nintendo-nes",
|
||||
"mesen": "nintendo-nes",
|
||||
"mupen64plus_next": "nintendo-64",
|
||||
"parallel_n64": "nintendo-64",
|
||||
"flycast": "sega-dreamcast",
|
||||
"reicast": "sega-dreamcast",
|
||||
"kronos": "sega-saturn",
|
||||
"mednafen_saturn": "sega-saturn",
|
||||
"yabause": "sega-saturn",
|
||||
"genesis_plus_gx": "sega-mega-drive",
|
||||
"picodrive": "sega-mega-drive",
|
||||
"mednafen_pce": "nec-pc-engine",
|
||||
"mednafen_pce_fast": "nec-pc-engine",
|
||||
"mednafen_pcfx": "nec-pc-fx",
|
||||
"mednafen_ngp": "snk-neogeo-pocket",
|
||||
"mednafen_lynx": "atari-lynx",
|
||||
"handy": "atari-lynx",
|
||||
"hatari": "atari-st",
|
||||
"puae": "commodore-amiga",
|
||||
"fuse": "sinclair-zx-spectrum",
|
||||
"dosbox_pure": "dos",
|
||||
"dosbox_svn": "dos",
|
||||
"scummvm": "scummvm",
|
||||
"opera": "3do",
|
||||
"4do": "3do",
|
||||
"ep128emu": "enterprise-64-128",
|
||||
"freej2me": "j2me",
|
||||
"squirreljme": "j2me",
|
||||
"numero": "ti-83",
|
||||
"neocd": "snk-neogeo-cd",
|
||||
"vice_x64": "commodore-c64",
|
||||
"vice_x128": "commodore-c128",
|
||||
"cap32": "amstrad-cpc",
|
||||
"o2em": "magnavox-odyssey2",
|
||||
"vecx": "vectrex",
|
||||
"virtualjaguar": "atari-jaguar",
|
||||
"prosystem": "atari-7800",
|
||||
"stella": "atari-2600",
|
||||
"a5200": "atari-5200",
|
||||
"bluemsx": "microsoft-msx",
|
||||
"fmsx": "microsoft-msx",
|
||||
"px68k": "sharp-x68000",
|
||||
"x1": "sharp-x1",
|
||||
"quasi88": "nec-pc-88",
|
||||
"np2kai": "nec-pc-98",
|
||||
"theodore": "thomson",
|
||||
"81": "sinclair-zx81",
|
||||
"crocods": "amstrad-cpc",
|
||||
"dinothawr": "dinothawr",
|
||||
}
|
||||
|
||||
|
||||
def _parse_info_file(content: str) -> dict:
|
||||
"""Parse a .info file into a dictionary."""
|
||||
result = {}
|
||||
for line in content.split("\n"):
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
match = re.match(r'^(\w+)\s*=\s*"?(.*?)"?\s*$', line)
|
||||
if match:
|
||||
key, value = match.group(1), match.group(2)
|
||||
result[key] = value
|
||||
return result
|
||||
|
||||
|
||||
_SKIP_EXTENSIONS = {".dll", ".so", ".dylib", ".exe", ".bat", ".sh"}
|
||||
_DIRECTORY_MARKERS = {"folder", "directory", "dir"}
|
||||
|
||||
|
||||
def _is_directory_ref(path: str, desc: str) -> bool:
|
||||
"""Check if a firmware entry is a directory reference rather than a file."""
|
||||
if "." not in path.split("/")[-1]:
|
||||
return True
|
||||
desc_lower = desc.lower()
|
||||
return any(marker in desc_lower for marker in _DIRECTORY_MARKERS)
|
||||
|
||||
|
||||
def _is_native_lib(path: str) -> bool:
|
||||
"""Check if path is a native library (.dll, .so, .dylib) - not a BIOS."""
|
||||
ext = "." + path.rsplit(".", 1)[-1] if "." in path else ""
|
||||
return ext.lower() in _SKIP_EXTENSIONS
|
||||
|
||||
|
||||
def _extract_firmware(info: dict) -> list[dict]:
|
||||
"""Extract firmware entries, filtering out directories and native libraries."""
|
||||
count_str = info.get("firmware_count", "0")
|
||||
try:
|
||||
count = int(count_str)
|
||||
except ValueError:
|
||||
return []
|
||||
|
||||
firmware = []
|
||||
for i in range(count):
|
||||
path = info.get(f"firmware{i}_path", "")
|
||||
desc = info.get(f"firmware{i}_desc", "")
|
||||
opt = info.get(f"firmware{i}_opt", "false")
|
||||
|
||||
if not path:
|
||||
continue
|
||||
|
||||
if _is_directory_ref(path, desc):
|
||||
continue
|
||||
|
||||
if _is_native_lib(path):
|
||||
continue
|
||||
|
||||
firmware.append({
|
||||
"path": path,
|
||||
"desc": desc,
|
||||
"optional": opt.lower() == "true",
|
||||
})
|
||||
|
||||
return firmware
|
||||
|
||||
|
||||
def _extract_md5_from_notes(info: dict) -> dict[str, str]:
|
||||
"""Extract MD5 hashes from the notes field."""
|
||||
notes = info.get("notes", "")
|
||||
md5_map = {}
|
||||
|
||||
for match in re.finditer(r'\(!\)\s+(.+?)\s+\(md5\):\s+([a-f0-9]{32})', notes):
|
||||
filename = match.group(1).strip()
|
||||
md5 = match.group(2)
|
||||
md5_map[filename] = md5
|
||||
|
||||
return md5_map
|
||||
|
||||
|
||||
class Scraper(BaseScraper):
    """Scraper for libretro-core-info firmware declarations."""

    # Basenames used by more than one firmware set; entries with such a
    # basename are named by their full path so they do not collide
    # (e.g. SGB1.sfc/program.rom vs SGB2.sfc/program.rom).
    _GENERIC_NAMES = frozenset(
        {"program.rom", "data.rom", "boot.rom", "bios.bin", "firmware.bin"}
    )

    def __init__(self):
        self._info_files: dict[str, dict] | None = None

    def _fetch_info_list(self) -> list[str]:
        """Fetch list of all .info files from GitHub API.

        Returns the repository paths ending in ``_libretro.info``.

        Raises:
            ConnectionError: if the tree listing cannot be fetched or decoded.
        """
        # Use the tree API to get all files at once
        url = f"{GITHUB_API}/git/trees/master?recursive=1"
        request = urllib.request.Request(url, headers={
            "User-Agent": "retrobios-scraper/1.0",
            "Accept": "application/vnd.github.v3+json",
        })
        try:
            with urllib.request.urlopen(request, timeout=30) as resp:
                data = json.loads(resp.read())
        except (OSError, ValueError) as e:
            # OSError covers URLError/HTTPError and read timeouts; ValueError
            # covers JSON and decoding errors.
            raise ConnectionError(f"Failed to list core-info files: {e}") from e

        return [
            item["path"] for item in data.get("tree", [])
            if item["path"].endswith("_libretro.info")
        ]

    def _fetch_info_file(self, filename: str) -> dict:
        """Fetch and parse a single .info file.

        Best-effort: returns ``{}`` when the file cannot be downloaded or
        decoded, so one bad file does not abort the whole scrape.
        """
        url = f"{RAW_BASE}/{filename}"
        request = urllib.request.Request(url, headers={"User-Agent": "retrobios-scraper/1.0"})
        try:
            with urllib.request.urlopen(request, timeout=15) as resp:
                content = resp.read().decode("utf-8")
        except (OSError, UnicodeDecodeError):
            # OSError also covers timeouts raised while reading the body,
            # which urllib does not wrap in URLError.
            return {}
        return _parse_info_file(content)

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Fetch firmware requirements from all core .info files.

        Each firmware path is reported once: the first core (in listing order)
        that declares it decides the system, md5 and required flag.

        Raises:
            ConnectionError: if the list of .info files cannot be fetched.
        """
        info_files = self._fetch_info_list()
        requirements = []
        seen = set()

        for filename in info_files:
            info = self._fetch_info_file(filename)
            firmware_list = _extract_firmware(info)
            if not firmware_list:
                continue

            core_name = filename.replace("_libretro.info", "")
            # Cores missing from the map use their own name as the system.
            system = CORE_SYSTEM_MAP.get(core_name, core_name)
            md5_map = _extract_md5_from_notes(info)

            for fw in firmware_list:
                path = fw["path"]
                if path in seen:
                    continue
                seen.add(path)

                basename = path.rsplit("/", 1)[-1]
                name = path if basename.lower() in self._GENERIC_NAMES else basename

                requirements.append(BiosRequirement(
                    name=name,
                    system=system,
                    md5=md5_map.get(basename),
                    destination=path,
                    required=not fw["optional"],
                ))

        return requirements

    def validate_format(self, raw_data: str) -> bool:
        """Validate .info file format.

        Accepts data mentioning ``firmware_count`` or ``display_name``.
        """
        return "firmware_count" in raw_data or "display_name" in raw_data

    def fetch_metadata(self) -> dict:
        """Fetch version info from GitHub.

        Returns ``{"version": <latest release tag or "">}``.
        """
        version = fetch_github_latest_version("libretro/libretro-core-info")
        return {"version": version or ""}
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Scrapes all core-info firmware requirements. With ``--compare-db`` the
    result is checked against the name and md5 indexes of a database JSON
    file; otherwise a per-system count of required and optional files is
    printed. Exits with status 1 when scraping fails.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Scrape libretro-core-info firmware requirements")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--compare-db", help="Compare against database.json")
    args = parser.parse_args()

    scraper = Scraper()

    try:
        reqs = scraper.fetch_requirements()
    except ConnectionError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    if args.compare_db:
        import json as _json

        with open(args.compare_db) as f:
            db = _json.load(f)

        def _known(req):
            # A requirement counts as present when its name or its md5 is indexed.
            indexes = db["indexes"]
            if req.name in indexes["by_name"]:
                return True
            return bool(req.md5 and req.md5 in indexes["by_md5"])

        missing = [r for r in reqs if not _known(r)]
        found = len(reqs) - len(missing)

        print(f"Core-info: {len(reqs)} unique firmware paths")
        print(f"Found in DB: {found}")
        print(f"Missing: {len(missing)}")
        if missing:
            print("\nMissing files:")
            for r in sorted(missing, key=lambda x: x.system):
                if r.required:
                    opt = "(REQUIRED)"
                else:
                    opt = "(optional)"
                print(f"  {r.system}: {r.destination} {opt}")
        return

    by_system = {}
    for r in reqs:
        by_system.setdefault(r.system, []).append(r)

    print(f"Total: {len(reqs)} unique firmware paths across {len(by_system)} systems")
    for sys_name, files in sorted(by_system.items()):
        req_count = sum(1 for f in files if f.required)
        opt_count = len(files) - req_count
        print(f"  {sys_name}: {req_count} required, {opt_count} optional")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
167
scripts/scraper/dat_parser.py
Normal file
167
scripts/scraper/dat_parser.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
"""Parser for clrmamepro DAT format.
|
||||
|
||||
Parses files like libretro's System.dat which uses the format:
|
||||
game (
|
||||
name "System"
|
||||
comment "Platform Name"
|
||||
rom ( name filename size 12345 crc ABCD1234 md5 ... sha1 ... )
|
||||
)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class DatRom:
    """One ``rom ( ... )`` entry read from a DAT file."""

    # Value of the rom's "name" field.
    name: str
    # Value of the "size" field as an integer.
    size: int
    # Values of the "crc", "md5" and "sha1" fields.
    crc32: str
    md5: str
    sha1: str
    # From the preceding comment line
    system: str
|
||||
|
||||
|
||||
@dataclass
class DatMetadata:
    """Fields of a DAT file's ``clrmamepro ( ... )`` header; each defaults to ""."""

    name: str = ""
    version: str = ""
    description: str = ""
    author: str = ""
    homepage: str = ""
    url: str = ""
|
||||
|
||||
|
||||
def parse_dat(content: str) -> list[DatRom]:
    """Parse clrmamepro DAT text into a list of DatRom entries.

    Lines are examined one at a time after stripping surrounding whitespace,
    so tab or space indentation makes no difference. A ``comment`` line sets
    the system assigned to the ``rom`` lines that follow it, except for the
    two generic header comments. Each ``rom (`` / ``rom(`` line is handed to
    ``_parse_rom_line``.

    Handles:
    - Quoted filenames with spaces: name "7800 BIOS (U).rom"
    - Path filenames: name "pcsx2/bios/file.bin"
    - Unquoted filenames: name cpc464.rom
    """
    generic_comments = (
        "System",
        "System, firmware, and BIOS files used by libretro cores.",
    )
    roms = []
    system = ""

    for raw_line in content.split("\n"):
        text = raw_line.strip()

        if text.startswith("comment "):
            label = text[len("comment "):].strip().strip('"')
            if label not in generic_comments:
                system = label
            continue

        if not text.startswith(("rom (", "rom(")):
            continue

        entry = _parse_rom_line(text, system)
        if entry:
            roms.append(entry)

    return roms
|
||||
|
||||
|
||||
def parse_dat_metadata(content: str) -> DatMetadata:
    """Read the ``clrmamepro ( ... )`` header block into a DatMetadata.

    Scanning starts at the first line beginning with ``clrmamepro`` and stops
    at the first line that is exactly ``)``. Inside the block, a line starting
    with one of the known field names followed by a space sets that field,
    with surrounding double quotes removed.
    """
    header_fields = ("name", "version", "description", "author", "homepage", "url")
    meta = DatMetadata()
    inside = False

    for raw_line in content.split("\n"):
        text = raw_line.strip()

        if text.startswith("clrmamepro"):
            inside = True
            continue
        if not inside:
            continue
        if text == ")":
            break

        for key in header_fields:
            # A line can begin with at most one field name, so stop at the first hit.
            if text.startswith(f"{key} "):
                setattr(meta, key, text[len(key) + 1:].strip().strip('"'))
                break

    return meta
|
||||
|
||||
|
||||
def _parse_rom_line(line: str, system: str) -> DatRom | None:
|
||||
"""Parse a single rom ( ... ) line."""
|
||||
# rfind because filenames may contain parentheses like "(E).rom"
|
||||
start = line.find("(")
|
||||
end = line.rfind(")")
|
||||
if start == -1 or end == -1 or end <= start:
|
||||
return None
|
||||
|
||||
content = line[start + 1:end].strip()
|
||||
|
||||
fields = {}
|
||||
i = 0
|
||||
tokens = _tokenize(content)
|
||||
|
||||
while i < len(tokens) - 1:
|
||||
key = tokens[i]
|
||||
value = tokens[i + 1]
|
||||
fields[key] = value
|
||||
i += 2
|
||||
|
||||
name = fields.get("name", "")
|
||||
if not name:
|
||||
return None
|
||||
|
||||
try:
|
||||
size = int(fields.get("size", "0"))
|
||||
except ValueError:
|
||||
size = 0
|
||||
|
||||
return DatRom(
|
||||
name=name,
|
||||
size=size,
|
||||
crc32=fields.get("crc", "").lower(),
|
||||
md5=fields.get("md5", ""),
|
||||
sha1=fields.get("sha1", ""),
|
||||
system=system,
|
||||
)
|
||||
|
||||
|
||||
def _tokenize(content: str) -> list[str]:
|
||||
"""Tokenize DAT content, handling quoted strings."""
|
||||
tokens = []
|
||||
i = 0
|
||||
while i < len(content):
|
||||
while i < len(content) and content[i] in (" ", "\t"):
|
||||
i += 1
|
||||
if i >= len(content):
|
||||
break
|
||||
|
||||
if content[i] == '"':
|
||||
i += 1
|
||||
start = i
|
||||
while i < len(content) and content[i] != '"':
|
||||
i += 1
|
||||
tokens.append(content[start:i])
|
||||
i += 1
|
||||
else:
|
||||
start = i
|
||||
while i < len(content) and content[i] not in (" ", "\t"):
|
||||
i += 1
|
||||
tokens.append(content[start:i])
|
||||
|
||||
return tokens
|
||||
|
||||
|
||||
def validate_dat_format(content: str) -> bool:
    """Check that text looks like a clrmamepro DAT file.

    All of the following markers must be present:
    - ``clrmamepro`` within the first 500 characters (the header)
    - ``game (`` anywhere (a game block)
    - ``rom (`` or ``rom(`` anywhere (a rom entry)
    - ``comment "`` anywhere (a quoted comment)

    This is a substring check only; the text is not parsed.

    Args:
        content: Candidate DAT text.

    Returns:
        True when every marker is found, False otherwise.
    """
    if "clrmamepro" not in content[:500]:
        return False
    if "game (" not in content:
        return False
    if "rom (" not in content and "rom(" not in content:
        return False
    return 'comment "' in content
|
||||
323
scripts/scraper/libretro_scraper.py
Normal file
323
scripts/scraper/libretro_scraper.py
Normal file
|
|
@ -0,0 +1,323 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Scraper for libretro System.dat (RetroArch, Lakka).
|
||||
|
||||
Source: https://github.com/libretro/libretro-database/blob/master/dat/System.dat
|
||||
Format: clrmamepro DAT
|
||||
Hash: SHA1 primary
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_version
|
||||
from .dat_parser import parse_dat, parse_dat_metadata, validate_dat_format
|
||||
|
||||
# Platform key for this module; the package's discover_scrapers() registers
# the module's Scraper class under this name.
PLATFORM_NAME = "libretro"

# Raw download address of System.dat; used as the default `url` of Scraper.
SOURCE_URL = (
    "https://raw.githubusercontent.com/libretro/libretro-database/"
    "master/dat/System.dat"
)

# System label carried by a parsed DAT entry -> slug stored on the resulting
# BiosRequirement. Labels that are missing here are not rejected:
# Scraper.fetch_requirements falls back to the lowercased label with spaces
# replaced by hyphens.
SYSTEM_SLUG_MAP = {
    "3DO Company, The - 3DO": "3do",
    "Amstrad - CPC": "amstrad-cpc",
    "Arcade": "arcade",
    "Atari - 400-800": "atari-400-800",
    "Atari - 5200": "atari-5200",
    "Atari - 7800": "atari-7800",
    "Atari - Lynx": "atari-lynx",
    "Atari - ST": "atari-st",
    "Coleco - ColecoVision": "coleco-colecovision",
    "Commodore - Amiga": "commodore-amiga",
    "Commodore - C128": "commodore-c128",
    "Dinothawr": "dinothawr",
    "DOS": "dos",
    "EPOCH/YENO Super Cassette Vision": "epoch-scv",
    "Elektronika - BK-0010/BK-0011(M)": "elektronika-bk",
    "Enterprise - 64/128": "enterprise-64-128",
    "Fairchild Channel F": "fairchild-channel-f",
    "Id Software - Doom": "doom",
    "J2ME": "j2me",
    "MacII": "apple-macintosh-ii",
    "Magnavox - Odyssey2": "magnavox-odyssey2",
    "Mattel - Intellivision": "mattel-intellivision",
    "Microsoft - MSX": "microsoft-msx",
    "NEC - PC Engine - TurboGrafx 16 - SuperGrafx": "nec-pc-engine",
    "NEC - PC-98": "nec-pc-98",
    "NEC - PC-FX": "nec-pc-fx",
    "Nintendo - Famicom Disk System": "nintendo-fds",
    "Nintendo - Game Boy Advance": "nintendo-gba",
    "Nintendo - GameCube": "nintendo-gamecube",
    "Nintendo - Gameboy": "nintendo-gb",
    "Nintendo - Gameboy Color": "nintendo-gbc",
    "Nintendo - Nintendo 64DD": "nintendo-64dd",
    "Nintendo - Nintendo DS": "nintendo-ds",
    "Nintendo - Nintendo Entertainment System": "nintendo-nes",
    "Nintendo - Pokemon Mini": "nintendo-pokemon-mini",
    "Nintendo - Satellaview": "nintendo-satellaview",
    "Nintendo - SuFami Turbo": "nintendo-sufami-turbo",
    "Nintendo - Super Game Boy": "nintendo-sgb",
    "Nintendo - Super Nintendo Entertainment System": "nintendo-snes",
    # NOTE(review): the key is spelled "Phillips" while the slug uses
    # "philips"; presumably the key mirrors the label used upstream - confirm
    # before "correcting" it, since lookups are exact-match.
    "Phillips - Videopac+": "philips-videopac",
    "SNK - NeoGeo CD": "snk-neogeo-cd",
    "ScummVM": "scummvm",
    "Sega - Dreamcast": "sega-dreamcast",
    "Sega - Dreamcast-based Arcade": "sega-dreamcast-arcade",
    "Sega - Game Gear": "sega-game-gear",
    "Sega - Master System - Mark III": "sega-master-system",
    "Sega - Mega CD - Sega CD": "sega-mega-cd",
    "Sega - Mega Drive - Genesis": "sega-mega-drive",
    "Sega - Saturn": "sega-saturn",
    "Sharp - X1": "sharp-x1",
    "Sharp - X68000": "sharp-x68000",
    "Sinclair - ZX Spectrum": "sinclair-zx-spectrum",
    "Sony - PlayStation": "sony-playstation",
    "Sony - PlayStation 2": "sony-playstation-2",
    "Sony - PlayStation Portable": "sony-psp",
    "Texas Instruments TI-83": "ti-83",
    "Videoton - TV Computer": "videoton-tvc",
    "Wolfenstein 3D": "wolfenstein-3d",
}
|
||||
|
||||
|
||||
class Scraper(BaseScraper):
    """Scraper for libretro System.dat.

    Downloads the clrmamepro DAT once per instance, converts its rom entries
    into ``BiosRequirement`` objects and can assemble a platform config dict
    enriched with metadata from the libretro-core-info repository.
    """

    def __init__(self, url: str = SOURCE_URL):
        """Remember the DAT location; nothing is downloaded yet.

        Args:
            url: Address of the System.dat file to scrape.
        """
        self.url = url
        # Download cache filled on the first _fetch_raw() call.
        self._raw_data: str | None = None

    def _fetch_raw(self) -> str:
        """Fetch raw DAT content from the source URL, caching the result.

        Returns:
            The DAT text decoded as UTF-8.

        Raises:
            ConnectionError: If the download fails.
        """
        if self._raw_data is not None:
            return self._raw_data

        req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                self._raw_data = resp.read().decode("utf-8")
        except OSError as e:
            # URLError is an OSError subclass. Catching the base class also
            # covers a socket timeout raised while the body is being read,
            # which is not wrapped in URLError and would otherwise escape the
            # ConnectionError contract callers rely on.
            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
        return self._raw_data

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Parse System.dat and return BIOS requirements.

        Returns:
            One requirement per rom entry. ``name`` is the last path
            component of the rom name, ``destination`` the full rom name,
            and empty hashes or a zero size are stored as None.

        Raises:
            ConnectionError: If the download fails.
            ValueError: If the content does not pass ``validate_format``.
        """
        raw = self._fetch_raw()

        if not self.validate_format(raw):
            raise ValueError("System.dat format validation failed")

        requirements = []
        for rom in parse_dat(raw):
            # Unknown labels degrade to a slug derived from the label itself.
            system_slug = SYSTEM_SLUG_MAP.get(rom.system, rom.system.lower().replace(" ", "-"))

            requirements.append(BiosRequirement(
                # rsplit returns the whole name when it contains no "/".
                name=rom.name.rsplit("/", 1)[-1],
                system=system_slug,
                sha1=rom.sha1 or None,
                md5=rom.md5 or None,
                crc32=rom.crc32 or None,
                size=rom.size or None,
                destination=rom.name,
                required=True,
            ))

        return requirements

    def validate_format(self, raw_data: str) -> bool:
        """Validate System.dat format by delegating to ``validate_dat_format``."""
        return validate_dat_format(raw_data)

    def fetch_metadata(self) -> dict:
        """Fetch version info from the System.dat header and the GitHub API.

        Returns:
            A dict with the keys ``dat_version``, ``retroarch_version`` and
            ``db_version``.

        Raises:
            ConnectionError: If the DAT download fails.
        """
        raw = self._fetch_raw()
        meta = parse_dat_metadata(raw)

        retroarch_version = fetch_github_latest_version("libretro/RetroArch")
        db_version = fetch_github_latest_version("libretro/libretro-database")

        return {
            "dat_version": meta.version,
            "retroarch_version": retroarch_version,
            "db_version": db_version,
        }

    @staticmethod
    def _parse_core_info(content: str) -> dict[str, str]:
        """Turn the ``key = "value"`` lines of a .info file into a dict."""
        info = {}
        for line in content.split("\n"):
            key, sep, value = line.strip().partition(" = ")
            if sep:
                info[key.strip()] = value.strip().strip('"')
        return info

    def _fetch_core_metadata(self) -> dict[str, dict]:
        """Fetch per-core metadata from libretro-core-info .info files.

        This is best effort and never raises for network or parsing
        problems: if the file listing cannot be obtained a warning is
        printed to stderr and an empty dict is returned, and a single .info
        file that cannot be downloaded, decoded or interpreted is skipped
        without affecting the others.

        Returns:
            ``{system_slug: {"core", "manufacturer", "display_name",
            "docs"}}``. For each slug, the first listed core that declares a
            non-zero ``firmware_count`` wins.
        """
        import http.client
        import json

        try:
            # Resolved at call time rather than at module import.
            from .coreinfo_scraper import CORE_SYSTEM_MAP
        except ImportError as e:
            print(f"Warning: core metadata unavailable: {e}", file=sys.stderr)
            return {}

        # Failures that mean "this request did not yield usable data".
        # ValueError covers both invalid JSON and invalid UTF-8.
        fetch_errors = (OSError, ValueError, http.client.HTTPException)

        tree_url = "https://api.github.com/repos/libretro/libretro-core-info/git/trees/master?recursive=1"
        tree_req = urllib.request.Request(tree_url, headers={
            "User-Agent": "retrobios-scraper/1.0",
            "Accept": "application/vnd.github.v3+json",
        })
        try:
            with urllib.request.urlopen(tree_req, timeout=30) as resp:
                tree = json.loads(resp.read())
        except fetch_errors as e:
            print(f"Warning: could not list libretro core info files: {e}", file=sys.stderr)
            return {}

        entries = tree.get("tree", []) if isinstance(tree, dict) else []
        info_files = [
            path for path in (item.get("path", "") for item in entries)
            if path.endswith("_libretro.info")
        ]

        metadata: dict[str, dict] = {}
        for filename in info_files:
            core_name = filename.replace("_libretro.info", "")

            # Decide whether the file can contribute before downloading it:
            # unmapped cores and already-covered systems never add an entry.
            system_slug = CORE_SYSTEM_MAP.get(core_name)
            if not system_slug or system_slug in metadata:
                continue

            info_url = f"https://raw.githubusercontent.com/libretro/libretro-core-info/master/{filename}"
            info_req = urllib.request.Request(info_url, headers={"User-Agent": "retrobios-scraper/1.0"})
            try:
                with urllib.request.urlopen(info_req, timeout=10) as resp:
                    content = resp.read().decode("utf-8")
            except fetch_errors:
                continue

            info = self._parse_core_info(content)

            try:
                fw_count = int(info.get("firmware_count", "0"))
            except ValueError:
                # A malformed count only disqualifies this one core.
                fw_count = 0
            if fw_count == 0:
                continue

            metadata[system_slug] = {
                "core": core_name,
                "manufacturer": info.get("manufacturer", ""),
                "display_name": info.get("display_name", "") or info.get("systemname", ""),
                "docs": f"https://docs.libretro.com/library/{core_name}/",
            }

        return metadata

    def generate_platform_yaml(self) -> dict:
        """Generate a platform YAML config dict, merging System.dat with core-info metadata.

        Returns:
            A dict with the platform-level settings and a ``systems``
            mapping of ``slug -> {"files": [...], ...}``. Optional values
            (hashes, size, core, manufacturer, docs) are only included when
            they are non-empty.

        Raises:
            ConnectionError: If the DAT download fails.
            ValueError: If the DAT content fails validation.
        """
        requirements = self.fetch_requirements()
        metadata = self.fetch_metadata()
        core_meta = self._fetch_core_metadata()

        systems = {}
        for req in requirements:
            if req.system not in systems:
                system_entry = {"files": []}
                cm = core_meta.get(req.system, {})
                for key in ("core", "manufacturer", "docs"):
                    if cm.get(key):
                        system_entry[key] = cm[key]
                systems[req.system] = system_entry

            entry = {
                "name": req.name,
                "destination": req.destination,
                "required": req.required,
            }
            if req.sha1:
                entry["sha1"] = req.sha1
            if req.md5:
                entry["md5"] = req.md5
            if req.crc32:
                entry["crc32"] = req.crc32
            if req.size:
                entry["size"] = req.size

            systems[req.system]["files"].append(entry)

        return {
            "platform": "RetroArch",
            "version": metadata["retroarch_version"] or "",
            "dat_version": metadata["dat_version"] or "",
            "homepage": "https://www.retroarch.com",
            "source": "https://github.com/libretro/libretro-database/blob/master/dat/System.dat",
            "base_destination": "system",
            "hash_type": "sha1",
            "verification_mode": "existence",
            "systems": systems,
        }
|
||||
|
||||
|
||||
def main():
    """CLI entry point for testing.

    Modes, checked in this order:
    - ``--dry-run``: list the scraped files grouped by system.
    - ``--json``: print the generated platform config as JSON.
    - ``--output FILE``: write the platform config to FILE as YAML
      (requires PyYAML).
    - none of the above: print a one-line summary.

    Exits with status 1 when the DAT cannot be fetched or validated, or
    when ``--output`` is used without PyYAML installed.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Scrape libretro System.dat")
    parser.add_argument("--dry-run", action="store_true", help="Just show what would be scraped")
    parser.add_argument("--output", "-o", help="Output YAML file")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    scraper = Scraper()

    try:
        reqs = scraper.fetch_requirements()
    except (ConnectionError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Grouped once; both the dry-run listing and the summary need it.
    by_system = {}
    for req in reqs:
        by_system.setdefault(req.system, []).append(req)

    if args.dry_run:
        for system, files in sorted(by_system.items()):
            print(f"\n{system} ({len(files)} files):")
            for f in files:
                # Prefer SHA1, then MD5; both are shortened for display.
                hash_info = (f.sha1 or f.md5 or "no-hash")[:12]
                print(f" {f.name} ({f.size or '?'} bytes, {hash_info}...)")

        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
        return

    if args.json:
        config = scraper.generate_platform_yaml()
        print(json.dumps(config, indent=2))
        return

    if args.output:
        try:
            import yaml
        except ImportError:
            print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
            sys.exit(1)

        config = scraper.generate_platform_yaml()
        # allow_unicode=True writes non-ASCII characters unescaped, so the
        # file must be opened as UTF-8 instead of the platform default.
        with open(args.output, "w", encoding="utf-8") as f:
            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f"Written to {args.output}")
    else:
        print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
285
scripts/scraper/recalbox_scraper.py
Normal file
285
scripts/scraper/recalbox_scraper.py
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Scraper for Recalbox BIOS requirements.
|
||||
|
||||
Source: https://gitlab.com/recalbox/recalbox/-/raw/master/board/recalbox/fsoverlay/recalbox/share_init/system/.emulationstation/es_bios.xml
|
||||
Format: XML (es_bios.xml)
|
||||
Hash: MD5 (multiple valid hashes per entry, comma-separated)
|
||||
|
||||
Recalbox verification logic:
|
||||
- Checks MD5 of file on disk against list of valid hashes
|
||||
- Multiple MD5s accepted per BIOS (different ROM revisions)
|
||||
- Alternate file paths (pipe-separated)
|
||||
- hashMatchMandatory flag: if false, wrong hash = warning (YELLOW) not error (RED)
|
||||
- ZIP files get composite MD5 calculation
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_tag
|
||||
|
||||
# Platform key for this module; the package's discover_scrapers() registers
# the module's Scraper class under this name.
PLATFORM_NAME = "recalbox"

# Raw download address of es_bios.xml; used as the default `url` of Scraper
# and reported as "source" in the generated platform config.
SOURCE_URL = (
    "https://gitlab.com/recalbox/recalbox/-/raw/master/"
    "board/recalbox/fsoverlay/recalbox/share_init/system/"
    ".emulationstation/es_bios.xml"
)

# Value of a <system> element's "platform" attribute -> slug stored on the
# resulting requirement. Several platforms may share one slug. A platform
# missing from this table is passed through unchanged by the Scraper.
SYSTEM_SLUG_MAP = {
    "3do": "3do",
    "amiga600": "commodore-amiga",
    "amiga1200": "commodore-amiga",
    "amigacd32": "commodore-amiga",
    "amigacdtv": "commodore-amiga",
    "amstradcpc": "amstrad-cpc",
    "atari800": "atari-400-800",
    "atari5200": "atari-5200",
    "atari7800": "atari-7800",
    "atarilynx": "atari-lynx",
    "atarist": "atari-st",
    "c64": "commodore-c64",
    "channelf": "fairchild-channel-f",
    "colecovision": "coleco-colecovision",
    "dreamcast": "sega-dreamcast",
    "fds": "nintendo-fds",
    "gamecube": "nintendo-gamecube",
    "gamegear": "sega-game-gear",
    "gb": "nintendo-gb",
    "gba": "nintendo-gba",
    "gbc": "nintendo-gbc",
    "intellivision": "mattel-intellivision",
    "jaguar": "atari-jaguar",
    "mastersystem": "sega-master-system",
    "megadrive": "sega-mega-drive",
    "msx": "microsoft-msx",
    "msx1": "microsoft-msx",
    "msx2": "microsoft-msx",
    "n64": "nintendo-64",
    "naomi": "sega-dreamcast-arcade",
    "naomigd": "sega-dreamcast-arcade",
    "atomiswave": "sega-dreamcast-arcade",
    "nds": "nintendo-ds",
    "neogeo": "snk-neogeo",
    "neogeocd": "snk-neogeo-cd",
    "o2em": "magnavox-odyssey2",
    "pcengine": "nec-pc-engine",
    "pcenginecd": "nec-pc-engine",
    "pcfx": "nec-pc-fx",
    "ps2": "sony-playstation-2",
    "psx": "sony-playstation",
    "saturn": "sega-saturn",
    "scummvm": "scummvm",
    "segacd": "sega-mega-cd",
    "snes": "nintendo-snes",
    "supergrafx": "nec-pc-engine",
    "x68000": "sharp-x68000",
    "zxspectrum": "sinclair-zx-spectrum",
}
|
||||
|
||||
|
||||
class Scraper(BaseScraper):
    """Scraper for Recalbox es_bios.xml.

    Downloads the XML once per instance and exposes it either as generic
    ``BiosRequirement`` objects or as dicts that keep every
    Recalbox-specific attribute.
    """

    def __init__(self, url: str = SOURCE_URL):
        """Remember the XML location; nothing is downloaded yet.

        Args:
            url: Address of the es_bios.xml file to scrape.
        """
        self.url = url
        # Download cache filled on the first _fetch_raw() call.
        self._raw_data: str | None = None

    def _fetch_raw(self) -> str:
        """Fetch raw XML content from the source URL, caching the result.

        Returns:
            The XML text decoded as UTF-8.

        Raises:
            ConnectionError: If the download fails.
        """
        if self._raw_data is not None:
            return self._raw_data

        req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                self._raw_data = resp.read().decode("utf-8")
        except OSError as e:
            # URLError is an OSError subclass. Catching the base class also
            # covers a socket timeout raised while the body is being read,
            # which is not wrapped in URLError and would otherwise escape the
            # ConnectionError contract callers rely on.
            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
        return self._raw_data

    @staticmethod
    def _parse_xml(raw: str) -> ET.Element:
        """Parse XML text, reporting malformed input as ValueError.

        ET.ParseError does not derive from ValueError, so without this
        translation malformed XML would bypass callers that handle
        ``(ConnectionError, ValueError)``.
        """
        try:
            return ET.fromstring(raw)
        except ET.ParseError as e:
            raise ValueError(f"es_bios.xml is not well-formed XML: {e}") from e

    @staticmethod
    def _iter_bios(root):
        """Yield ``(system_elem, bios_elem, paths)`` for every usable <bios>.

        ``paths`` is the pipe-separated ``path`` attribute split into
        stripped, non-empty parts; elements without any path are skipped.
        """
        for system_elem in root.findall(".//system"):
            for bios_elem in system_elem.findall("bios"):
                paths = [p.strip() for p in bios_elem.get("path", "").split("|") if p.strip()]
                if paths:
                    yield system_elem, bios_elem, paths

    @staticmethod
    def _md5_list(bios_elem) -> list[str]:
        """Split the comma-separated ``md5`` attribute into clean hashes."""
        return [m.strip() for m in bios_elem.get("md5", "").split(",") if m.strip()]

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Parse es_bios.xml and return BIOS requirements.

        Only the first of the alternate paths is used. An entry whose first
        path was already emitted is dropped, even when it appears under a
        different system.

        Returns:
            One requirement per distinct primary path. ``md5`` holds all
            accepted hashes joined by commas, or None when there are none.

        Raises:
            ConnectionError: If the download fails.
            ValueError: If the content fails ``validate_format`` or is not
                well-formed XML.
        """
        raw = self._fetch_raw()

        if not self.validate_format(raw):
            raise ValueError("es_bios.xml format validation failed")

        root = self._parse_xml(raw)
        requirements = []
        seen = set()

        for system_elem, bios_elem, paths in self._iter_bios(root):
            primary_path = paths[0]
            if primary_path in seen:
                continue
            seen.add(primary_path)

            platform = system_elem.get("platform", "")
            md5_list = self._md5_list(bios_elem)

            requirements.append(BiosRequirement(
                # rsplit returns the whole path when it contains no "/".
                name=primary_path.rsplit("/", 1)[-1],
                system=SYSTEM_SLUG_MAP.get(platform, platform),
                md5=",".join(md5_list) if md5_list else None,
                destination=primary_path,
                # Anything other than the literal "false" counts as required.
                required=bios_elem.get("mandatory", "true") != "false",
            ))

        return requirements

    def fetch_full_requirements(self) -> list[dict]:
        """Parse es_bios.xml preserving all Recalbox-specific fields.

        Unlike ``fetch_requirements`` this keeps every alternate path and
        every hash, and does not de-duplicate entries.

        Returns:
            One dict per <bios> element that has at least one path, with
            the keys ``name``, ``system``, ``system_name``, ``paths``,
            ``md5_list``, ``core``, ``mandatory``, ``hash_match_mandatory``
            and ``note``.

        Raises:
            ConnectionError: If the download fails.
            ValueError: If the content is not well-formed XML.
        """
        root = self._parse_xml(self._fetch_raw())
        requirements = []

        for system_elem, bios_elem, paths in self._iter_bios(root):
            platform = system_elem.get("platform", "")

            requirements.append({
                "name": paths[0].rsplit("/", 1)[-1],
                "system": SYSTEM_SLUG_MAP.get(platform, platform),
                "system_name": system_elem.get("name", platform),
                "paths": paths,
                "md5_list": self._md5_list(bios_elem),
                "core": bios_elem.get("core", ""),
                "mandatory": bios_elem.get("mandatory", "true") != "false",
                "hash_match_mandatory": bios_elem.get("hashMatchMandatory", "true") != "false",
                "note": bios_elem.get("note", ""),
            })

        return requirements

    def validate_format(self, raw_data: str) -> bool:
        """Validate es_bios.xml format.

        This is a substring check for the ``<biosList``, ``<system`` and
        ``<bios`` tags; it does not parse the XML.
        """
        return "<biosList" in raw_data and "<system" in raw_data and "<bios" in raw_data

    def generate_platform_yaml(self) -> dict:
        """Generate a platform YAML config dict from scraped data.

        Returns:
            A dict with the platform-level settings and a ``systems``
            mapping of ``slug -> {"files": [...]}``; a file's ``md5`` key is
            only present when the entry has hashes.

        Raises:
            ConnectionError: If the download fails.
            ValueError: If the content fails validation or parsing.
        """
        requirements = self.fetch_requirements()

        systems = {}
        for req in requirements:
            entry = {
                "name": req.name,
                "destination": req.destination,
                "required": req.required,
            }
            if req.md5:
                entry["md5"] = req.md5

            systems.setdefault(req.system, {"files": []})["files"].append(entry)

        # Recalbox uses GitLab - GitHub API may not resolve, hence the
        # hard-coded fallback version.
        version = fetch_github_latest_tag("recalbox/recalbox", prefix="") or "10.0"

        return {
            "platform": "Recalbox",
            "version": version,
            "homepage": "https://www.recalbox.com",
            "source": SOURCE_URL,
            "base_destination": "bios",
            "hash_type": "md5",
            "verification_mode": "md5",
            "systems": systems,
        }
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Modes, checked in this order:
    - ``--full``: print the first five entries with all Recalbox-specific
      fields as JSON, followed by the total count.
    - ``--dry-run``: list up to five files per system.
    - ``--json``: print the generated platform config as JSON.
    - ``--output FILE``: write the platform config to FILE as YAML
      (requires PyYAML).
    - none of the above: print a one-line summary.

    Exits with status 1 when the XML cannot be fetched or validated, or
    when ``--output`` is used without PyYAML installed.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Scrape Recalbox es_bios.xml")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--full", action="store_true", help="Show full Recalbox-specific fields")
    parser.add_argument("--output", "-o", help="Output YAML file")
    args = parser.parse_args()

    scraper = Scraper()

    try:
        if args.full:
            reqs = scraper.fetch_full_requirements()
            print(json.dumps(reqs[:5], indent=2))
            print(f"\nTotal: {len(reqs)} BIOS entries")
            return
        reqs = scraper.fetch_requirements()
    except (ConnectionError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Grouped once; both the dry-run listing and the summary need it.
    by_system = {}
    for r in reqs:
        by_system.setdefault(r.system, []).append(r)

    if args.dry_run:
        for sys_name, files in sorted(by_system.items()):
            print(f"\n{sys_name} ({len(files)} files):")
            for f in files[:5]:
                print(f" {f.name} (md5={f.md5[:12] if f.md5 else 'N/A'}...)")
            if len(files) > 5:
                print(f" ... +{len(files)-5} more")
        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
        return

    if args.json:
        config = scraper.generate_platform_yaml()
        print(json.dumps(config, indent=2))
        return

    if args.output:
        # --output used to be accepted and then ignored; it now writes the
        # config as YAML.
        try:
            import yaml
        except ImportError:
            print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
            sys.exit(1)

        config = scraper.generate_platform_yaml()
        # allow_unicode=True writes non-ASCII characters unescaped, so the
        # file must be opened as UTF-8 instead of the platform default.
        with open(args.output, "w", encoding="utf-8") as f:
            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f"Written to {args.output}")
        return

    print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue