From 08f68e792de2b6ffe89b34f4f95a9be7612ae95e Mon Sep 17 00:00:00 2001 From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:51:12 +0100 Subject: [PATCH] refactor: centralize hash logic, fix circular imports and perf bottlenecks --- database.json | 16 ++++---- scripts/common.py | 37 +++++------------ scripts/cross_reference.py | 5 +-- scripts/download.py | 15 ++----- scripts/generate_pack.py | 35 +++++++--------- scripts/generate_readme.py | 22 +++++++++- scripts/generate_site.py | 23 +++++++++-- scripts/scraper/base_scraper.py | 2 +- scripts/scraper/batocera_scraper.py | 26 +++--------- scripts/scraper/recalbox_scraper.py | 1 - scripts/verify.py | 63 ++++++++++++++++++++++------- 11 files changed, 132 insertions(+), 113 deletions(-) diff --git a/database.json b/database.json index 2ed8e957..91bf23c0 100644 --- a/database.json +++ b/database.json @@ -1,5 +1,5 @@ { - "generated_at": "2026-03-18T07:24:56Z", + "generated_at": "2026-03-18T10:49:11Z", "total_files": 5357, "total_size": 4887546948, "files": { @@ -68044,14 +68044,6 @@ "disk2-16boot.rom": [ "d4181c9f046aafc3fb326b381baac809d9e38d16" ], - "tos100uk.img": [ - "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "9a6e4c88533a9eaa4d55cdc040e47443e0226eb2" - ], - "tos206us.img": [ - "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "ee58768bdfc602c9b14942ce5481e97dd24e7c83" - ], "tos102uk.img": [ "87900a40a890fdf03bd08be6c60cc645855cbce5" ], @@ -68952,9 +68944,15 @@ "sega-saturn:0306c0e408d6682dd2d86324bd4ac661": [ "8c031bf9908fd0142fdd10a9cdd79389f8a3f2fc" ], + "tos100uk.img": [ + "9a6e4c88533a9eaa4d55cdc040e47443e0226eb2" + ], "tos106de.img": [ "3b8cf5ffa41b252eb67f8824f94608fa4005d6dd" ], + "tos206us.img": [ + "ee58768bdfc602c9b14942ce5481e97dd24e7c83" + ], "bios7.bin": [ "24f67bdea115a2c847c8813a262502ee1607b7df" ], diff --git a/scripts/common.py b/scripts/common.py index 50cf0649..4eeda274 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -45,12 +45,16 @@ def load_database(db_path: str) -> dict: return json.load(f) -def md5sum(filepath: str | Path) -> str: - """Compute MD5 of a file - matches Batocera's md5sum().""" +def md5sum(source: str | Path | object) -> str: + """Compute MD5 of a file path or file-like object - matches Batocera's md5sum().""" h = hashlib.md5() - with open(filepath, "rb") as f: - for chunk in iter(lambda: f.read(65536), b""): + if hasattr(source, "read"): + for chunk in iter(lambda: source.read(65536), b""): h.update(chunk) + else: + with open(source, "rb") as f: + for chunk in iter(lambda: f.read(65536), b""): + h.update(chunk) return h.hexdigest() @@ -154,13 +158,13 @@ def resolve_local_file( if sha1_match and sha1_match in files_db: path = files_db[sha1_match]["path"] if os.path.exists(path): - return path, "exact" + return path, "md5_exact" if len(md5_candidate) < 32: for db_md5, db_sha1 in by_md5.items(): if db_md5.startswith(md5_candidate) and db_sha1 in files_db: path = files_db[db_sha1]["path"] if os.path.exists(path): - return path, "exact" + return path, "md5_exact" # 3. zipped_file content match via pre-built index if zipped_file and md5_list and zip_contents: @@ -217,27 +221,6 @@ def resolve_local_file( return None, "not_found" -def compute_coverage(platform_name: str, platforms_dir: str, db: dict) -> dict: - """Compute BIOS coverage for a platform using verify logic.""" - from verify import verify_platform - config = load_platform_config(platform_name, platforms_dir) - result = verify_platform(config, db) - present = result["ok"] + result["untested"] - pct = (present / result["total"] * 100) if result["total"] > 0 else 0 - return { - "platform": config.get("platform", platform_name), - "total": result["total"], - "verified": result["ok"], - "untested": result["untested"], - "missing": result["missing"], - "present": present, - "percentage": pct, - "mode": config.get("verification_mode", "existence"), - "details": result["details"], - "config": config, - } - - def safe_extract_zip(zip_path: str, dest_dir: str) -> None: """Extract a ZIP file safely, preventing zip-slip path traversal.""" dest = os.path.realpath(dest_dir) diff --git a/scripts/cross_reference.py b/scripts/cross_reference.py index f2a3db60..8a530bd9 100644 --- a/scripts/cross_reference.py +++ b/scripts/cross_reference.py @@ -25,7 +25,7 @@ except ImportError: sys.exit(1) sys.path.insert(0, os.path.dirname(__file__)) -from common import load_database +from common import load_database, load_platform_config DEFAULT_EMULATORS_DIR = "emulators" DEFAULT_PLATFORMS_DIR = "platforms" @@ -52,8 +52,7 @@ def load_platform_files(platforms_dir: str) -> dict[str, set[str]]: for f in sorted(Path(platforms_dir).glob("*.yml")): if f.name.startswith("_"): continue - with open(f) as fh: - config = yaml.safe_load(fh) or {} + config = load_platform_config(f.stem, platforms_dir) for sys_id, system in config.get("systems", {}).items(): for fe in system.get("files", []): name = fe.get("name", "") diff --git a/scripts/download.py b/scripts/download.py index b09d5607..3e806dc1 100644 --- a/scripts/download.py +++ b/scripts/download.py @@ -13,7 +13,6 @@ Usage: from __future__ import annotations import argparse -import hashlib import json import os import sys @@ -23,7 +22,7 @@ import zipfile from pathlib import Path sys.path.insert(0, os.path.dirname(__file__)) -from common import safe_extract_zip +from common import compute_hashes, safe_extract_zip GITHUB_API = "https://api.github.com" REPO = "Abdess/retrobios" @@ -135,21 +134,15 @@ def verify_files(platform: str, dest_dir: str, release: dict): found = False for local_file in dest.rglob(name): if local_file.is_file(): - h = hashlib.sha1() - with open(local_file, "rb") as f: - while True: - chunk = f.read(65536) - if not chunk: - break - h.update(chunk) + local_sha1 = compute_hashes(local_file)["sha1"] - if h.hexdigest() == sha1: + if local_sha1 == sha1: verified += 1 found = True break else: mismatched += 1 - print(f" MISMATCH: {name} (expected {sha1[:12]}..., got {h.hexdigest()[:12]}...)") + print(f" MISMATCH: {name} (expected {sha1[:12]}..., got {local_sha1[:12]}...)") found = True break diff --git a/scripts/generate_pack.py b/scripts/generate_pack.py index 2743f598..18c4c019 100644 --- a/scripts/generate_pack.py +++ b/scripts/generate_pack.py @@ -24,7 +24,7 @@ import zipfile from pathlib import Path sys.path.insert(0, os.path.dirname(__file__)) -from common import load_database, load_platform_config, md5_composite, resolve_local_file +from common import compute_hashes, load_database, load_platform_config, md5_composite, resolve_local_file try: import yaml @@ -44,17 +44,12 @@ MAX_ENTRY_SIZE = 512 * 1024 * 1024 # 512MB def _verify_file_hash(path: str, expected_sha1: str = "", expected_md5: str = "") -> bool: - """Compute and compare hash of a local file.""" if not expected_sha1 and not expected_md5: return True - h = hashlib.sha1() if expected_sha1 else hashlib.md5() - with open(path, "rb") as f: - while True: - chunk = f.read(65536) - if not chunk: - break - h.update(chunk) - return h.hexdigest() == (expected_sha1 or expected_md5) + hashes = compute_hashes(path) + if expected_sha1: + return hashes["sha1"] == expected_sha1 + return hashes["md5"] == expected_md5 def fetch_large_file(name: str, dest_dir: str = ".cache/large", @@ -267,26 +262,20 @@ def _load_emulator_extras( def generate_pack( platform_name: str, platforms_dir: str, - db_path: str, + db: dict, bios_dir: str, output_dir: str, include_extras: bool = False, emulators_dir: str = "emulators", + zip_contents: dict | None = None, ) -> str | None: """Generate a ZIP pack for a platform. Returns the path to the generated ZIP, or None on failure. """ config = load_platform_config(platform_name, platforms_dir) - db = load_database(db_path) - - # Only build the expensive ZIP contents index if the platform has zipped_file entries - has_zipped = any( - fe.get("zipped_file") - for sys in config.get("systems", {}).values() - for fe in sys.get("files", []) - ) - zip_contents = build_zip_contents_index(db) if has_zipped else {} + if zip_contents is None: + zip_contents = {} verification_mode = config.get("verification_mode", "existence") platform_display = config.get("platform", platform_name) @@ -468,6 +457,9 @@ def main(): parser.error("Specify --platform or --all") return + db = load_database(args.db) + zip_contents = build_zip_contents_index(db) + groups = _group_identical_platforms(platforms, args.platforms_dir) for group_platforms, representative in groups: @@ -480,8 +472,9 @@ def main(): try: zip_path = generate_pack( - representative, args.platforms_dir, args.db, args.bios_dir, args.output_dir, + representative, args.platforms_dir, db, args.bios_dir, args.output_dir, include_extras=args.include_extras, emulators_dir=args.emulators_dir, + zip_contents=zip_contents, ) if zip_path and len(group_platforms) > 1: # Rename ZIP to include all platform names diff --git a/scripts/generate_readme.py b/scripts/generate_readme.py index 86109ba2..c4985cc9 100644 --- a/scripts/generate_readme.py +++ b/scripts/generate_readme.py @@ -18,7 +18,27 @@ from datetime import datetime, timezone from pathlib import Path sys.path.insert(0, os.path.dirname(__file__)) -from common import load_database, compute_coverage +from common import load_database, load_platform_config +from verify import verify_platform + +def compute_coverage(platform_name: str, platforms_dir: str, db: dict) -> dict: + config = load_platform_config(platform_name, platforms_dir) + result = verify_platform(config, db) + present = result["ok"] + result["untested"] + pct = (present / result["total"] * 100) if result["total"] > 0 else 0 + return { + "platform": config.get("platform", platform_name), + "total": result["total"], + "verified": result["ok"], + "untested": result["untested"], + "missing": result["missing"], + "present": present, + "percentage": pct, + "mode": config.get("verification_mode", "existence"), + "details": result["details"], + "config": config, + } + SITE_URL = "https://abdess.github.io/retrobios/" RELEASE_URL = "../../releases/latest" diff --git a/scripts/generate_site.py b/scripts/generate_site.py index ef7eb0fc..ed9d076b 100644 --- a/scripts/generate_site.py +++ b/scripts/generate_site.py @@ -26,7 +26,8 @@ except ImportError: sys.exit(1) sys.path.insert(0, os.path.dirname(__file__)) -from common import load_database, load_platform_config, compute_coverage +from common import load_database, load_platform_config +from verify import verify_platform DOCS_DIR = "docs" SITE_NAME = "RetroBIOS" @@ -64,9 +65,23 @@ def _status_icon(pct: float) -> str: return "partial" -# --------------------------------------------------------------------------- -# Coverage computation (reuses verify.py logic) -# --------------------------------------------------------------------------- +def compute_coverage(platform_name: str, platforms_dir: str, db: dict) -> dict: + config = load_platform_config(platform_name, platforms_dir) + result = verify_platform(config, db) + present = result["ok"] + result["untested"] + pct = (present / result["total"] * 100) if result["total"] > 0 else 0 + return { + "platform": config.get("platform", platform_name), + "total": result["total"], + "verified": result["ok"], + "untested": result["untested"], + "missing": result["missing"], + "present": present, + "percentage": pct, + "mode": config.get("verification_mode", "existence"), + "details": result["details"], + "config": config, + } # --------------------------------------------------------------------------- diff --git a/scripts/scraper/base_scraper.py b/scripts/scraper/base_scraper.py index b0132d81..62a1c627 100644 --- a/scripts/scraper/base_scraper.py +++ b/scripts/scraper/base_scraper.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import sys import urllib.request import urllib.error from abc import ABC, abstractmethod @@ -168,7 +169,6 @@ def scraper_cli(scraper_class: type, description: str = "Scrape BIOS requirement reqs = scraper.fetch_requirements() except (ConnectionError, ValueError) as e: print(f"Error: {e}", file=sys.stderr) - import sys sys.exit(1) if args.dry_run: diff --git a/scripts/scraper/batocera_scraper.py b/scripts/scraper/batocera_scraper.py index 30a521de..7ecc7773 100644 --- a/scripts/scraper/batocera_scraper.py +++ b/scripts/scraper/batocera_scraper.py @@ -211,28 +211,12 @@ class Scraper(BaseScraper): systems[req.system]["files"].append(entry) - # Sort numerically since API returns by commit date, not version - import json as _json + tag = fetch_github_latest_tag("batocera-linux/batocera.linux", prefix="batocera-") batocera_version = "" - try: - _url = "https://api.github.com/repos/batocera-linux/batocera.linux/tags?per_page=50" - _req = urllib.request.Request(_url, headers={ - "User-Agent": "retrobios-scraper/1.0", - "Accept": "application/vnd.github.v3+json", - }) - with urllib.request.urlopen(_req, timeout=15) as _resp: - _tags = _json.loads(_resp.read()) - _versions = [] - for _t in _tags: - _name = _t["name"] - if _name.startswith("batocera-"): - _num = _name.replace("batocera-", "") - if _num.isdigit(): - _versions.append(int(_num)) - if _versions: - batocera_version = str(max(_versions)) - except (ConnectionError, ValueError, OSError): - pass + if tag: + num = tag.removeprefix("batocera-") + if num.isdigit(): + batocera_version = num return { "platform": "Batocera", diff --git a/scripts/scraper/recalbox_scraper.py b/scripts/scraper/recalbox_scraper.py index cb1ba826..3dc379cd 100644 --- a/scripts/scraper/recalbox_scraper.py +++ b/scripts/scraper/recalbox_scraper.py @@ -262,7 +262,6 @@ def main(): print(json.dumps(config, indent=2)) return - reqs = scraper.fetch_requirements() by_system = {} for r in reqs: by_system.setdefault(r.system, []).append(r) diff --git a/scripts/verify.py b/scripts/verify.py index 4d1ebcce..0e55fa2a 100644 --- a/scripts/verify.py +++ b/scripts/verify.py @@ -54,14 +54,9 @@ def check_inside_zip(container: str, file_name: str, expected_md5: str) -> str: return Status.OK with archive.open(fname) as entry: - h = hashlib.md5() - while True: - block = entry.read(65536) - if not block: - break - h.update(block) + actual = md5sum(entry) - if h.hexdigest() == expected_md5: + if actual == expected_md5: return Status.OK else: return Status.UNTESTED @@ -71,10 +66,13 @@ def check_inside_zip(container: str, file_name: str, expected_md5: str) -> str: return "error" -def resolve_to_local_path(file_entry: dict, db: dict) -> str | None: +def resolve_to_local_path( + file_entry: dict, + db: dict, + zip_contents: dict | None = None, +) -> tuple[str | None, str]: """Find the local file path for a BIOS entry. Delegates to common.resolve_local_file.""" - path, _ = resolve_local_file(file_entry, db) - return path + return resolve_local_file(file_entry, db, zip_contents) def verify_entry_existence(file_entry: dict, local_path: str | None) -> dict: @@ -85,7 +83,11 @@ def verify_entry_existence(file_entry: dict, local_path: str | None) -> dict: return {"name": name, "status": Status.MISSING} -def verify_entry_md5(file_entry: dict, local_path: str | None) -> dict: +def verify_entry_md5( + file_entry: dict, + local_path: str | None, + resolve_status: str = "", +) -> dict: """MD5 verification - supports single MD5 (Batocera) and multi-MD5 (Recalbox).""" name = file_entry.get("name", "") expected_md5 = file_entry.get("md5", "") @@ -125,6 +127,9 @@ def verify_entry_md5(file_entry: dict, local_path: str | None) -> dict: if not md5_list: return {"name": name, "status": Status.OK, "path": local_path} + if resolve_status == "md5_exact": + return {"name": name, "status": Status.OK, "path": local_path} + actual_md5 = md5sum(local_path) # Case-insensitive - Recalbox uses uppercase MD5s @@ -153,6 +158,26 @@ def verify_entry_md5(file_entry: dict, local_path: str | None) -> dict: } +def _build_zip_contents_index(db: dict) -> dict: + """Build index of {inner_rom_md5: zip_file_sha1} for ROMs inside ZIP files.""" + index: dict[str, str] = {} + for sha1, entry in db.get("files", {}).items(): + path = entry["path"] + if not path.endswith(".zip") or not os.path.exists(path): + continue + try: + with zipfile.ZipFile(path, "r") as zf: + for info in zf.infolist(): + if info.is_dir() or info.file_size > 512 * 1024 * 1024: + continue + data = zf.read(info.filename) + inner_md5 = hashlib.md5(data).hexdigest() + index[inner_md5] = sha1 + except (zipfile.BadZipFile, OSError): + continue + return index + + def verify_platform(config: dict, db: dict) -> dict: """Verify all BIOS files for a platform using its verification_mode. @@ -170,13 +195,23 @@ def verify_platform(config: dict, db: dict) -> dict: mode = config.get("verification_mode", "existence") platform = config.get("platform", "unknown") - verify_fn = verify_entry_existence if mode == "existence" else verify_entry_md5 + has_zipped = any( + fe.get("zipped_file") + for sys in config.get("systems", {}).values() + for fe in sys.get("files", []) + ) + zip_contents = _build_zip_contents_index(db) if has_zipped else {} results = [] for sys_id, system in config.get("systems", {}).items(): for file_entry in system.get("files", []): - local_path = resolve_to_local_path(file_entry, db) - result = verify_fn(file_entry, local_path) + local_path, resolve_status = resolve_to_local_path( + file_entry, db, zip_contents, + ) + if mode == "existence": + result = verify_entry_existence(file_entry, local_path) + else: + result = verify_entry_md5(file_entry, local_path, resolve_status) result["system"] = sys_id results.append(result)