refactor: extract _fetch_raw to BaseScraper (DRY)

An identical _fetch_raw() implementation (URL fetch + cache + error handling) was duplicated in 4 scrapers. It now lives in BaseScraper, whose new __init__ takes a url param. Each scraper passes its url to super().__init__() and inherits _fetch_raw(). This eliminates ~48 lines of duplicated code. The DRY audit is now clean: resolve logic lives in common.py, the scraper CLI in base_scraper, and _fetch_raw in BaseScraper. The remaining duplications are justified (different list_platforms semantics, context-specific hash computation).
2026-06-29 05:42:48 +00:00 · 2026-03-18 08:22:21 +01:00 · 2026-03-18 08:22:21 +01:00 · 3de4bf8190
commit 3de4bf8190
parent 2466fc4a97
5 changed files with 22 additions and 53 deletions
--- a/scripts/scraper/base_scraper.py
+++ b/scripts/scraper/base_scraper.py
@@ -48,6 +48,24 @@ class ChangeSet:
 class BaseScraper(ABC):
    """Abstract base class for platform BIOS requirement scrapers."""

+    def __init__(self, url: str = ""):  # url: source location later read by _fetch_raw()
+        self.url = url  # "" means no source configured; _fetch_raw() then raises ValueError
+        self._raw_data: str | None = None  # cache of the fetched text; None until a fetch succeeds
+
+    def _fetch_raw(self) -> str:
+        """Fetch raw content from source URL. Cached after first call."""
+        if self._raw_data is not None:  # "is not None": an empty response body still counts as cached
+            return self._raw_data
+        if not self.url:  # the default url="" ends up here
+            raise ValueError("No source URL configured")
+        try:
+            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
+            with urllib.request.urlopen(req, timeout=30) as resp:  # timeout is in seconds
+                self._raw_data = resp.read().decode("utf-8")  # NOTE(review): a UnicodeDecodeError is not caught below
+                return self._raw_data
+        except urllib.error.URLError as e:  # only URLError is translated; anything else propagates unchanged
+            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e  # "from e" keeps the original cause
+
    @abstractmethod
    def fetch_requirements(self) -> list[BiosRequirement]:
        """Fetch current BIOS requirements from the platform source."""