Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.
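After this change a plugin entry might look like the following sketch
(assuming a YAML config file; the key names come from the commit message,
the values are purely illustrative):

```yaml
rusneb:
  type: rusneb          # was: html_scraper with a nested config: section
  auto_queue: true      # illustrative value
  timeout: 20           # illustrative value, seconds
  rate_limit_seconds: 2 # illustrative value
```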

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: drop the nested config: subsections from the rusneb, alib_web,
  and shpl entries; update their type fields to rusneb, alib_web, and shpl
  respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints
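The per-item year extraction described for rusneb.py and alib.py can be
sketched as follows. This is a minimal standalone illustration: the markup,
ITEM_RE, and this YEAR_RE are all hypothetical stand-ins for the real
patterns in the plugin classes, but it shows why matching years page-wide
produced wrong dates:

```python
import re

# Hypothetical year pattern; the real YEAR_RE is shared via the base class.
YEAR_RE = re.compile(r"\b(1[89]\d{2}|20\d{2})\b")

# Hypothetical list-item markup; each entry carries its own year.
ITEM_RE = re.compile(r"<li>(.*?)</li>", re.S)

page = """
<p>© 2003-2026 Archive</p>
<ul>
  <li>Some Title. Moscow, 1898.</li>
  <li>Another Title. 1912.</li>
</ul>
"""

# Global extraction picks up the page-footer years first -- the old bug:
print(YEAR_RE.findall(page)[:2])  # ['2003', '2026']

# Per-item extraction pairs each entry with the year inside that entry:
for item in ITEM_RE.findall(page):
    m = YEAR_RE.search(item)
    print(m.group(0) if m else "")  # 1898, then 1912
```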

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions


@@ -0,0 +1,63 @@
"""ШПИЛ archive search plugin.

Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
produces no results. The class is retained so the configuration entry can
be re-enabled if the endpoint is restored.
"""

import re

import httpx

from models import CandidateRecord

from .html_scraper import YEAR_RE, HtmlScraperPlugin

_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
_DOMAIN = "www.shpl.ru"

# Fixed IRBIS64 query parameters: search the BIBL database, brief Russian
# web output format.
_EXTRA_PARAMS: dict[str, str] = {
    "C21COM": "S",
    "I21DBN": "BIBL",
    "P21DBN": "BIBL",
    "S21FMT": "briefWebRus",
    "Z21ID": "",
}

# Captures the inner text of elements with class="brief".
_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')


class ShplPlugin(HtmlScraperPlugin):
    """Archive searcher for shpl.ru (ШПИЛ, the State Public Historical Library).

    Extracts brief record entries from elements with class ``brief``.
    The remote IRBIS64 CGI endpoint is currently offline (HTTP 404).
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search ШПИЛ for books matching ``query``.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord instances with source, title, author,
            year, isbn, and publisher fields.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        params: dict[str, str] = dict(_EXTRA_PARAMS)
        params["S21ALL"] = query
        r = httpx.get(
            _URL,
            params=params,
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        html = r.text
        # Years are matched page-wide and paired with titles by index; a
        # missing year yields an empty string rather than an IndexError.
        years = YEAR_RE.findall(html)
        titles = _BRIEF_RE.findall(html)[:3]
        return [
            CandidateRecord(
                source=self.plugin_id,
                title=t.strip(),
                author="",
                year=years[i] if i < len(years) else "",
                isbn="",
                publisher="",
            )
            for i, t in enumerate(titles)
        ]
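The brief-record regex above can be exercised against a small HTML sample to
show what it captures (a standalone sketch: the sample markup and titles are
invented, only the pattern is taken from the plugin):

```python
import re

# Same pattern as _BRIEF_RE in the plugin.
BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')

sample = (
    '<div class="brief">Istoriya Moskvy. T. 1</div>'
    '<div class="brief">Letopis goroda</div>'
)

titles = [t.strip() for t in BRIEF_RE.findall(sample)[:3]]
print(titles)  # ['Istoriya Moskvy. T. 1', 'Letopis goroda']
```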