"""ШПИЛ archive search plugin. Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin produces no results. The class is retained so the configuration entry can be re-enabled if the endpoint is restored. """ import re import httpx from models import CandidateRecord from .html_scraper import YEAR_RE, HtmlScraperPlugin _URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe" _DOMAIN = "www.shpl.ru" _EXTRA_PARAMS: dict[str, str] = { "C21COM": "S", "I21DBN": "BIBL", "P21DBN": "BIBL", "S21FMT": "briefWebRus", "Z21ID": "", } _BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<') class ShplPlugin(HtmlScraperPlugin): """Archive searcher for shpl.ru (ШПИЛ — Государственная публичная историческая библиотека). Extracts brief record entries from elements with class ``brief``. The remote IRBIS64 CGI endpoint is currently offline (HTTP 404). """ def search(self, query: str) -> list[CandidateRecord]: """Search ШПИЛ for books matching query. Args: query: Free-text search string. Returns: Up to three CandidateRecord dicts with source, title, author, year, isbn, and publisher fields. """ self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds) params: dict[str, str] = dict(_EXTRA_PARAMS) params["S21ALL"] = query r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}) html = r.text years = YEAR_RE.findall(html) titles = _BRIEF_RE.findall(html)[:3] return [ CandidateRecord( source=self.plugin_id, title=t.strip(), author="", year=years[i] if i < len(years) else "", isbn="", publisher="", ) for i, t in enumerate(titles) ]