Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions
--- a/src/plugins/archives/html_scraper.py
+++ b/src/plugins/archives/html_scraper.py
@@ -1,27 +1,17 @@
-"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
+"""Base class and shared HTML parsing utilities for archive scraper plugins."""

 import re
 from typing import Any
-from urllib.parse import quote, urlparse
-
-import httpx

 from models import CandidateRecord

 from ..rate_limiter import RateLimiter

-_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
-
-# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
-_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
+YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
+AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)


-def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
-    # Support both single and double-quoted class attributes.
-    return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
-
-
-def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
+def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
    """Extract text content from elements whose class contains cls_frag.

    Strips inner HTML tags and normalises whitespace, so elements like
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
    return out


-def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
+def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
    """Extract non-empty alt attributes from <img> tags, normalising whitespace.

    Args:
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:


 class HtmlScraperPlugin:
-    """Config-driven HTML scraper.
+    """Base class for HTML-scraping archive plugins.

-    Supported config keys:
-      url               — search URL
-      search_param      — query param name
-      extra_params      — dict of fixed extra query parameters
-      encoding          — character encoding for query and response (e.g. "cp1251")
-      title_class       — CSS class fragment for title elements (class-based strategy)
-      author_class      — CSS class fragment for author elements
-      link_href_pattern — href regex to find title <a> links (link strategy)
-      brief_class       — CSS class for brief record rows (brief strategy)
-      img_alt           — truthy: extract titles from <img alt> attributes (rusneb strategy)
-      bold_text         — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
+    Handles common initialisation; subclasses implement search() with
+    site-specific hardcoded logic.  The config dict is accepted for
+    registry compatibility but is not used by the base class; all scraping
+    details are hardcoded in the subclass.
    """

    category = "archive_searchers"
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
        self.rate_limit_seconds = rate_limit_seconds
        self.auto_queue = auto_queue
        self.timeout = timeout
-        self.config = config
-        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id

    def search(self, query: str) -> list[CandidateRecord]:
        """Search for books matching query.

        Args:
-            query: Free-text search string (author, title, keywords).
+            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields.
        """
-        cfg = self.config
-        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
-
-        encoding = str(cfg.get("encoding") or "")
-        if encoding:
-            # Encode query and extra params in the site's native encoding.
-            q_enc = quote(query.encode(encoding, "replace"))
-            ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
-            ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
-            raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
-            r = httpx.get(
-                f'{cfg["url"]}?{raw_qs}',
-                timeout=self.timeout,
-                headers={"User-Agent": "Mozilla/5.0"},
-            )
-            html = r.content.decode(encoding, errors="replace")
-        else:
-            params: dict[str, Any] = dict(cfg.get("extra_params") or {})
-            params[cfg["search_param"]] = query
-            r = httpx.get(
-                cfg["url"],
-                params=params,
-                timeout=self.timeout,
-                headers={"User-Agent": "Mozilla/5.0"},
-            )
-            html = r.text
-
-        years = _YEAR_RE.findall(html)
-
-        if cfg.get("bold_text"):
-            return self._parse_bold_text(html, years)
-        if cfg.get("img_alt"):
-            return self._parse_img_alt(html, years, cfg)
-        if "link_href_pattern" in cfg:
-            return self._parse_link(html, years, cfg)
-        if "brief_class" in cfg:
-            return self._parse_brief(html, years, cfg)
-        return self._parse_class(html, years, cfg)
-
-    def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
-        """Extract records from ``<p><b>text</b>`` entries (Alib-style).
-
-        The bold text is expected to begin with ``Surname I.N. Title…``; the
-        author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
-
-        Args:
-            html: Decoded HTML response.
-            years: Year strings found in the full HTML (used positionally).
-
-        Returns:
-            Up to three CandidateRecord dicts.
-        """
-        entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
-        out: list[CandidateRecord] = []
-        for i, entry in enumerate(entries):
-            text = entry.strip()
-            m = _AUTHOR_PREFIX_PAT.match(text)
-            if m:
-                author = m.group(1).strip()
-                title = m.group(2).strip()
-            else:
-                author = ""
-                title = text
-            out.append(
-                CandidateRecord(
-                    source=self.plugin_id,
-                    title=title,
-                    author=author,
-                    year=years[i] if i < len(years) else "",
-                    isbn="",
-                    publisher="",
-                )
-            )
-        return out
-
-    def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        """Extract records using ``<img alt>`` for titles and a CSS class for authors.
-
-        Used for sites like rusneb.ru where thumbnail alt attributes carry the
-        book title and a separate span contains the author.
-
-        Args:
-            html: Decoded HTML response.
-            years: Year strings found in the full HTML (used positionally).
-            cfg: Plugin config dict (reads ``author_class``).
-
-        Returns:
-            Up to three CandidateRecord dicts.
-        """
-        titles = _img_alts(html)
-        authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title,
-                author=authors[i] if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
-        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title.strip(),
-                author=authors[i].strip() if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        href_pat = cfg.get("link_href_pattern", r"")
-        titles = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)[:3]
-        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title.strip(),
-                author=authors[i].strip() if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=t.strip(),
-                author="",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, t in enumerate(titles)
-        ]
+        raise NotImplementedError