Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove `config:` subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions
--- a/tests/test_archives.py
+++ b/tests/test_archives.py
@@ -7,12 +7,16 @@ Run with:  pytest tests/ -m network
 Skip with: pytest tests/ -m "not network"  (default in presubmit)
 """

+import re
+
 import pytest

 from models import CandidateRecord
-from plugins.archives.html_scraper import HtmlScraperPlugin
+from plugins.archives.alib import AlibPlugin
 from plugins.archives.openlibrary import OpenLibraryPlugin
 from plugins.archives.rsl import RSLPlugin
+from plugins.archives.rusneb import RusnebPlugin
+from plugins.archives.shpl import ShplPlugin
 from plugins.archives.sru_catalog import SRUCatalogPlugin
 from plugins.rate_limiter import RateLimiter

@@ -21,6 +25,8 @@ pytestmark = pytest.mark.network
 _RL = RateLimiter()
 _TIMEOUT = 15

+_YEAR_PAT = re.compile(r"^\d{4}$")
+

 def _titles(results: list[CandidateRecord]) -> list[str]:
    return [r["title"] for r in results]
@@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]:
    return [r["author"] for r in results]


+def _years(results: list[CandidateRecord]) -> list[str]:
+    return [r["year"] for r in results]
+
+
 def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
    """Return True if any result title contains fragment (case-insensitive)."""
    low = fragment.lower()
@@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
    return any(low in r["author"].lower() for r in results)


+def _valid_year(year: str) -> bool:
+    """Return True if year is a 4-digit string or empty."""
+    return year == "" or bool(_YEAR_PAT.match(year))
+
+
 # ── OpenLibrary ───────────────────────────────────────────────────────────────


@@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None:
    assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
    # OpenLibrary stores authors in their original language; accept both forms.
    assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    # OpenLibrary returns isbn and publisher from its JSON API.
+    assert all(isinstance(r["isbn"], str) for r in results)
+    assert all(isinstance(r["publisher"], str) for r in results)


 # ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
@@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None:
    assert results, "RSL returned no results"
    assert all(r["source"] == "rsl" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)


 # ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────


 def test_rusneb_voina_i_mir() -> None:
-    plugin = HtmlScraperPlugin(
+    plugin = RusnebPlugin(
        plugin_id="rusneb",
        name="НЭБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
-        config={
-            "url": "https://rusneb.ru/search/",
-            "search_param": "q",
-            "img_alt": True,
-            "author_class": "search-list__item_subtext",
-        },
+        config={},
    )
    results = plugin.search("Война и мир Толстой")
    assert results, "НЭБ returned no results"
    assert all(r["source"] == "rusneb" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    assert _has_author(results, "толст"), f"authors={_authors(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)


 # ── Alib ─────────────────────────────────────────────────────────────────────


 def test_alib_voina_i_mir() -> None:
-    plugin = HtmlScraperPlugin(
+    plugin = AlibPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
-        config={
-            "url": "https://www.alib.ru/find3.php4",
-            "search_param": "tfind",
-            "extra_params": {"f": "5", "s": "0"},
-            "encoding": "cp1251",
-            "bold_text": True,
-        },
+        config={},
    )
    results = plugin.search("Война и мир Толстой")
    assert results, "Alib returned no results"
    assert all(r["source"] == "alib_web" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    assert _has_author(results, "толст"), f"authors={_authors(results)}"
+    # Alib entries always include a publication year in the bibliographic text.
+    assert all(_YEAR_PAT.match(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)


 # ── НЛР (SRU) ────────────────────────────────────────────────────────────────
@@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None:
    assert results, "НЛР returned no results"
    assert all(r["source"] == "nlr" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)


 # ── ШПИЛ ─────────────────────────────────────────────────────────────────────
@@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None:

 @pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
 def test_shpl_voina_i_mir() -> None:
-    plugin = HtmlScraperPlugin(
+    plugin = ShplPlugin(
        plugin_id="shpl",
        name="ШПИЛ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
-        config={
-            "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
-            "search_param": "S21ALL",
-            "extra_params": {
-                "C21COM": "S",
-                "I21DBN": "BIBL",
-                "P21DBN": "BIBL",
-                "S21FMT": "briefWebRus",
-                "Z21ID": "",
-            },
-            "brief_class": "brief",
-        },
+        config={},
    )
    results = plugin.search("Война и мир")
    assert results, "ШПИЛ returned no results"
    assert all(r["source"] == "shpl" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)