Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions

View File

@@ -7,12 +7,16 @@ Run with: pytest tests/ -m network
Skip with: pytest tests/ -m "not network" (default in presubmit)
"""
import re
import pytest
from models import CandidateRecord
from plugins.archives.html_scraper import HtmlScraperPlugin
from plugins.archives.alib import AlibPlugin
from plugins.archives.openlibrary import OpenLibraryPlugin
from plugins.archives.rsl import RSLPlugin
from plugins.archives.rusneb import RusnebPlugin
from plugins.archives.shpl import ShplPlugin
from plugins.archives.sru_catalog import SRUCatalogPlugin
from plugins.rate_limiter import RateLimiter
@@ -21,6 +25,8 @@ pytestmark = pytest.mark.network
# Single RateLimiter instance, passed as ``rate_limiter`` to the plugins
# constructed by the tests in this module.
_RL = RateLimiter()
# Value passed as each plugin's ``timeout`` argument
# (presumably seconds — confirm against the plugin base class).
_TIMEOUT = 15
# Matches a year written as exactly four digits. NOTE: when used with
# ``.match()``, the trailing ``$`` also succeeds just before a final newline.
_YEAR_PAT = re.compile(r"^\d{4}$")
def _titles(results: list[CandidateRecord]) -> list[str]:
    """Collect the ``title`` field of each record, preserving input order."""
    collected: list[str] = []
    for record in results:
        collected.append(record["title"])
    return collected
@@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]:
return [r["author"] for r in results]
def _years(results: list[CandidateRecord]) -> list[str]:
    """Collect the ``year`` field of each record, preserving input order."""
    collected: list[str] = []
    for record in results:
        collected.append(record["year"])
    return collected
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
"""Return True if any result title contains fragment (case-insensitive)."""
low = fragment.lower()
@@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
return any(low in r["author"].lower() for r in results)
def _valid_year(year: str) -> bool:
    """Return True if year is empty or consists of exactly four digits.

    Args:
        year: The ``year`` field of a candidate record.

    Returns:
        True for ``""`` and for strings made of exactly four digits;
        False for anything else, including a four-digit year followed by
        a newline.
    """
    # fullmatch rather than match: with match, the pattern's ``$`` also
    # succeeds just before a trailing newline, so "1869\n" would be accepted.
    return year == "" or _YEAR_PAT.fullmatch(year) is not None
# ── OpenLibrary ───────────────────────────────────────────────────────────────
@@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None:
assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
# OpenLibrary stores authors in their original language; accept both forms.
assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
# OpenLibrary returns isbn and publisher from its JSON API.
assert all(isinstance(r["isbn"], str) for r in results)
assert all(isinstance(r["publisher"], str) for r in results)
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
@@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None:
assert results, "RSL returned no results"
assert all(r["source"] == "rsl" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
assert all(r["isbn"] == "" for r in results)
assert all(r["publisher"] == "" for r in results)
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
def test_rusneb_voina_i_mir() -> None:
    """Query НЭБ through RusnebPlugin and check every CandidateRecord field.

    Asserts that the search returns results, that each record carries the
    ``rusneb`` source, that title and author fragments are present, that
    every year is empty or four digits, and that isbn and publisher are
    empty strings.
    """
    plugin = RusnebPlugin(
        plugin_id="rusneb",
        name="НЭБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        # The URL and parsing rules now live in RusnebPlugin itself, so no
        # per-site settings are passed here.
        config={},
    )
    results = plugin.search("Война и мир Толстой")
    assert results, "НЭБ returned no results"
    assert all(r["source"] == "rusneb" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    assert _has_author(results, "толст"), f"authors={_authors(results)}"
    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
    assert all(r["isbn"] == "" for r in results)
    assert all(r["publisher"] == "" for r in results)
# ── Alib ─────────────────────────────────────────────────────────────────────
def test_alib_voina_i_mir() -> None:
    """Query Alib through AlibPlugin and check every CandidateRecord field.

    Unlike the other archive tests, the year is required here: every record
    must carry a four-digit year, and an empty year fails the test.
    """
    plugin = AlibPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        # The URL, request parameters and parsing rules now live in
        # AlibPlugin itself, so no per-site settings are passed here.
        config={},
    )
    results = plugin.search("Война и мир Толстой")
    assert results, "Alib returned no results"
    assert all(r["source"] == "alib_web" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    assert _has_author(results, "толст"), f"authors={_authors(results)}"
    # Alib entries always include a publication year in the bibliographic text.
    # fullmatch (not match) so a year followed by a stray newline is rejected.
    assert all(_YEAR_PAT.fullmatch(r["year"]) for r in results), f"years={_years(results)}"
    assert all(r["isbn"] == "" for r in results)
    assert all(r["publisher"] == "" for r in results)
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
@@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None:
assert results, "НЛР returned no results"
assert all(r["source"] == "nlr" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
assert all(r["isbn"] == "" for r in results)
assert all(r["publisher"] == "" for r in results)
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
@@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None:
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
def test_shpl_voina_i_mir() -> None:
    """Query ШПИЛ through ShplPlugin and check every CandidateRecord field.

    Marked xfail (non-strict) because the remote endpoint is reported dead;
    the test is kept so it passes again if the endpoint returns.
    """
    plugin = ShplPlugin(
        plugin_id="shpl",
        name="ШПИЛ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        # The endpoint URL and its fixed query parameters now live in
        # ShplPlugin itself, so no per-site settings are passed here.
        config={},
    )
    results = plugin.search("Война и мир")
    assert results, "ШПИЛ returned no results"
    assert all(r["source"] == "shpl" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
    assert all(r["isbn"] == "" for r in results)
    assert all(r["publisher"] == "" for r in results)