Fix archive plugins for НЭБ and Alib; add network integration tests

- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text
  strategy (Alib entries from <p><b>), Windows-1251 encoding support,
  _cls_inner_texts() helper that strips inner HTML tags
- rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL
  title:(words) AND author:(word) query format
- config: update rusneb (img_alt + correct author_class) and alib_web
  (encoding + bold_text) to match fixed plugin strategies
- tests: add tests/test_archives.py with network-marked tests for all six
  archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404)
- presubmit: exclude network tests from default run (-m "not network")

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-09 22:59:19 +03:00
parent ce03046e51
commit b8f82607f9
6 changed files with 458 additions and 42 deletions

189
tests/test_archives.py Normal file
View File

@@ -0,0 +1,189 @@
"""Network integration tests for archive searcher plugins.
Each test queries a live external service for "War and Peace" by Tolstoy,
a book universally catalogued in all supported archives.
Run with: pytest tests/ -m network
Skip with: pytest tests/ -m "not network" (default in presubmit)
"""
import pytest
from models import CandidateRecord
from plugins.archives.html_scraper import HtmlScraperPlugin
from plugins.archives.openlibrary import OpenLibraryPlugin
from plugins.archives.rsl import RSLPlugin
from plugins.archives.sru_catalog import SRUCatalogPlugin
from plugins.rate_limiter import RateLimiter
# Apply the ``network`` marker to every test in this module so the whole file
# can be selected with ``-m network`` or excluded with ``-m "not network"``.
pytestmark = pytest.mark.network
# Single rate limiter instance handed to every plugin constructed below.
_RL = RateLimiter()
# Value passed as ``timeout`` to each plugin (presumably seconds — confirm
# against the plugin base class).
_TIMEOUT = 15
def _titles(results: list[CandidateRecord]) -> list[str]:
return [r["title"] for r in results]
def _authors(results: list[CandidateRecord]) -> list[str]:
return [r["author"] for r in results]
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
"""Return True if any result title contains fragment (case-insensitive)."""
low = fragment.lower()
return any(low in r["title"].lower() for r in results)
def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
"""Return True if any result author contains fragment (case-insensitive)."""
low = fragment.lower()
return any(low in r["author"].lower() for r in results)
# ── OpenLibrary ───────────────────────────────────────────────────────────────
def test_openlibrary_war_and_peace() -> None:
    """Search OpenLibrary for "War and Peace" and check title, author and source."""
    plugin_kwargs = {
        "plugin_id": "openlibrary",
        "name": "OpenLibrary",
        "rate_limiter": _RL,
        "rate_limit_seconds": 0,
        "auto_queue": True,
        "timeout": _TIMEOUT,
        "config": {},
    }
    searcher = OpenLibraryPlugin(**plugin_kwargs)

    found = searcher.search("War and Peace Tolstoy")

    assert found, "OpenLibrary returned no results"
    # Every record must be tagged with this plugin's id.
    assert not any(rec["source"] != "openlibrary" for rec in found)
    assert _has_title(found, "war and peace"), f"titles={_titles(found)}"
    # OpenLibrary stores authors in their original language, so either the
    # Latin or the Cyrillic spelling is acceptable.
    assert any(
        _has_author(found, spelling) for spelling in ("tolstoy", "толст")
    ), f"authors={_authors(found)}"
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
def test_rsl_voina_i_mir() -> None:
    """Search the RSL (РГБ) plugin for "Война и мир" and check title and source."""
    plugin_kwargs = {
        "plugin_id": "rsl",
        "name": "РГБ",
        "rate_limiter": _RL,
        "rate_limit_seconds": 0,
        "auto_queue": True,
        "timeout": _TIMEOUT,
        "config": {},
    }
    searcher = RSLPlugin(**plugin_kwargs)

    found = searcher.search("Толстой Война и мир")

    assert found, "RSL returned no results"
    # Every record must be tagged with this plugin's id.
    assert not any(rec["source"] != "rsl" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
def test_rusneb_voina_i_mir() -> None:
    """Search НЭБ (rusneb) through the HTML scraper and check title, author and source."""
    # Scraper settings for rusneb: ``img_alt`` enabled, authors read from the
    # ``search-list__item_subtext`` class.
    scraper_config = {
        "url": "https://rusneb.ru/search/",
        "search_param": "q",
        "img_alt": True,
        "author_class": "search-list__item_subtext",
    }
    searcher = HtmlScraperPlugin(
        plugin_id="rusneb",
        name="НЭБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config=scraper_config,
    )

    found = searcher.search("Война и мир Толстой")

    assert found, "НЭБ returned no results"
    # Every record must be tagged with this plugin's id.
    assert not any(rec["source"] != "rusneb" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
    assert _has_author(found, "толст"), f"authors={_authors(found)}"
# ── Alib ─────────────────────────────────────────────────────────────────────
def test_alib_voina_i_mir() -> None:
    """Search Alib through the HTML scraper and check title, author and source."""
    # Scraper settings for alib.ru: cp1251 encoding with ``bold_text`` enabled.
    scraper_config = {
        "url": "https://www.alib.ru/find3.php4",
        "search_param": "tfind",
        "extra_params": {"f": "5", "s": "0"},
        "encoding": "cp1251",
        "bold_text": True,
    }
    searcher = HtmlScraperPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=scraper_config,
    )

    found = searcher.search("Война и мир Толстой")

    assert found, "Alib returned no results"
    # Every record must be tagged with this plugin's id.
    assert not any(rec["source"] != "alib_web" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
    assert _has_author(found, "толст"), f"authors={_authors(found)}"
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
# The NLR SRU endpoint (www.nlr.ru/search/query) no longer exists (HTTP 404).
@pytest.mark.xfail(
    reason="nlr.ru SRU endpoint no longer available (HTTP 404)",
    strict=False,
)
def test_nlr_voina_i_mir() -> None:
    """Search НЛР through the SRU plugin; expected to fail while the endpoint is gone.

    ``strict=False`` means an unexpected pass is reported but does not fail
    the run.
    """
    sru_config = {
        "url": "http://www.nlr.ru/search/query",
        "query_prefix": "title=",
    }
    searcher = SRUCatalogPlugin(
        plugin_id="nlr",
        name="НЛР",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=sru_config,
    )

    found = searcher.search("Война и мир")

    assert found, "НЛР returned no results"
    # Every record must be tagged with this plugin's id.
    assert not any(rec["source"] != "nlr" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
# The ШПИЛ IRBIS64 CGI endpoint no longer exists (HTTP 404).
@pytest.mark.xfail(
    reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)",
    strict=False,
)
def test_shpl_voina_i_mir() -> None:
    """Search ШПИЛ through the HTML scraper; expected to fail while the endpoint is gone.

    ``strict=False`` means an unexpected pass is reported but does not fail
    the run.
    """
    # Fixed query-string parameters sent alongside the search term.
    irbis_params = {
        "C21COM": "S",
        "I21DBN": "BIBL",
        "P21DBN": "BIBL",
        "S21FMT": "briefWebRus",
        "Z21ID": "",
    }
    scraper_config = {
        "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
        "search_param": "S21ALL",
        "extra_params": irbis_params,
        "brief_class": "brief",
    }
    searcher = HtmlScraperPlugin(
        plugin_id="shpl",
        name="ШПИЛ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=scraper_config,
    )

    found = searcher.search("Война и мир")

    assert found, "ШПИЛ returned no results"
    # Every record must be tagged with this plugin's id.
    assert not any(rec["source"] != "shpl" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"