Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds. - html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts) - rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates - alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values - shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl - config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively - plugins/__init__.py: register new specific types, remove html_scraper - tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
203 lines
7.9 KiB
Python
203 lines
7.9 KiB
Python
"""Network integration tests for archive searcher plugins.
|
|
|
|
Each test queries a live external service for "War and Peace" by Tolstoy,
|
|
a book universally catalogued in all supported archives.
|
|
|
|
Run with: pytest tests/ -m network
|
|
Skip with: pytest tests/ -m "not network" (default in presubmit)
|
|
"""
|
|
|
|
import re
|
|
|
|
import pytest
|
|
|
|
from models import CandidateRecord
|
|
from plugins.archives.alib import AlibPlugin
|
|
from plugins.archives.openlibrary import OpenLibraryPlugin
|
|
from plugins.archives.rsl import RSLPlugin
|
|
from plugins.archives.rusneb import RusnebPlugin
|
|
from plugins.archives.shpl import ShplPlugin
|
|
from plugins.archives.sru_catalog import SRUCatalogPlugin
|
|
from plugins.rate_limiter import RateLimiter
|
|
|
|
# Apply the "network" marker to every test in this module so the whole file
# can be selected with ``-m network`` or excluded with ``-m "not network"``.
pytestmark = pytest.mark.network

# A single RateLimiter instance handed to every plugin constructed below.
_RL = RateLimiter()
# Value passed as ``timeout`` to every plugin constructor in this module.
# NOTE(review): presumably seconds — confirm against the plugin base class.
_TIMEOUT = 15
|
|
|
|
# Matches a string made of exactly four digits and nothing else.
# ``\Z`` is used instead of ``$`` because ``$`` also matches just before a
# trailing newline, which would let a value such as "1869\n" count as a year.
_YEAR_PAT = re.compile(r"^\d{4}\Z")
|
|
|
|
|
|
def _titles(results: list[CandidateRecord]) -> list[str]:
    """Return the ``title`` value of each record, in input order."""
    collected = []
    for record in results:
        collected.append(record["title"])
    return collected
|
|
|
|
|
|
def _authors(results: list[CandidateRecord]) -> list[str]:
    """Return the ``author`` value of each record, in input order."""
    collected = []
    for record in results:
        collected.append(record["author"])
    return collected
|
|
|
|
|
|
def _years(results: list[CandidateRecord]) -> list[str]:
    """Return the ``year`` value of each record, in input order."""
    collected = []
    for record in results:
        collected.append(record["year"])
    return collected
|
|
|
|
|
|
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
    """Report whether some record's title contains ``fragment``.

    Both the fragment and each title are lower-cased before the substring
    check, so the comparison ignores case.
    """
    needle = fragment.lower()
    for record in results:
        if needle in record["title"].lower():
            return True
    return False
|
|
|
|
|
|
def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
    """Report whether some record's author contains ``fragment``.

    Both the fragment and each author are lower-cased before the substring
    check, so the comparison ignores case.
    """
    needle = fragment.lower()
    for record in results:
        if needle in record["author"].lower():
            return True
    return False
|
|
|
|
|
|
def _valid_year(year: str) -> bool:
    """Accept an empty string or a value matched by ``_YEAR_PAT``.

    An empty string is allowed because a record may carry no year at all.
    """
    if year == "":
        return True
    return _YEAR_PAT.match(year) is not None
|
|
|
|
|
|
# ── OpenLibrary ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_openlibrary_war_and_peace() -> None:
    """Search OpenLibrary for "War and Peace Tolstoy" and check each record field."""
    searcher = OpenLibraryPlugin(
        plugin_id="openlibrary",
        name="OpenLibrary",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    hits = searcher.search("War and Peace Tolstoy")

    assert hits, "OpenLibrary returned no results"
    for hit in hits:
        assert hit["source"] == "openlibrary"
    assert _has_title(hits, "war and peace"), f"titles={_titles(hits)}"
    # OpenLibrary stores authors in their original language; accept both forms.
    assert any(_has_author(hits, frag) for frag in ("tolstoy", "толст")), f"authors={_authors(hits)}"
    assert all(_valid_year(hit["year"]) for hit in hits), f"years={_years(hits)}"
    # OpenLibrary returns isbn and publisher from its JSON API.
    for hit in hits:
        assert isinstance(hit["isbn"], str)
    for hit in hits:
        assert isinstance(hit["publisher"], str)
|
|
|
|
|
|
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_rsl_voina_i_mir() -> None:
    """Search RSL for "Толстой Война и мир" and check each record field."""
    searcher = RSLPlugin(
        plugin_id="rsl",
        name="РГБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    hits = searcher.search("Толстой Война и мир")

    assert hits, "RSL returned no results"
    for hit in hits:
        assert hit["source"] == "rsl"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert all(_valid_year(hit["year"]) for hit in hits), f"years={_years(hits)}"
    # This test expects isbn and publisher to be empty strings on every record.
    for hit in hits:
        assert hit["isbn"] == ""
    for hit in hits:
        assert hit["publisher"] == ""
|
|
|
|
|
|
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_rusneb_voina_i_mir() -> None:
    """Search НЭБ for "Война и мир Толстой" and check each record field."""
    searcher = RusnebPlugin(
        plugin_id="rusneb",
        name="НЭБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    hits = searcher.search("Война и мир Толстой")

    assert hits, "НЭБ returned no results"
    for hit in hits:
        assert hit["source"] == "rusneb"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert _has_author(hits, "толст"), f"authors={_authors(hits)}"
    assert all(_valid_year(hit["year"]) for hit in hits), f"years={_years(hits)}"
    # This test expects isbn and publisher to be empty strings on every record.
    for hit in hits:
        assert hit["isbn"] == ""
    for hit in hits:
        assert hit["publisher"] == ""
|
|
|
|
|
|
# ── Alib ─────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_alib_voina_i_mir() -> None:
    """Search Alib for "Война и мир Толстой" and check each record field.

    Unlike the other tests, the year must be present on every record, so an
    empty year string fails this test.
    """
    searcher = AlibPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config={},
    )
    hits = searcher.search("Война и мир Толстой")

    assert hits, "Alib returned no results"
    for hit in hits:
        assert hit["source"] == "alib_web"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert _has_author(hits, "толст"), f"authors={_authors(hits)}"
    # Alib entries always include a publication year in the bibliographic text.
    assert all(_YEAR_PAT.match(hit["year"]) is not None for hit in hits), f"years={_years(hits)}"
    for hit in hits:
        assert hit["isbn"] == ""
    for hit in hits:
        assert hit["publisher"] == ""
|
|
|
|
|
|
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
|
|
# The NLR SRU endpoint (www.nlr.ru/search/query) no longer exists (HTTP 404).
|
|
|
|
|
|
@pytest.mark.xfail(reason="nlr.ru SRU endpoint no longer available (HTTP 404)", strict=False)
def test_nlr_voina_i_mir() -> None:
    """Search НЛР through the SRU plugin for "Война и мир" and check each record field.

    Marked as a non-strict expected failure, so the test is reported as
    xfail while the endpoint is down and as xpass if it starts working again.
    """
    sru_settings = {
        "url": "http://www.nlr.ru/search/query",
        "query_prefix": "title=",
    }
    searcher = SRUCatalogPlugin(
        plugin_id="nlr",
        name="НЛР",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=sru_settings,
    )
    hits = searcher.search("Война и мир")

    assert hits, "НЛР returned no results"
    for hit in hits:
        assert hit["source"] == "nlr"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert all(_valid_year(hit["year"]) for hit in hits), f"years={_years(hits)}"
    for hit in hits:
        assert hit["isbn"] == ""
    for hit in hits:
        assert hit["publisher"] == ""
|
|
|
|
|
|
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
|
|
# The ШПИЛ IRBIS64 CGI endpoint no longer exists (HTTP 404).
|
|
|
|
|
|
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
def test_shpl_voina_i_mir() -> None:
    """Search ШПИЛ for "Война и мир" and check each record field.

    Marked as a non-strict expected failure, so the test is reported as
    xfail while the endpoint is down and as xpass if it starts working again.
    """
    searcher = ShplPlugin(
        plugin_id="shpl",
        name="ШПИЛ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config={},
    )
    hits = searcher.search("Война и мир")

    assert hits, "ШПИЛ returned no results"
    for hit in hits:
        assert hit["source"] == "shpl"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert all(_valid_year(hit["year"]) for hit in hits), f"years={_years(hits)}"
    for hit in hits:
        assert hit["isbn"] == ""
    for hit in hits:
        assert hit["publisher"] == ""
|