Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions

View File

@@ -57,28 +57,17 @@ functions:
rusneb:
name: "НЭБ"
type: html_scraper
type: rusneb
auto_queue: true
rate_limit_seconds: 5
timeout: 8
config:
url: "https://rusneb.ru/search/"
search_param: q
img_alt: true
author_class: "search-list__item_subtext"
alib_web:
name: "Alib (web)"
type: html_scraper
type: alib_web
auto_queue: false
rate_limit_seconds: 5
timeout: 8
config:
url: "https://www.alib.ru/find3.php4"
search_param: tfind
extra_params: {f: "5", s: "0"}
encoding: "cp1251"
bold_text: true
nlr:
name: "НЛР"
@@ -91,13 +80,9 @@ functions:
query_prefix: "title="
shpl:
# Endpoint currently returns HTTP 404; retained for future re-enablement.
name: "ШПИЛ"
type: html_scraper
type: shpl
auto_queue: false
rate_limit_seconds: 5
timeout: 8
config:
url: "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
search_param: S21ALL
extra_params: {C21COM: S, I21DBN: BIBL, P21DBN: BIBL, S21FMT: briefWebRus, Z21ID: ""}
brief_class: "brief"

View File

@@ -41,16 +41,20 @@ _type_to_class: dict[str, Any] = {} # populated lazily on first call
def _archive_classes() -> dict[str, Any]:
if not _type_to_class:
from .archives.html_scraper import HtmlScraperPlugin
from .archives.alib import AlibPlugin
from .archives.openlibrary import OpenLibraryPlugin
from .archives.rsl import RSLPlugin
from .archives.rusneb import RusnebPlugin
from .archives.shpl import ShplPlugin
from .archives.sru_catalog import SRUCatalogPlugin
_type_to_class.update(
{
"openlibrary": OpenLibraryPlugin,
"rsl": RSLPlugin,
"html_scraper": HtmlScraperPlugin,
"rusneb": RusnebPlugin,
"alib_web": AlibPlugin,
"shpl": ShplPlugin,
"sru_catalog": SRUCatalogPlugin,
}
)

View File

@@ -0,0 +1,70 @@
"""Alib (alib.ru) archive search plugin."""
import re
from urllib.parse import quote
import httpx
from models import CandidateRecord
from .html_scraper import AUTHOR_PREFIX_PAT, YEAR_RE, HtmlScraperPlugin
_URL = "https://www.alib.ru/find3.php4"
_DOMAIN = "www.alib.ru"
_ENCODING = "cp1251"
_EXTRA_PARAMS: dict[str, str] = {"f": "5", "s": "0"}
# Book entries appear as <p><b>Author Title Year Publisher…</b>
_ENTRY_RE = re.compile(r"<p><b>([^<]{5,200})</b>")
class AlibPlugin(HtmlScraperPlugin):
    """Archive search plugin for alib.ru.

    The query string is assembled by hand so that every value is
    percent-encoded from its cp1251 bytes; the response body is decoded with
    the same codec. Each ``<p><b>...</b>`` match becomes one record: a leading
    "Surname I.N." prefix, when present, is split off as the author and the
    remainder is the title. The year is the first year-like token found inside
    that same entry, not a value taken from elsewhere on the page.
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Query alib.ru and convert the first matching entries into records.

        Args:
            query: Free-text search string.

        Returns:
            At most three CandidateRecord items. ``isbn`` and ``publisher``
            are always empty strings; ``author`` and ``year`` are empty when
            they cannot be found in the entry text.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)

        def _pct(value: str) -> str:
            # Percent-encode from the site's native byte encoding.
            return quote(value.encode(_ENCODING, "replace"))

        # The search term goes first, followed by the fixed extra parameters.
        fragments = [f"tfind={_pct(query)}"]
        fragments.extend(f"{key}={_pct(str(val))}" for key, val in _EXTRA_PARAMS.items())
        query_string = "&".join(fragments)

        response = httpx.get(
            f"{_URL}?{query_string}",
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        page = response.content.decode(_ENCODING, errors="replace")

        records: list[CandidateRecord] = []
        for raw_entry in _ENTRY_RE.findall(page)[:3]:
            entry_text = raw_entry.strip()

            # Year comes from this entry only.
            year_match = YEAR_RE.search(entry_text)
            found_year = "" if year_match is None else year_match.group(0)

            # Split "Surname I.N. rest" into author and title where possible.
            prefix_match = AUTHOR_PREFIX_PAT.match(entry_text)
            if prefix_match is None:
                who, what = "", entry_text
            else:
                who = prefix_match.group(1).strip()
                what = prefix_match.group(2).strip()

            records.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=what,
                    author=who,
                    year=found_year,
                    isbn="",
                    publisher="",
                )
            )
        return records

View File

@@ -1,27 +1,17 @@
"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
import re
from typing import Any
from urllib.parse import quote, urlparse
import httpx
from models import CandidateRecord
from ..rate_limiter import RateLimiter
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
# Support both single and double-quoted class attributes.
return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
"""Extract text content from elements whose class contains cls_frag.
Strips inner HTML tags and normalises whitespace, so elements like
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
return out
def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
"""Extract non-empty alt attributes from <img> tags, normalising whitespace.
Args:
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
class HtmlScraperPlugin:
"""Config-driven HTML scraper.
"""Base class for HTML-scraping archive plugins.
Supported config keys:
url — search URL
search_param — query param name
extra_params — dict of fixed extra query parameters
encoding — character encoding for query and response (e.g. "cp1251")
title_class — CSS class fragment for title elements (class-based strategy)
author_class — CSS class fragment for author elements
link_href_pattern — href regex to find title <a> links (link strategy)
brief_class — CSS class for brief record rows (brief strategy)
img_alt — truthy: extract titles from <img alt> attributes (rusneb strategy)
bold_text — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
Handles common initialisation; subclasses implement search() with
site-specific hardcoded logic. The config dict is accepted for
registry compatibility but is not used by the base class; all scraping
details are hardcoded in the subclass.
"""
category = "archive_searchers"
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
self.rate_limit_seconds = rate_limit_seconds
self.auto_queue = auto_queue
self.timeout = timeout
self.config = config
self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
def search(self, query: str) -> list[CandidateRecord]:
"""Search for books matching query.
Args:
query: Free-text search string (author, title, keywords).
query: Free-text search string.
Returns:
Up to three CandidateRecord dicts with source, title, author, year,
isbn, and publisher fields.
"""
cfg = self.config
self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
encoding = str(cfg.get("encoding") or "")
if encoding:
# Encode query and extra params in the site's native encoding.
q_enc = quote(query.encode(encoding, "replace"))
ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
r = httpx.get(
f'{cfg["url"]}?{raw_qs}',
timeout=self.timeout,
headers={"User-Agent": "Mozilla/5.0"},
)
html = r.content.decode(encoding, errors="replace")
else:
params: dict[str, Any] = dict(cfg.get("extra_params") or {})
params[cfg["search_param"]] = query
r = httpx.get(
cfg["url"],
params=params,
timeout=self.timeout,
headers={"User-Agent": "Mozilla/5.0"},
)
html = r.text
years = _YEAR_RE.findall(html)
if cfg.get("bold_text"):
return self._parse_bold_text(html, years)
if cfg.get("img_alt"):
return self._parse_img_alt(html, years, cfg)
if "link_href_pattern" in cfg:
return self._parse_link(html, years, cfg)
if "brief_class" in cfg:
return self._parse_brief(html, years, cfg)
return self._parse_class(html, years, cfg)
def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
"""Extract records from ``<p><b>text</b>`` entries (Alib-style).
The bold text is expected to begin with ``Surname I.N. Title…``; the
author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
Args:
html: Decoded HTML response.
years: Year strings found in the full HTML (used positionally).
Returns:
Up to three CandidateRecord dicts.
"""
entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
out: list[CandidateRecord] = []
for i, entry in enumerate(entries):
text = entry.strip()
m = _AUTHOR_PREFIX_PAT.match(text)
if m:
author = m.group(1).strip()
title = m.group(2).strip()
else:
author = ""
title = text
out.append(
CandidateRecord(
source=self.plugin_id,
title=title,
author=author,
year=years[i] if i < len(years) else "",
isbn="",
publisher="",
)
)
return out
def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
"""Extract records using ``<img alt>`` for titles and a CSS class for authors.
Used for sites like rusneb.ru where thumbnail alt attributes carry the
book title and a separate span contains the author.
Args:
html: Decoded HTML response.
years: Year strings found in the full HTML (used positionally).
cfg: Plugin config dict (reads ``author_class``).
Returns:
Up to three CandidateRecord dicts.
"""
titles = _img_alts(html)
authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
return [
CandidateRecord(
source=self.plugin_id,
title=title,
author=authors[i] if i < len(authors) else "",
year=years[i] if i < len(years) else "",
isbn="",
publisher="",
)
for i, title in enumerate(titles)
]
def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
return [
CandidateRecord(
source=self.plugin_id,
title=title.strip(),
author=authors[i].strip() if i < len(authors) else "",
year=years[i] if i < len(years) else "",
isbn="",
publisher="",
)
for i, title in enumerate(titles)
]
def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
href_pat = cfg.get("link_href_pattern", r"")
titles = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)[:3]
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
return [
CandidateRecord(
source=self.plugin_id,
title=title.strip(),
author=authors[i].strip() if i < len(authors) else "",
year=years[i] if i < len(years) else "",
isbn="",
publisher="",
)
for i, title in enumerate(titles)
]
def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3]
return [
CandidateRecord(
source=self.plugin_id,
title=t.strip(),
author="",
year=years[i] if i < len(years) else "",
isbn="",
publisher="",
)
for i, t in enumerate(titles)
]
raise NotImplementedError

View File

@@ -0,0 +1,64 @@
"""НЭБ (rusneb.ru) archive search plugin."""
import re
import httpx
from models import CandidateRecord
from .html_scraper import HtmlScraperPlugin, YEAR_RE, cls_inner_texts, img_alts
_URL = "https://rusneb.ru/search/"
_DOMAIN = "rusneb.ru"
_AUTHOR_CLASS = "search-list__item_subtext"
# Each search result is a <li> whose class contains search-list__item not followed
# by an underscore, which excludes sub-element classes such as search-list__item_subtext.
_ITEM_RE = re.compile(
r'<li[^>]*class=["\'][^"\']*search-list__item(?!_)[^"\']*["\'][^>]*>(.*?)</li>',
re.DOTALL,
)
class RusnebPlugin(HtmlScraperPlugin):
    """Archive search plugin for rusneb.ru (НЭБ).

    The result page is cut into list items with ``_ITEM_RE``. Within each
    item the first ``<img alt>`` text is used as the title, the first element
    whose class contains ``search-list__item_subtext`` as the author, and the
    first year-like token as the year. Items without a usable ``alt`` text
    are ignored.
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Query НЭБ and convert the first matching list items into records.

        Args:
            query: Free-text search string.

        Returns:
            At most three CandidateRecord items. ``isbn`` and ``publisher``
            are always empty strings; ``author`` and ``year`` are empty when
            nothing suitable is found inside the item.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        response = httpx.get(
            _URL,
            params={"q": query},
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        page = response.text

        records: list[CandidateRecord] = []
        for fragment in _ITEM_RE.findall(page):
            titles = img_alts(fragment)
            if titles:
                names = cls_inner_texts(fragment, _AUTHOR_CLASS, 3, 80)
                # Searching only this item's HTML keeps page-level dates out.
                year_hit = YEAR_RE.search(fragment)
                records.append(
                    CandidateRecord(
                        source=self.plugin_id,
                        title=titles[0],
                        author=names[0] if names else "",
                        year=year_hit.group(0) if year_hit else "",
                        isbn="",
                        publisher="",
                    )
                )
                # Stop as soon as three records have been collected.
                if len(records) == 3:
                    break
        return records

View File

@@ -0,0 +1,63 @@
"""ШПИЛ archive search plugin.
Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
produces no results. The class is retained so the configuration entry can
be re-enabled if the endpoint is restored.
"""
import re
import httpx
from models import CandidateRecord
from .html_scraper import YEAR_RE, HtmlScraperPlugin
_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
_DOMAIN = "www.shpl.ru"
_EXTRA_PARAMS: dict[str, str] = {
"C21COM": "S",
"I21DBN": "BIBL",
"P21DBN": "BIBL",
"S21FMT": "briefWebRus",
"Z21ID": "",
}
_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')
class ShplPlugin(HtmlScraperPlugin):
    """Archive search plugin for shpl.ru (ШПИЛ).

    Titles are taken from the text that follows an element whose class
    attribute is exactly ``brief``. The IRBIS64 CGI endpoint is documented in
    this module as currently returning HTTP 404, so in practice no records
    are produced.
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Query ШПИЛ and convert the first brief entries into records.

        Args:
            query: Free-text search string.

        Returns:
            At most three CandidateRecord items. ``author``, ``isbn`` and
            ``publisher`` are always empty strings.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)

        # Fixed IRBIS parameters first, then the search term itself.
        request_params: dict[str, str] = {**_EXTRA_PARAMS, "S21ALL": query}
        response = httpx.get(
            _URL,
            params=request_params,
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        page = response.text

        # NOTE(review): years are collected from the whole page and paired
        # with titles by position, so a year is not guaranteed to belong to
        # the entry it is attached to — confirm against live markup if the
        # endpoint is ever restored.
        page_years = YEAR_RE.findall(page)
        brief_texts = _BRIEF_RE.findall(page)[:3]

        records: list[CandidateRecord] = []
        for position, brief in enumerate(brief_texts):
            matched_year = page_years[position] if position < len(page_years) else ""
            records.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=brief.strip(),
                    author="",
                    year=matched_year,
                    isbn="",
                    publisher="",
                )
            )
        return records

View File

@@ -7,12 +7,16 @@ Run with: pytest tests/ -m network
Skip with: pytest tests/ -m "not network" (default in presubmit)
"""
import re
import pytest
from models import CandidateRecord
from plugins.archives.html_scraper import HtmlScraperPlugin
from plugins.archives.alib import AlibPlugin
from plugins.archives.openlibrary import OpenLibraryPlugin
from plugins.archives.rsl import RSLPlugin
from plugins.archives.rusneb import RusnebPlugin
from plugins.archives.shpl import ShplPlugin
from plugins.archives.sru_catalog import SRUCatalogPlugin
from plugins.rate_limiter import RateLimiter
@@ -21,6 +25,8 @@ pytestmark = pytest.mark.network
_RL = RateLimiter()
_TIMEOUT = 15
_YEAR_PAT = re.compile(r"^\d{4}$")
def _titles(results: list[CandidateRecord]) -> list[str]:
return [r["title"] for r in results]
@@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]:
return [r["author"] for r in results]
def _years(results: list[CandidateRecord]) -> list[str]:
return [r["year"] for r in results]
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
"""Return True if any result title contains fragment (case-insensitive)."""
low = fragment.lower()
@@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
return any(low in r["author"].lower() for r in results)
def _valid_year(year: str) -> bool:
"""Return True if year is a 4-digit string or empty."""
return year == "" or bool(_YEAR_PAT.match(year))
# ── OpenLibrary ───────────────────────────────────────────────────────────────
@@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None:
assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
# OpenLibrary stores authors in their original language; accept both forms.
assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
# OpenLibrary returns isbn and publisher from its JSON API.
assert all(isinstance(r["isbn"], str) for r in results)
assert all(isinstance(r["publisher"], str) for r in results)
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
@@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None:
assert results, "RSL returned no results"
assert all(r["source"] == "rsl" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
assert all(r["isbn"] == "" for r in results)
assert all(r["publisher"] == "" for r in results)
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
def test_rusneb_voina_i_mir() -> None:
plugin = HtmlScraperPlugin(
plugin = RusnebPlugin(
plugin_id="rusneb",
name="НЭБ",
rate_limiter=_RL,
rate_limit_seconds=0,
auto_queue=True,
timeout=_TIMEOUT,
config={
"url": "https://rusneb.ru/search/",
"search_param": "q",
"img_alt": True,
"author_class": "search-list__item_subtext",
},
config={},
)
results = plugin.search("Война и мир Толстой")
assert results, "НЭБ returned no results"
assert all(r["source"] == "rusneb" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert _has_author(results, "толст"), f"authors={_authors(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
assert all(r["isbn"] == "" for r in results)
assert all(r["publisher"] == "" for r in results)
# ── Alib ─────────────────────────────────────────────────────────────────────
def test_alib_voina_i_mir() -> None:
plugin = HtmlScraperPlugin(
plugin = AlibPlugin(
plugin_id="alib_web",
name="Alib (web)",
rate_limiter=_RL,
rate_limit_seconds=0,
auto_queue=False,
timeout=_TIMEOUT,
config={
"url": "https://www.alib.ru/find3.php4",
"search_param": "tfind",
"extra_params": {"f": "5", "s": "0"},
"encoding": "cp1251",
"bold_text": True,
},
config={},
)
results = plugin.search("Война и мир Толстой")
assert results, "Alib returned no results"
assert all(r["source"] == "alib_web" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert _has_author(results, "толст"), f"authors={_authors(results)}"
# Alib entries always include a publication year in the bibliographic text.
assert all(_YEAR_PAT.match(r["year"]) for r in results), f"years={_years(results)}"
assert all(r["isbn"] == "" for r in results)
assert all(r["publisher"] == "" for r in results)
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
@@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None:
assert results, "НЛР returned no results"
assert all(r["source"] == "nlr" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
assert all(r["isbn"] == "" for r in results)
assert all(r["publisher"] == "" for r in results)
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
@@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None:
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
def test_shpl_voina_i_mir() -> None:
plugin = HtmlScraperPlugin(
plugin = ShplPlugin(
plugin_id="shpl",
name="ШПИЛ",
rate_limiter=_RL,
rate_limit_seconds=0,
auto_queue=False,
timeout=_TIMEOUT,
config={
"url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
"search_param": "S21ALL",
"extra_params": {
"C21COM": "S",
"I21DBN": "BIBL",
"P21DBN": "BIBL",
"S21FMT": "briefWebRus",
"Z21ID": "",
},
"brief_class": "brief",
},
config={},
)
results = plugin.search("Война и мир")
assert results, "ШПИЛ returned no results"
assert all(r["source"] == "shpl" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
assert all(r["isbn"] == "" for r in results)
assert all(r["publisher"] == "" for r in results)