Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove the nested `config:` subsections from the rusneb, alib_web,
  and shpl entries; update their type fields to rusneb, alib_web, and shpl
  respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions

View File

@@ -1,27 +1,17 @@
"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
import re
from typing import Any
from urllib.parse import quote, urlparse
import httpx
from models import CandidateRecord
from ..rate_limiter import RateLimiter
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
# Support both single and double-quoted class attributes.
return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
"""Extract text content from elements whose class contains cls_frag.
Strips inner HTML tags and normalises whitespace, so elements like
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
return out
def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
"""Extract non-empty alt attributes from <img> tags, normalising whitespace.
Args:
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
class HtmlScraperPlugin:
"""Config-driven HTML scraper.
"""Base class for HTML-scraping archive plugins.
Supported config keys:
url — search URL
search_param — query param name
extra_params — dict of fixed extra query parameters
encoding — character encoding for query and response (e.g. "cp1251")
title_class — CSS class fragment for title elements (class-based strategy)
author_class — CSS class fragment for author elements
link_href_pattern — href regex to find title <a> links (link strategy)
brief_class — CSS class for brief record rows (brief strategy)
img_alt — truthy: extract titles from <img alt> attributes (rusneb strategy)
bold_text — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
Handles common initialisation; subclasses implement search() with
site-specific hardcoded logic. The config dict is accepted for
registry compatibility but is not used by the base class; all scraping
details are hardcoded in the subclass.
"""
category = "archive_searchers"
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
self.rate_limit_seconds = rate_limit_seconds
self.auto_queue = auto_queue
self.timeout = timeout
self.config = config
self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
def search(self, query: str) -> list[CandidateRecord]:
"""Search for books matching query.
Args:
query: Free-text search string (author, title, keywords).
query: Free-text search string.
Returns:
Up to three CandidateRecord dicts with source, title, author, year,
isbn, and publisher fields.
"""
cfg = self.config
self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
encoding = str(cfg.get("encoding") or "")
if encoding:
# Encode query and extra params in the site's native encoding.
q_enc = quote(query.encode(encoding, "replace"))
ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
r = httpx.get(
f'{cfg["url"]}?{raw_qs}',
timeout=self.timeout,
headers={"User-Agent": "Mozilla/5.0"},
)
html = r.content.decode(encoding, errors="replace")
else:
params: dict[str, Any] = dict(cfg.get("extra_params") or {})
params[cfg["search_param"]] = query
r = httpx.get(
cfg["url"],
params=params,
timeout=self.timeout,
headers={"User-Agent": "Mozilla/5.0"},
)
html = r.text
years = _YEAR_RE.findall(html)
if cfg.get("bold_text"):
return self._parse_bold_text(html, years)
if cfg.get("img_alt"):
return self._parse_img_alt(html, years, cfg)
if "link_href_pattern" in cfg:
return self._parse_link(html, years, cfg)
if "brief_class" in cfg:
return self._parse_brief(html, years, cfg)
return self._parse_class(html, years, cfg)
def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
    """Build records from ``<p><b>text</b>`` entries (Alib-style listings).

    Each bold entry is expected to look like ``Surname I.N. Title…``. When
    ``_AUTHOR_PREFIX_PAT`` matches, the leading part becomes the author and
    the remainder the title; otherwise the whole entry is used as the title
    and the author is left empty.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML. They are paired with
            entries purely by position, so the n-th year is not guaranteed
            to belong to the n-th entry.

    Returns:
        Up to three CandidateRecord dicts.
    """
    records: list[CandidateRecord] = []
    # Only the first three bold entries are considered.
    bold_entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
    for idx, raw_entry in enumerate(bold_entries):
        cleaned = raw_entry.strip()
        split = _AUTHOR_PREFIX_PAT.match(cleaned)
        if split is None:
            author, title = "", cleaned
        else:
            author = split.group(1).strip()
            title = split.group(2).strip()
        year = years[idx] if idx < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=title,
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build records from ``<img alt>`` titles and class-matched author text.

    Titles come from ``_img_alts(html)``; authors come from elements whose
    class contains ``cfg["author_class"]`` (default ``"author"``). Titles,
    authors and years are paired by position.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML (used positionally).
        cfg: Plugin config dict (reads ``author_class``).

    Returns:
        One CandidateRecord per extracted title.
    """
    titles = _img_alts(html)
    author_cls = cfg.get("author_class", "author")
    authors = _cls_inner_texts(html, author_cls, 3, 80)

    records: list[CandidateRecord] = []
    for idx, title in enumerate(titles):
        # Missing authors/years fall back to an empty string.
        author = authors[idx] if idx < len(authors) else ""
        year = years[idx] if idx < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=title,
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build records from elements selected by CSS class fragment.

    Titles are taken from elements whose class contains
    ``cfg["title_class"]`` (default ``"title"``) and authors from elements
    whose class contains ``cfg["author_class"]`` (default ``"author"``).
    Titles, authors and years are paired by position.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML (used positionally).
        cfg: Plugin config dict (reads ``title_class`` and ``author_class``).

    Returns:
        Up to three CandidateRecord dicts.
    """
    title_pat = _cls_re(cfg.get("title_class", "title"))
    titles = title_pat.findall(html)[:3]
    author_pat = _cls_re(cfg.get("author_class", "author"), 3, 80)
    authors = author_pat.findall(html)[:3]

    records: list[CandidateRecord] = []
    for idx, raw_title in enumerate(titles):
        author = authors[idx].strip() if idx < len(authors) else ""
        year = years[idx] if idx < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=raw_title.strip(),
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build records from ``<a>`` links whose href matches a configured pattern.

    The link text of anchors whose double-quoted ``href`` contains
    ``cfg["link_href_pattern"]`` becomes the title. The pattern is inserted
    into the regex as-is (not escaped), so it is treated as a regex fragment.
    Authors come from elements whose class contains ``cfg["author_class"]``
    (default ``"author"``). Titles, authors and years are paired by position.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML (used positionally).
        cfg: Plugin config dict (reads ``link_href_pattern`` and
            ``author_class``).

    Returns:
        Up to three CandidateRecord dicts.
    """
    href_pat = cfg.get("link_href_pattern", r"")
    link_re = rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>'
    titles = re.findall(link_re, html)[:3]
    author_pat = _cls_re(cfg.get("author_class", "author"), 3, 80)
    authors = author_pat.findall(html)[:3]

    records: list[CandidateRecord] = []
    for idx, raw_title in enumerate(titles):
        author = authors[idx].strip() if idx < len(authors) else ""
        year = years[idx] if idx < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=raw_title.strip(),
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build title-only records from brief record rows.

    Titles are taken from elements whose class contains
    ``cfg["brief_class"]`` (default ``"brief"``); the author is always left
    empty. Titles and years are paired by position.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML (used positionally).
        cfg: Plugin config dict (reads ``brief_class``).

    Returns:
        Up to three CandidateRecord dicts.
    """
    brief_pat = _cls_re(cfg.get("brief_class", "brief"), 3, 120)
    titles = brief_pat.findall(html)[:3]

    records: list[CandidateRecord] = []
    for idx, raw_title in enumerate(titles):
        year = years[idx] if idx < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=raw_title.strip(),
                author="",
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
raise NotImplementedError