Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove `config:` subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,27 +1,17 @@
|
||||
"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
|
||||
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import quote, urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from models import CandidateRecord
|
||||
|
||||
from ..rate_limiter import RateLimiter
|
||||
|
||||
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
|
||||
|
||||
# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
|
||||
_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
|
||||
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
|
||||
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
|
||||
|
||||
|
||||
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
|
||||
# Support both single and double-quoted class attributes.
|
||||
return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
|
||||
|
||||
|
||||
def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
|
||||
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
|
||||
"""Extract text content from elements whose class contains cls_frag.
|
||||
|
||||
Strips inner HTML tags and normalises whitespace, so elements like
|
||||
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
|
||||
return out
|
||||
|
||||
|
||||
def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
|
||||
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
|
||||
"""Extract non-empty alt attributes from <img> tags, normalising whitespace.
|
||||
|
||||
Args:
|
||||
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
|
||||
|
||||
|
||||
class HtmlScraperPlugin:
|
||||
"""Config-driven HTML scraper.
|
||||
"""Base class for HTML-scraping archive plugins.
|
||||
|
||||
Supported config keys:
|
||||
url — search URL
|
||||
search_param — query param name
|
||||
extra_params — dict of fixed extra query parameters
|
||||
encoding — character encoding for query and response (e.g. "cp1251")
|
||||
title_class — CSS class fragment for title elements (class-based strategy)
|
||||
author_class — CSS class fragment for author elements
|
||||
link_href_pattern — href regex to find title <a> links (link strategy)
|
||||
brief_class — CSS class for brief record rows (brief strategy)
|
||||
img_alt — truthy: extract titles from <img alt> attributes (rusneb strategy)
|
||||
bold_text — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
|
||||
Handles common initialisation; subclasses implement search() with
|
||||
site-specific hardcoded logic. The config dict is accepted for
|
||||
registry compatibility but is not used by the base class; all scraping
|
||||
details are hardcoded in the subclass.
|
||||
"""
|
||||
|
||||
category = "archive_searchers"
|
||||
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
|
||||
self.rate_limit_seconds = rate_limit_seconds
|
||||
self.auto_queue = auto_queue
|
||||
self.timeout = timeout
|
||||
self.config = config
|
||||
self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
|
||||
|
||||
def search(self, query: str) -> list[CandidateRecord]:
|
||||
"""Search for books matching query.
|
||||
|
||||
Args:
|
||||
query: Free-text search string (author, title, keywords).
|
||||
query: Free-text search string.
|
||||
|
||||
Returns:
|
||||
Up to three CandidateRecord dicts with source, title, author, year,
|
||||
isbn, and publisher fields.
|
||||
"""
|
||||
cfg = self.config
|
||||
self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
|
||||
|
||||
encoding = str(cfg.get("encoding") or "")
|
||||
if encoding:
|
||||
# Encode query and extra params in the site's native encoding.
|
||||
q_enc = quote(query.encode(encoding, "replace"))
|
||||
ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
|
||||
ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
|
||||
raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
|
||||
r = httpx.get(
|
||||
f'{cfg["url"]}?{raw_qs}',
|
||||
timeout=self.timeout,
|
||||
headers={"User-Agent": "Mozilla/5.0"},
|
||||
)
|
||||
html = r.content.decode(encoding, errors="replace")
|
||||
else:
|
||||
params: dict[str, Any] = dict(cfg.get("extra_params") or {})
|
||||
params[cfg["search_param"]] = query
|
||||
r = httpx.get(
|
||||
cfg["url"],
|
||||
params=params,
|
||||
timeout=self.timeout,
|
||||
headers={"User-Agent": "Mozilla/5.0"},
|
||||
)
|
||||
html = r.text
|
||||
|
||||
years = _YEAR_RE.findall(html)
|
||||
|
||||
if cfg.get("bold_text"):
|
||||
return self._parse_bold_text(html, years)
|
||||
if cfg.get("img_alt"):
|
||||
return self._parse_img_alt(html, years, cfg)
|
||||
if "link_href_pattern" in cfg:
|
||||
return self._parse_link(html, years, cfg)
|
||||
if "brief_class" in cfg:
|
||||
return self._parse_brief(html, years, cfg)
|
||||
return self._parse_class(html, years, cfg)
|
||||
|
||||
def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
|
||||
"""Extract records from ``<p><b>text</b>`` entries (Alib-style).
|
||||
|
||||
The bold text is expected to begin with ``Surname I.N. Title…``; the
|
||||
author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
|
||||
|
||||
Args:
|
||||
html: Decoded HTML response.
|
||||
years: Year strings found in the full HTML (used positionally).
|
||||
|
||||
Returns:
|
||||
Up to three CandidateRecord dicts.
|
||||
"""
|
||||
entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
|
||||
out: list[CandidateRecord] = []
|
||||
for i, entry in enumerate(entries):
|
||||
text = entry.strip()
|
||||
m = _AUTHOR_PREFIX_PAT.match(text)
|
||||
if m:
|
||||
author = m.group(1).strip()
|
||||
title = m.group(2).strip()
|
||||
else:
|
||||
author = ""
|
||||
title = text
|
||||
out.append(
|
||||
CandidateRecord(
|
||||
source=self.plugin_id,
|
||||
title=title,
|
||||
author=author,
|
||||
year=years[i] if i < len(years) else "",
|
||||
isbn="",
|
||||
publisher="",
|
||||
)
|
||||
)
|
||||
return out
|
||||
|
||||
def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
||||
"""Extract records using ``<img alt>`` for titles and a CSS class for authors.
|
||||
|
||||
Used for sites like rusneb.ru where thumbnail alt attributes carry the
|
||||
book title and a separate span contains the author.
|
||||
|
||||
Args:
|
||||
html: Decoded HTML response.
|
||||
years: Year strings found in the full HTML (used positionally).
|
||||
cfg: Plugin config dict (reads ``author_class``).
|
||||
|
||||
Returns:
|
||||
Up to three CandidateRecord dicts.
|
||||
"""
|
||||
titles = _img_alts(html)
|
||||
authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
|
||||
return [
|
||||
CandidateRecord(
|
||||
source=self.plugin_id,
|
||||
title=title,
|
||||
author=authors[i] if i < len(authors) else "",
|
||||
year=years[i] if i < len(years) else "",
|
||||
isbn="",
|
||||
publisher="",
|
||||
)
|
||||
for i, title in enumerate(titles)
|
||||
]
|
||||
|
||||
def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
||||
titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
|
||||
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
|
||||
return [
|
||||
CandidateRecord(
|
||||
source=self.plugin_id,
|
||||
title=title.strip(),
|
||||
author=authors[i].strip() if i < len(authors) else "",
|
||||
year=years[i] if i < len(years) else "",
|
||||
isbn="",
|
||||
publisher="",
|
||||
)
|
||||
for i, title in enumerate(titles)
|
||||
]
|
||||
|
||||
def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
||||
href_pat = cfg.get("link_href_pattern", r"")
|
||||
titles = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)[:3]
|
||||
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
|
||||
return [
|
||||
CandidateRecord(
|
||||
source=self.plugin_id,
|
||||
title=title.strip(),
|
||||
author=authors[i].strip() if i < len(authors) else "",
|
||||
year=years[i] if i < len(years) else "",
|
||||
isbn="",
|
||||
publisher="",
|
||||
)
|
||||
for i, title in enumerate(titles)
|
||||
]
|
||||
|
||||
def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
||||
titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3]
|
||||
return [
|
||||
CandidateRecord(
|
||||
source=self.plugin_id,
|
||||
title=t.strip(),
|
||||
author="",
|
||||
year=years[i] if i < len(years) else "",
|
||||
isbn="",
|
||||
publisher="",
|
||||
)
|
||||
for i, t in enumerate(titles)
|
||||
]
|
||||
raise NotImplementedError
|
||||
|
||||
Reference in New Issue
Block a user