Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -57,28 +57,17 @@ functions:
|
|||||||
|
|
||||||
rusneb:
|
rusneb:
|
||||||
name: "НЭБ"
|
name: "НЭБ"
|
||||||
type: html_scraper
|
type: rusneb
|
||||||
auto_queue: true
|
auto_queue: true
|
||||||
rate_limit_seconds: 5
|
rate_limit_seconds: 5
|
||||||
timeout: 8
|
timeout: 8
|
||||||
config:
|
|
||||||
url: "https://rusneb.ru/search/"
|
|
||||||
search_param: q
|
|
||||||
img_alt: true
|
|
||||||
author_class: "search-list__item_subtext"
|
|
||||||
|
|
||||||
alib_web:
|
alib_web:
|
||||||
name: "Alib (web)"
|
name: "Alib (web)"
|
||||||
type: html_scraper
|
type: alib_web
|
||||||
auto_queue: false
|
auto_queue: false
|
||||||
rate_limit_seconds: 5
|
rate_limit_seconds: 5
|
||||||
timeout: 8
|
timeout: 8
|
||||||
config:
|
|
||||||
url: "https://www.alib.ru/find3.php4"
|
|
||||||
search_param: tfind
|
|
||||||
extra_params: {f: "5", s: "0"}
|
|
||||||
encoding: "cp1251"
|
|
||||||
bold_text: true
|
|
||||||
|
|
||||||
nlr:
|
nlr:
|
||||||
name: "НЛР"
|
name: "НЛР"
|
||||||
@@ -91,13 +80,9 @@ functions:
|
|||||||
query_prefix: "title="
|
query_prefix: "title="
|
||||||
|
|
||||||
shpl:
|
shpl:
|
||||||
|
# Endpoint currently returns HTTP 404; retained for future re-enablement.
|
||||||
name: "ШПИЛ"
|
name: "ШПИЛ"
|
||||||
type: html_scraper
|
type: shpl
|
||||||
auto_queue: false
|
auto_queue: false
|
||||||
rate_limit_seconds: 5
|
rate_limit_seconds: 5
|
||||||
timeout: 8
|
timeout: 8
|
||||||
config:
|
|
||||||
url: "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
|
|
||||||
search_param: S21ALL
|
|
||||||
extra_params: {C21COM: S, I21DBN: BIBL, P21DBN: BIBL, S21FMT: briefWebRus, Z21ID: ""}
|
|
||||||
brief_class: "brief"
|
|
||||||
|
|||||||
@@ -41,16 +41,20 @@ _type_to_class: dict[str, Any] = {} # populated lazily on first call
|
|||||||
|
|
||||||
def _archive_classes() -> dict[str, Any]:
|
def _archive_classes() -> dict[str, Any]:
|
||||||
if not _type_to_class:
|
if not _type_to_class:
|
||||||
from .archives.html_scraper import HtmlScraperPlugin
|
from .archives.alib import AlibPlugin
|
||||||
from .archives.openlibrary import OpenLibraryPlugin
|
from .archives.openlibrary import OpenLibraryPlugin
|
||||||
from .archives.rsl import RSLPlugin
|
from .archives.rsl import RSLPlugin
|
||||||
|
from .archives.rusneb import RusnebPlugin
|
||||||
|
from .archives.shpl import ShplPlugin
|
||||||
from .archives.sru_catalog import SRUCatalogPlugin
|
from .archives.sru_catalog import SRUCatalogPlugin
|
||||||
|
|
||||||
_type_to_class.update(
|
_type_to_class.update(
|
||||||
{
|
{
|
||||||
"openlibrary": OpenLibraryPlugin,
|
"openlibrary": OpenLibraryPlugin,
|
||||||
"rsl": RSLPlugin,
|
"rsl": RSLPlugin,
|
||||||
"html_scraper": HtmlScraperPlugin,
|
"rusneb": RusnebPlugin,
|
||||||
|
"alib_web": AlibPlugin,
|
||||||
|
"shpl": ShplPlugin,
|
||||||
"sru_catalog": SRUCatalogPlugin,
|
"sru_catalog": SRUCatalogPlugin,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
70
src/plugins/archives/alib.py
Normal file
70
src/plugins/archives/alib.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
"""Alib (alib.ru) archive search plugin."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from models import CandidateRecord
|
||||||
|
|
||||||
|
from .html_scraper import AUTHOR_PREFIX_PAT, YEAR_RE, HtmlScraperPlugin
|
||||||
|
|
||||||
|
_URL = "https://www.alib.ru/find3.php4"
_DOMAIN = "www.alib.ru"
_ENCODING = "cp1251"
_EXTRA_PARAMS: dict[str, str] = {"f": "5", "s": "0"}

# One result entry is rendered as <p><b>Author Title Year Publisher…</b>;
# the capture is the 5-200 characters of tag-free text inside the <b>.
_ENTRY_RE = re.compile(r"<p><b>([^<]{5,200})</b>")


class AlibPlugin(HtmlScraperPlugin):
    """Archive searcher for alib.ru.

    The query string is percent-encoded from cp1251 bytes and the response
    body is decoded as cp1251. Records come from ``<p><b>…</b>`` entries:
    ``AUTHOR_PREFIX_PAT`` splits a leading "Surname I.N." prefix from the rest
    of the text, and the year is the first ``YEAR_RE`` match inside that same
    entry (never a page-wide match).
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search Alib for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields. isbn and publisher are always empty.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)

        # The query string is assembled by hand because every value has to be
        # percent-encoded from cp1251 bytes rather than UTF-8.
        encoded_query = quote(query.encode(_ENCODING, "replace"))
        qs_parts = [f"tfind={encoded_query}"]
        for key, value in _EXTRA_PARAMS.items():
            qs_parts.append(f"{key}={quote(str(value).encode(_ENCODING, 'replace'))}")
        request_url = f"{_URL}?" + "&".join(qs_parts)

        response = httpx.get(request_url, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
        page = response.content.decode(_ENCODING, errors="replace")

        return [self._record_from_entry(raw) for raw in _ENTRY_RE.findall(page)[:3]]

    def _record_from_entry(self, entry: str) -> CandidateRecord:
        """Build one CandidateRecord from the text of a single ``<p><b>`` entry."""
        text = entry.strip()
        year_match = YEAR_RE.search(text)
        prefix_match = AUTHOR_PREFIX_PAT.match(text)
        if prefix_match is None:
            # No recognisable "Surname I.N." prefix: keep the whole text as title.
            author, title = "", text
        else:
            author = prefix_match.group(1).strip()
            title = prefix_match.group(2).strip()
        return CandidateRecord(
            source=self.plugin_id,
            title=title,
            author=author,
            year="" if year_match is None else year_match.group(0),
            isbn="",
            publisher="",
        )
|
||||||
@@ -1,27 +1,17 @@
|
|||||||
"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
|
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import quote, urlparse
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from models import CandidateRecord
|
from models import CandidateRecord
|
||||||
|
|
||||||
from ..rate_limiter import RateLimiter
|
from ..rate_limiter import RateLimiter
|
||||||
|
|
||||||
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
|
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
|
||||||
|
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
|
||||||
# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
|
|
||||||
_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
|
|
||||||
|
|
||||||
|
|
||||||
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
|
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
|
||||||
# Support both single and double-quoted class attributes.
|
|
||||||
return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
|
|
||||||
|
|
||||||
|
|
||||||
def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
|
|
||||||
"""Extract text content from elements whose class contains cls_frag.
|
"""Extract text content from elements whose class contains cls_frag.
|
||||||
|
|
||||||
Strips inner HTML tags and normalises whitespace, so elements like
|
Strips inner HTML tags and normalises whitespace, so elements like
|
||||||
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
|
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
|
||||||
"""Extract non-empty alt attributes from <img> tags, normalising whitespace.
|
"""Extract non-empty alt attributes from <img> tags, normalising whitespace.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
|
|||||||
|
|
||||||
|
|
||||||
class HtmlScraperPlugin:
|
class HtmlScraperPlugin:
|
||||||
"""Config-driven HTML scraper.
|
"""Base class for HTML-scraping archive plugins.
|
||||||
|
|
||||||
Supported config keys:
|
Handles common initialisation; subclasses implement search() with
|
||||||
url — search URL
|
site-specific hardcoded logic. The config dict is accepted for
|
||||||
search_param — query param name
|
registry compatibility but is not used by the base class; all scraping
|
||||||
extra_params — dict of fixed extra query parameters
|
details are hardcoded in the subclass.
|
||||||
encoding — character encoding for query and response (e.g. "cp1251")
|
|
||||||
title_class — CSS class fragment for title elements (class-based strategy)
|
|
||||||
author_class — CSS class fragment for author elements
|
|
||||||
link_href_pattern — href regex to find title <a> links (link strategy)
|
|
||||||
brief_class — CSS class for brief record rows (brief strategy)
|
|
||||||
img_alt — truthy: extract titles from <img alt> attributes (rusneb strategy)
|
|
||||||
bold_text — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
category = "archive_searchers"
|
category = "archive_searchers"
|
||||||
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
|
|||||||
self.rate_limit_seconds = rate_limit_seconds
|
self.rate_limit_seconds = rate_limit_seconds
|
||||||
self.auto_queue = auto_queue
|
self.auto_queue = auto_queue
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
self.config = config
|
|
||||||
self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
|
|
||||||
|
|
||||||
def search(self, query: str) -> list[CandidateRecord]:
|
def search(self, query: str) -> list[CandidateRecord]:
|
||||||
"""Search for books matching query.
|
"""Search for books matching query.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query: Free-text search string (author, title, keywords).
|
query: Free-text search string.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Up to three CandidateRecord dicts with source, title, author, year,
|
Up to three CandidateRecord dicts with source, title, author, year,
|
||||||
isbn, and publisher fields.
|
isbn, and publisher fields.
|
||||||
"""
|
"""
|
||||||
cfg = self.config
|
raise NotImplementedError
|
||||||
self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
|
|
||||||
|
|
||||||
encoding = str(cfg.get("encoding") or "")
|
|
||||||
if encoding:
|
|
||||||
# Encode query and extra params in the site's native encoding.
|
|
||||||
q_enc = quote(query.encode(encoding, "replace"))
|
|
||||||
ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
|
|
||||||
ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
|
|
||||||
raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
|
|
||||||
r = httpx.get(
|
|
||||||
f'{cfg["url"]}?{raw_qs}',
|
|
||||||
timeout=self.timeout,
|
|
||||||
headers={"User-Agent": "Mozilla/5.0"},
|
|
||||||
)
|
|
||||||
html = r.content.decode(encoding, errors="replace")
|
|
||||||
else:
|
|
||||||
params: dict[str, Any] = dict(cfg.get("extra_params") or {})
|
|
||||||
params[cfg["search_param"]] = query
|
|
||||||
r = httpx.get(
|
|
||||||
cfg["url"],
|
|
||||||
params=params,
|
|
||||||
timeout=self.timeout,
|
|
||||||
headers={"User-Agent": "Mozilla/5.0"},
|
|
||||||
)
|
|
||||||
html = r.text
|
|
||||||
|
|
||||||
years = _YEAR_RE.findall(html)
|
|
||||||
|
|
||||||
if cfg.get("bold_text"):
|
|
||||||
return self._parse_bold_text(html, years)
|
|
||||||
if cfg.get("img_alt"):
|
|
||||||
return self._parse_img_alt(html, years, cfg)
|
|
||||||
if "link_href_pattern" in cfg:
|
|
||||||
return self._parse_link(html, years, cfg)
|
|
||||||
if "brief_class" in cfg:
|
|
||||||
return self._parse_brief(html, years, cfg)
|
|
||||||
return self._parse_class(html, years, cfg)
|
|
||||||
|
|
||||||
def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
|
|
||||||
"""Extract records from ``<p><b>text</b>`` entries (Alib-style).
|
|
||||||
|
|
||||||
The bold text is expected to begin with ``Surname I.N. Title…``; the
|
|
||||||
author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
html: Decoded HTML response.
|
|
||||||
years: Year strings found in the full HTML (used positionally).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Up to three CandidateRecord dicts.
|
|
||||||
"""
|
|
||||||
entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
|
|
||||||
out: list[CandidateRecord] = []
|
|
||||||
for i, entry in enumerate(entries):
|
|
||||||
text = entry.strip()
|
|
||||||
m = _AUTHOR_PREFIX_PAT.match(text)
|
|
||||||
if m:
|
|
||||||
author = m.group(1).strip()
|
|
||||||
title = m.group(2).strip()
|
|
||||||
else:
|
|
||||||
author = ""
|
|
||||||
title = text
|
|
||||||
out.append(
|
|
||||||
CandidateRecord(
|
|
||||||
source=self.plugin_id,
|
|
||||||
title=title,
|
|
||||||
author=author,
|
|
||||||
year=years[i] if i < len(years) else "",
|
|
||||||
isbn="",
|
|
||||||
publisher="",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return out
|
|
||||||
|
|
||||||
def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
|
||||||
"""Extract records using ``<img alt>`` for titles and a CSS class for authors.
|
|
||||||
|
|
||||||
Used for sites like rusneb.ru where thumbnail alt attributes carry the
|
|
||||||
book title and a separate span contains the author.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
html: Decoded HTML response.
|
|
||||||
years: Year strings found in the full HTML (used positionally).
|
|
||||||
cfg: Plugin config dict (reads ``author_class``).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Up to three CandidateRecord dicts.
|
|
||||||
"""
|
|
||||||
titles = _img_alts(html)
|
|
||||||
authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
|
|
||||||
return [
|
|
||||||
CandidateRecord(
|
|
||||||
source=self.plugin_id,
|
|
||||||
title=title,
|
|
||||||
author=authors[i] if i < len(authors) else "",
|
|
||||||
year=years[i] if i < len(years) else "",
|
|
||||||
isbn="",
|
|
||||||
publisher="",
|
|
||||||
)
|
|
||||||
for i, title in enumerate(titles)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
|
||||||
titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
|
|
||||||
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
|
|
||||||
return [
|
|
||||||
CandidateRecord(
|
|
||||||
source=self.plugin_id,
|
|
||||||
title=title.strip(),
|
|
||||||
author=authors[i].strip() if i < len(authors) else "",
|
|
||||||
year=years[i] if i < len(years) else "",
|
|
||||||
isbn="",
|
|
||||||
publisher="",
|
|
||||||
)
|
|
||||||
for i, title in enumerate(titles)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
|
||||||
href_pat = cfg.get("link_href_pattern", r"")
|
|
||||||
titles = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)[:3]
|
|
||||||
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
|
|
||||||
return [
|
|
||||||
CandidateRecord(
|
|
||||||
source=self.plugin_id,
|
|
||||||
title=title.strip(),
|
|
||||||
author=authors[i].strip() if i < len(authors) else "",
|
|
||||||
year=years[i] if i < len(years) else "",
|
|
||||||
isbn="",
|
|
||||||
publisher="",
|
|
||||||
)
|
|
||||||
for i, title in enumerate(titles)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
|
|
||||||
titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3]
|
|
||||||
return [
|
|
||||||
CandidateRecord(
|
|
||||||
source=self.plugin_id,
|
|
||||||
title=t.strip(),
|
|
||||||
author="",
|
|
||||||
year=years[i] if i < len(years) else "",
|
|
||||||
isbn="",
|
|
||||||
publisher="",
|
|
||||||
)
|
|
||||||
for i, t in enumerate(titles)
|
|
||||||
]
|
|
||||||
|
|||||||
64
src/plugins/archives/rusneb.py
Normal file
64
src/plugins/archives/rusneb.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""НЭБ (rusneb.ru) archive search plugin."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from models import CandidateRecord
|
||||||
|
|
||||||
|
from .html_scraper import HtmlScraperPlugin, YEAR_RE, cls_inner_texts, img_alts
|
||||||
|
|
||||||
|
_URL = "https://rusneb.ru/search/"
_DOMAIN = "rusneb.ru"
_AUTHOR_CLASS = "search-list__item_subtext"

# Captures the inner HTML of each <li> whose class attribute contains
# ``search-list__item``. The negative lookahead rejects a match in which that
# fragment is immediately followed by an underscore (a BEM child suffix such
# as ``__item_subtext``).
_ITEM_RE = re.compile(
    r'<li[^>]*class=["\'][^"\']*search-list__item(?!_)[^"\']*["\'][^>]*>(.*?)</li>',
    re.DOTALL,
)


class RusnebPlugin(HtmlScraperPlugin):
    """Archive searcher for rusneb.ru (НЭБ — Национальная электронная библиотека).

    For every search-result list item the title is the first ``<img alt>``
    value, the author is the first text found in an element whose class
    contains ``search-list__item_subtext``, and the year is the first
    ``YEAR_RE`` match inside that item's HTML, so unrelated page-level dates
    are never picked up.
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search НЭБ for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields. isbn and publisher are always empty.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        response = httpx.get(_URL, params={"q": query}, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})

        records: list[CandidateRecord] = []
        for fragment in _ITEM_RE.findall(response.text):
            if len(records) == 3:
                break
            titles = img_alts(fragment)
            if titles:
                # Items without a usable <img alt> carry no title and are skipped.
                author_texts = cls_inner_texts(fragment, _AUTHOR_CLASS, 3, 80)
                year_match = YEAR_RE.search(fragment)
                records.append(
                    CandidateRecord(
                        source=self.plugin_id,
                        title=titles[0],
                        author=author_texts[0] if author_texts else "",
                        year="" if year_match is None else year_match.group(0),
                        isbn="",
                        publisher="",
                    )
                )
        return records
|
||||||
63
src/plugins/archives/shpl.py
Normal file
63
src/plugins/archives/shpl.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
"""ШПИЛ archive search plugin.
|
||||||
|
|
||||||
|
Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
|
||||||
|
produces no results. The class is retained so the configuration entry can
|
||||||
|
be re-enabled if the endpoint is restored.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from models import CandidateRecord
|
||||||
|
|
||||||
|
from .html_scraper import YEAR_RE, HtmlScraperPlugin
|
||||||
|
|
||||||
|
_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
_DOMAIN = "www.shpl.ru"
_EXTRA_PARAMS: dict[str, str] = {
    "C21COM": "S",
    "I21DBN": "BIBL",
    "P21DBN": "BIBL",
    "S21FMT": "briefWebRus",
    "Z21ID": "",
}

# Captures the 3-120 characters of tag-free text that directly follow the
# opening tag of an element whose class attribute is exactly ``brief``.
_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')


class ShplPlugin(HtmlScraperPlugin):
    """Archive searcher for shpl.ru (ШПИЛ — Государственная публичная историческая библиотека).

    Extracts brief record entries from elements with class ``brief``.
    The remote IRBIS64 CGI endpoint is currently offline (HTTP 404).
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search ШПИЛ for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields. author, isbn and publisher are always
            empty; year is empty when the entry text contains no year.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        params: dict[str, str] = {**_EXTRA_PARAMS, "S21ALL": query}
        r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})

        out: list[CandidateRecord] = []
        for raw in _BRIEF_RE.findall(r.text)[:3]:
            text = raw.strip()
            # Take the year from this entry's own text. Indexing a page-wide
            # list of year matches by position would pair record i with the
            # i-th year-like number anywhere in the HTML, which need not belong
            # to the record at all.
            year_m = YEAR_RE.search(text)
            out.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=text,
                    author="",
                    year=year_m.group(0) if year_m else "",
                    isbn="",
                    publisher="",
                )
            )
        return out
|
||||||
@@ -7,12 +7,16 @@ Run with: pytest tests/ -m network
|
|||||||
Skip with: pytest tests/ -m "not network" (default in presubmit)
|
Skip with: pytest tests/ -m "not network" (default in presubmit)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from models import CandidateRecord
|
from models import CandidateRecord
|
||||||
from plugins.archives.html_scraper import HtmlScraperPlugin
|
from plugins.archives.alib import AlibPlugin
|
||||||
from plugins.archives.openlibrary import OpenLibraryPlugin
|
from plugins.archives.openlibrary import OpenLibraryPlugin
|
||||||
from plugins.archives.rsl import RSLPlugin
|
from plugins.archives.rsl import RSLPlugin
|
||||||
|
from plugins.archives.rusneb import RusnebPlugin
|
||||||
|
from plugins.archives.shpl import ShplPlugin
|
||||||
from plugins.archives.sru_catalog import SRUCatalogPlugin
|
from plugins.archives.sru_catalog import SRUCatalogPlugin
|
||||||
from plugins.rate_limiter import RateLimiter
|
from plugins.rate_limiter import RateLimiter
|
||||||
|
|
||||||
@@ -21,6 +25,8 @@ pytestmark = pytest.mark.network
|
|||||||
_RL = RateLimiter()
|
_RL = RateLimiter()
|
||||||
_TIMEOUT = 15
|
_TIMEOUT = 15
|
||||||
|
|
||||||
|
_YEAR_PAT = re.compile(r"^\d{4}$")
|
||||||
|
|
||||||
|
|
||||||
def _titles(results: list[CandidateRecord]) -> list[str]:
|
def _titles(results: list[CandidateRecord]) -> list[str]:
|
||||||
return [r["title"] for r in results]
|
return [r["title"] for r in results]
|
||||||
@@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]:
|
|||||||
return [r["author"] for r in results]
|
return [r["author"] for r in results]
|
||||||
|
|
||||||
|
|
||||||
|
def _years(results: list[CandidateRecord]) -> list[str]:
    """Return the ``year`` field of every result, in result order."""
    return [record["year"] for record in results]
|
||||||
|
|
||||||
|
|
||||||
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
|
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
|
||||||
"""Return True if any result title contains fragment (case-insensitive)."""
|
"""Return True if any result title contains fragment (case-insensitive)."""
|
||||||
low = fragment.lower()
|
low = fragment.lower()
|
||||||
@@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
|
|||||||
return any(low in r["author"].lower() for r in results)
|
return any(low in r["author"].lower() for r in results)
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_year(year: str) -> bool:
    """Return True when year is the empty string or matches ``_YEAR_PAT`` (four digits)."""
    if year == "":
        return True
    return _YEAR_PAT.match(year) is not None
|
||||||
|
|
||||||
|
|
||||||
# ── OpenLibrary ───────────────────────────────────────────────────────────────
|
# ── OpenLibrary ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
@@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None:
|
|||||||
assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
|
assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
|
||||||
# OpenLibrary stores authors in their original language; accept both forms.
|
# OpenLibrary stores authors in their original language; accept both forms.
|
||||||
assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
|
assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
|
||||||
|
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||||
|
# OpenLibrary returns isbn and publisher from its JSON API.
|
||||||
|
assert all(isinstance(r["isbn"], str) for r in results)
|
||||||
|
assert all(isinstance(r["publisher"], str) for r in results)
|
||||||
|
|
||||||
|
|
||||||
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
|
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
|
||||||
@@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None:
|
|||||||
assert results, "RSL returned no results"
|
assert results, "RSL returned no results"
|
||||||
assert all(r["source"] == "rsl" for r in results)
|
assert all(r["source"] == "rsl" for r in results)
|
||||||
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
||||||
|
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||||
|
assert all(r["isbn"] == "" for r in results)
|
||||||
|
assert all(r["publisher"] == "" for r in results)
|
||||||
|
|
||||||
|
|
||||||
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
|
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def test_rusneb_voina_i_mir() -> None:
|
def test_rusneb_voina_i_mir() -> None:
|
||||||
plugin = HtmlScraperPlugin(
|
plugin = RusnebPlugin(
|
||||||
plugin_id="rusneb",
|
plugin_id="rusneb",
|
||||||
name="НЭБ",
|
name="НЭБ",
|
||||||
rate_limiter=_RL,
|
rate_limiter=_RL,
|
||||||
rate_limit_seconds=0,
|
rate_limit_seconds=0,
|
||||||
auto_queue=True,
|
auto_queue=True,
|
||||||
timeout=_TIMEOUT,
|
timeout=_TIMEOUT,
|
||||||
config={
|
config={},
|
||||||
"url": "https://rusneb.ru/search/",
|
|
||||||
"search_param": "q",
|
|
||||||
"img_alt": True,
|
|
||||||
"author_class": "search-list__item_subtext",
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
results = plugin.search("Война и мир Толстой")
|
results = plugin.search("Война и мир Толстой")
|
||||||
assert results, "НЭБ returned no results"
|
assert results, "НЭБ returned no results"
|
||||||
assert all(r["source"] == "rusneb" for r in results)
|
assert all(r["source"] == "rusneb" for r in results)
|
||||||
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
||||||
assert _has_author(results, "толст"), f"authors={_authors(results)}"
|
assert _has_author(results, "толст"), f"authors={_authors(results)}"
|
||||||
|
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||||
|
assert all(r["isbn"] == "" for r in results)
|
||||||
|
assert all(r["publisher"] == "" for r in results)
|
||||||
|
|
||||||
|
|
||||||
# ── Alib ─────────────────────────────────────────────────────────────────────
|
# ── Alib ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def test_alib_voina_i_mir() -> None:
|
def test_alib_voina_i_mir() -> None:
|
||||||
plugin = HtmlScraperPlugin(
|
plugin = AlibPlugin(
|
||||||
plugin_id="alib_web",
|
plugin_id="alib_web",
|
||||||
name="Alib (web)",
|
name="Alib (web)",
|
||||||
rate_limiter=_RL,
|
rate_limiter=_RL,
|
||||||
rate_limit_seconds=0,
|
rate_limit_seconds=0,
|
||||||
auto_queue=False,
|
auto_queue=False,
|
||||||
timeout=_TIMEOUT,
|
timeout=_TIMEOUT,
|
||||||
config={
|
config={},
|
||||||
"url": "https://www.alib.ru/find3.php4",
|
|
||||||
"search_param": "tfind",
|
|
||||||
"extra_params": {"f": "5", "s": "0"},
|
|
||||||
"encoding": "cp1251",
|
|
||||||
"bold_text": True,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
results = plugin.search("Война и мир Толстой")
|
results = plugin.search("Война и мир Толстой")
|
||||||
assert results, "Alib returned no results"
|
assert results, "Alib returned no results"
|
||||||
assert all(r["source"] == "alib_web" for r in results)
|
assert all(r["source"] == "alib_web" for r in results)
|
||||||
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
||||||
assert _has_author(results, "толст"), f"authors={_authors(results)}"
|
assert _has_author(results, "толст"), f"authors={_authors(results)}"
|
||||||
|
# Alib entries always include a publication year in the bibliographic text.
|
||||||
|
assert all(_YEAR_PAT.match(r["year"]) for r in results), f"years={_years(results)}"
|
||||||
|
assert all(r["isbn"] == "" for r in results)
|
||||||
|
assert all(r["publisher"] == "" for r in results)
|
||||||
|
|
||||||
|
|
||||||
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
|
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
|
||||||
@@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None:
|
|||||||
assert results, "НЛР returned no results"
|
assert results, "НЛР returned no results"
|
||||||
assert all(r["source"] == "nlr" for r in results)
|
assert all(r["source"] == "nlr" for r in results)
|
||||||
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
||||||
|
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||||
|
assert all(r["isbn"] == "" for r in results)
|
||||||
|
assert all(r["publisher"] == "" for r in results)
|
||||||
|
|
||||||
|
|
||||||
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
|
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
|
||||||
@@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None:
|
|||||||
|
|
||||||
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
|
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
|
||||||
def test_shpl_voina_i_mir() -> None:
|
def test_shpl_voina_i_mir() -> None:
|
||||||
plugin = HtmlScraperPlugin(
|
plugin = ShplPlugin(
|
||||||
plugin_id="shpl",
|
plugin_id="shpl",
|
||||||
name="ШПИЛ",
|
name="ШПИЛ",
|
||||||
rate_limiter=_RL,
|
rate_limiter=_RL,
|
||||||
rate_limit_seconds=0,
|
rate_limit_seconds=0,
|
||||||
auto_queue=False,
|
auto_queue=False,
|
||||||
timeout=_TIMEOUT,
|
timeout=_TIMEOUT,
|
||||||
config={
|
config={},
|
||||||
"url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
|
|
||||||
"search_param": "S21ALL",
|
|
||||||
"extra_params": {
|
|
||||||
"C21COM": "S",
|
|
||||||
"I21DBN": "BIBL",
|
|
||||||
"P21DBN": "BIBL",
|
|
||||||
"S21FMT": "briefWebRus",
|
|
||||||
"Z21ID": "",
|
|
||||||
},
|
|
||||||
"brief_class": "brief",
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
results = plugin.search("Война и мир")
|
results = plugin.search("Война и мир")
|
||||||
assert results, "ШПИЛ returned no results"
|
assert results, "ШПИЛ returned no results"
|
||||||
assert all(r["source"] == "shpl" for r in results)
|
assert all(r["source"] == "shpl" for r in results)
|
||||||
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
||||||
|
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||||
|
assert all(r["isbn"] == "" for r in results)
|
||||||
|
assert all(r["publisher"] == "" for r in results)
|
||||||
|
|||||||
Reference in New Issue
Block a user