"""Base class and shared HTML parsing utilities for archive scraper plugins."""
import re
from typing import Any
from models import CandidateRecord
from ..rate_limiter import RateLimiter
# Standalone four-digit year: 1000-1999 or 2000-2029, delimited by word
# boundaries. The year itself is capture group 1.
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
# Anchored at the start of the string. Group 1 is a run of non-whitespace,
# whitespace, then one to three dot-terminated uppercase initials (Cyrillic or
# Latin, written with no spaces between them) plus any trailing whitespace.
# Group 2 is everything that follows; DOTALL lets it span newlines.
# NOTE(review): presumably group 1 is a "Surname I.O." author prefix and
# group 2 the title -- confirm against the callers.
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
    """Extract text content from elements whose class contains cls_frag.

    Inner HTML tags are stripped and whitespace is normalised, so an element
    like ``<a class="author"><span>Name</span> I.N.</a>`` yields
    ``"Name I.N."``.

    An element's content is taken to end at the first closing tag that has
    the same name as its opening tag. Nested elements with a *different* tag
    name are handled; a nested element with the *same* tag name truncates the
    capture at its closing tag. Elements with no closing tag are skipped.

    Args:
        html: Raw HTML string to search.
        cls_frag: Substring that must appear in the class attribute value.
        min_len: Minimum length of extracted text to keep.
        max_len: Maximum length of extracted text to keep.

    Returns:
        Up to three text strings, in document order, whose length lies
        within ``[min_len, max_len]``.
    """
    element_re = re.compile(
        # Opening tag name (group 1); the lookahead pins the full name so the
        # backreference below cannot be satisfied by a shortened prefix.
        r"<([A-Za-z][\w:-]*)(?=\s)[^>]*?\s"
        rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>'
        # Element content (group 2), up to the matching closing tag. A bare
        # trailing lazy group would always capture the empty string.
        r"(.*?)</\1\s*>",
        re.DOTALL,
    )
    out: list[str] = []
    # finditer is lazy, so scanning stops as soon as three texts are kept.
    for match in element_re.finditer(html):
        text = re.sub(r"<[^>]+>", "", match.group(2))
        text = re.sub(r"\s+", " ", text).strip()
        if min_len <= len(text) <= max_len:
            out.append(text)
            if len(out) == 3:
                break
    return out
# Matches an <img ...> tag up to and including its alt attribute. The value is
# captured in group 1 (double-quoted) or group 2 (single-quoted), so a quote
# of the other kind inside the value does not end the capture early.
_IMG_ALT_RE = re.compile(
    r"""<img(?=\s)[^>]*?\salt=(?:"([^"]+)"|'([^']+)')""",
    re.IGNORECASE,
)


def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
    """Extract non-empty alt attributes from ``<img>`` tags, normalising whitespace.

    Tag and attribute names are matched case-insensitively. Only a real
    ``alt`` attribute is considered (``data-alt=`` and similar are ignored),
    and the value may contain the quote character that does not delimit it
    (for example ``alt="Tom's book"``).

    Args:
        html: Raw HTML string to search.
        min_len: Minimum character length to include.
        max_len: Maximum character length to include.

    Returns:
        Up to three whitespace-normalised alt strings, in document order,
        whose length lies within ``[min_len, max_len]``.
    """
    out: list[str] = []
    # finditer is lazy, so scanning stops as soon as three alts are kept.
    for match in _IMG_ALT_RE.finditer(html):
        raw_alt = match.group(1) or match.group(2)
        text = re.sub(r"\s+", " ", raw_alt).strip()
        if min_len <= len(text) <= max_len:
            out.append(text)
            if len(out) == 3:
                break
    return out
class HtmlScraperPlugin:
    """Common foundation for archive plugins that scrape HTML pages.

    The constructor records the settings shared by every plugin. Concrete
    subclasses override search() with their own hardcoded, site-specific
    scraping logic. A config dict is part of the constructor signature for
    registry compatibility only; this base class neither stores nor reads it.
    """

    category = "archive_searchers"

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        """Record the shared plugin settings on the instance.

        Args:
            plugin_id: Stored as ``self.plugin_id``.
            name: Stored as ``self.name``.
            rate_limiter: Stored privately as ``self._rl``.
            rate_limit_seconds: Stored as ``self.rate_limit_seconds``.
            auto_queue: Stored as ``self.auto_queue``.
            timeout: Stored as ``self.timeout``.
            config: Accepted for registry compatibility; intentionally unused.
        """
        # Request behaviour settings.
        self.timeout = timeout
        self.auto_queue = auto_queue
        self.rate_limit_seconds = rate_limit_seconds
        self._rl = rate_limiter
        # Plugin identity.
        self.name = name
        self.plugin_id = plugin_id

    def search(self, query: str) -> list[CandidateRecord]:
        """Look up books that match the given query.

        The base class provides no implementation; subclasses must override.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author,
            year, isbn, and publisher fields.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError