Initial commit
Photo-based book cataloger with AI identification. Room → Cabinet → Shelf → Book hierarchy; FastAPI + SQLite backend; vanilla JS SPA; OpenAI-compatible plugin system for boundary detection, text recognition, and archive search.
This commit is contained in:
121
src/plugins/archives/html_scraper.py
Normal file
121
src/plugins/archives/html_scraper.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from models import CandidateRecord
|
||||
|
||||
from ..rate_limiter import RateLimiter
|
||||
|
||||
# Matches a standalone four-digit year in the ranges 1000-1999 or 2000-2029;
# the word boundaries keep it from matching inside longer digit runs.
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
|
||||
|
||||
|
||||
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
    """Build a regex that captures the text of elements whose class contains *cls_frag*.

    The pattern matches a double-quoted ``class`` attribute containing the
    literal fragment anywhere in its value, skips the remainder of the opening
    tag, and captures the text that follows up to (not including) the next
    ``<``. Only text runs between *min_len* and *max_len* characters long are
    captured.

    Args:
        cls_frag: Class-name fragment; it is escaped, so regex metacharacters
            in it are matched literally.
        min_len: Minimum length of the captured text.
        max_len: Maximum length of the captured text.

    Returns:
        A compiled pattern with exactly one capturing group (the element text),
        so ``findall`` yields a flat list of strings.
    """
    fragment = re.escape(cls_frag)
    # Rendered as a literal regex quantifier, e.g. "{3,120}".
    text_len = f"{{{min_len},{max_len}}}"
    return re.compile(rf'class="[^"]*{fragment}[^"]*"[^>]*>([^<]{text_len})<')
|
||||
|
||||
|
||||
class HtmlScraperPlugin:
    """
    Config-driven HTML scraper. Supported config keys:
      url               — search URL
      search_param      — query param name
      extra_params      — dict of fixed extra query parameters
      title_class       — CSS class fragment for title elements (class-based strategy)
      author_class      — CSS class fragment for author elements
      link_href_pattern — href regex to find title <a> links (link strategy, e.g. alib)
      brief_class       — CSS class for brief record rows (brief strategy, e.g. shpl)

    The parsing strategy is chosen from the config: ``link_href_pattern`` wins
    over ``brief_class``, and the class-based strategy is the fallback. A key
    whose value is ``None`` is treated as not configured.
    """

    category = "archive_searchers"

    # Upper bound on the number of candidates returned by one search.
    _MAX_RESULTS = 3

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        """Store the plugin settings and derive the rate-limiting key.

        Args:
            plugin_id: Identifier written into each candidate's ``source``.
            name: Display name of the plugin.
            rate_limiter: Object whose ``wait_and_record`` is called before
                every request.
            rate_limit_seconds: Interval passed to the rate limiter.
            auto_queue: Stored as-is; not used inside this class.
            timeout: Request timeout passed to ``httpx.get``.
            config: Scraper configuration (see the class docstring).
        """
        self.plugin_id = plugin_id
        self.name = name
        self._rl = rate_limiter
        self.rate_limit_seconds = rate_limit_seconds
        self.auto_queue = auto_queue
        self.timeout = timeout
        self.config = config
        # Rate limiting is keyed by the host of the search URL; fall back to
        # the plugin id when no URL (or no host) is configured.
        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id

    def search(self, query: str) -> list[CandidateRecord]:
        """Query the configured site and return up to three candidates.

        Args:
            query: Search text sent as the ``search_param`` query parameter.

        Returns:
            Candidate records parsed from the response, or an empty list when
            the server answers with an HTTP error status (4xx/5xx).

        Raises:
            KeyError: If ``search_param`` or ``url`` is missing from the config.
        """
        cfg = self.config
        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)

        params: dict[str, Any] = dict(cfg.get("extra_params") or {})
        params[cfg["search_param"]] = query
        response = httpx.get(
            cfg["url"],
            params=params,
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        # An error page is not a result list; scraping it would only produce
        # bogus candidates (e.g. a "Not Found" heading taken as a title).
        if response.status_code >= 400:
            return []

        html = response.text
        years = _YEAR_RE.findall(html)

        # Strategy: link_href_pattern (alib-style)
        if cfg.get("link_href_pattern") is not None:
            return self._parse_link(html, years, cfg)

        # Strategy: brief_class (shpl-style)
        if cfg.get("brief_class") is not None:
            return self._parse_brief(html, years, cfg)

        # Strategy: title_class + author_class (rusneb-style)
        return self._parse_class(html, years, cfg)

    @staticmethod
    def _cfg_str(cfg: dict[str, Any], key: str, default: str) -> str:
        """Return ``cfg[key]``, or *default* when the key is missing or ``None``."""
        value = cfg.get(key)
        return default if value is None else value

    def _build_records(
        self,
        titles: list[str],
        years: list[str],
        authors: list[str] | tuple[str, ...] = (),
    ) -> list[CandidateRecord]:
        """Pair titles with authors and years by position.

        Authors and years that run out before the titles do are filled with
        empty strings; ``isbn`` and ``publisher`` are always empty.
        """
        return [
            CandidateRecord(
                source=self.plugin_id,
                title=title.strip(),
                author=authors[i].strip() if i < len(authors) else "",
                year=years[i] if i < len(years) else "",
                isbn="",
                publisher="",
            )
            for i, title in enumerate(titles)
        ]

    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        """Class-based strategy: titles and authors located by CSS class fragment."""
        title_re = _cls_re(self._cfg_str(cfg, "title_class", "title"))
        author_re = _cls_re(self._cfg_str(cfg, "author_class", "author"), 3, 80)
        titles = title_re.findall(html)[: self._MAX_RESULTS]
        authors = author_re.findall(html)[: self._MAX_RESULTS]
        return self._build_records(titles, years, authors)

    def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        """Link strategy: titles are the text of ``<a>`` tags whose href matches.

        ``link_href_pattern`` is inserted into the regex unescaped, so it is
        itself a regex; it must not add capturing groups, because ``findall``
        would then return tuples instead of strings.
        """
        href_pat = self._cfg_str(cfg, "link_href_pattern", r"")
        link_re = rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>'
        titles = re.findall(link_re, html)[: self._MAX_RESULTS]
        author_re = _cls_re(self._cfg_str(cfg, "author_class", "author"), 3, 80)
        authors = author_re.findall(html)[: self._MAX_RESULTS]
        return self._build_records(titles, years, authors)

    def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        """Brief strategy: each matched brief-record element becomes a title; no authors."""
        brief_re = _cls_re(self._cfg_str(cfg, "brief_class", "brief"), 3, 120)
        titles = brief_re.findall(html)[: self._MAX_RESULTS]
        return self._build_records(titles, years)
|
||||
Reference in New Issue
Block a user