Initial commit

Photo-based book cataloger with AI identification. Room → Cabinet → Shelf → Book hierarchy; FastAPI + SQLite backend; vanilla JS SPA; OpenAI-compatible plugin system for boundary detection, text recognition, and archive search.
2026-03-09 14:17:13 +03:00
commit 084d1aebd5
64 changed files with 8605 additions and 0 deletions
--- a/src/plugins/archives/html_scraper.py
+++ b/src/plugins/archives/html_scraper.py
@@ -0,0 +1,121 @@
+"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
+
+import re
+from typing import Any
+from urllib.parse import urlparse
+
+import httpx
+
+from models import CandidateRecord
+
+from ..rate_limiter import RateLimiter
+
+# Four-digit year token delimited by word boundaries: 1000–1999 or 2000–2029.
+_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
+
+
+def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
+    """
+    Compile a regex capturing the text that directly follows a class-tagged opening tag.
+
+    The pattern looks for a double-quoted ``class`` attribute whose value contains
+    *cls_frag* (matched literally), skips the remainder of that tag, and captures
+    the run of ``min_len``..``max_len`` non-``<`` characters that ends at the next ``<``.
+    """
+    fragment = re.escape(cls_frag)
+    text_len = f"{{{min_len},{max_len}}}"
+    pattern = f'class="[^"]*{fragment}[^"]*"[^>]*>([^<]{text_len})<'
+    return re.compile(pattern)
+
+
+class HtmlScraperPlugin:
+    """
+    Config-driven HTML scraper that turns a search-results page into candidates.
+
+    One parsing strategy is chosen per search, in this order of precedence:
+
+      1. link strategy  — when ``link_href_pattern`` is present in the config
+      2. brief strategy — when ``brief_class`` is present in the config
+      3. class strategy — fallback, based on ``title_class`` / ``author_class``
+
+    Supported config keys:
+      url           — search URL (required by ``search``)
+      search_param  — query param name (required by ``search``)
+      extra_params  — dict of fixed extra query parameters
+      title_class   — CSS class fragment for title elements (class-based strategy)
+      author_class  — CSS class fragment for author elements
+      link_href_pattern — href regex to find title <a> links (link strategy, e.g. alib)
+      brief_class   — CSS class for brief record rows (brief strategy, e.g. shpl)
+    """
+
+    category = "archive_searchers"
+
+    # Upper bound on the number of candidates produced by a single search.
+    _MAX_RESULTS = 3
+
+    def __init__(
+        self,
+        plugin_id: str,
+        name: str,
+        rate_limiter: RateLimiter,
+        rate_limit_seconds: float,
+        auto_queue: bool,
+        timeout: int,
+        config: dict[str, Any],
+    ):
+        """
+        Store the plugin settings and derive the rate-limiting key.
+
+        The key is the network location of ``config["url"]``; when the URL is
+        missing or has no network location, ``plugin_id`` is used instead.
+        """
+        self.plugin_id = plugin_id
+        self.name = name
+        self._rl = rate_limiter
+        self.rate_limit_seconds = rate_limit_seconds
+        self.auto_queue = auto_queue
+        self.timeout = timeout
+        self.config = config
+        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
+
+    def search(self, query: str) -> list[CandidateRecord]:
+        """
+        Fetch the configured search page for *query* and parse candidates from it.
+
+        Calls the rate limiter's ``wait_and_record`` for this plugin's domain
+        before issuing the request. Returns at most ``_MAX_RESULTS`` records.
+
+        Raises ``KeyError`` when ``url`` or ``search_param`` is missing from the
+        config; exceptions raised by ``httpx.get`` propagate unchanged.
+        """
+        cfg = self.config
+        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
+        params: dict[str, Any] = dict(cfg.get("extra_params") or {})
+        params[cfg["search_param"]] = query
+        # NOTE(review): the response status is not checked, so an error page is
+        # parsed like a normal result page — confirm whether callers expect that.
+        r = httpx.get(
+            cfg["url"],
+            params=params,
+            timeout=self.timeout,
+            headers={"User-Agent": "Mozilla/5.0"},
+        )
+        html = r.text
+        # Years are collected from the whole page and later paired with titles
+        # purely by position (i-th year with i-th title).
+        years = _YEAR_RE.findall(html)
+
+        # Strategy: link_href_pattern (alib-style)
+        if "link_href_pattern" in cfg:
+            return self._parse_link(html, years, cfg)
+
+        # Strategy: brief_class (shpl-style)
+        if "brief_class" in cfg:
+            return self._parse_brief(html, years, cfg)
+
+        # Strategy: title_class + author_class (rusneb-style)
+        return self._parse_class(html, years, cfg)
+
+    def _records(self, titles: list[str], authors: list[str], years: list[str]) -> list[CandidateRecord]:
+        """
+        Build one record per title, pairing authors and years by list position.
+
+        A title with no author or year at the same index gets an empty string
+        for that field; ``isbn`` and ``publisher`` are always empty.
+        """
+        return [
+            CandidateRecord(
+                source=self.plugin_id,
+                title=title.strip(),
+                author=authors[i].strip() if i < len(authors) else "",
+                year=years[i] if i < len(years) else "",
+                isbn="",
+                publisher="",
+            )
+            for i, title in enumerate(titles)
+        ]
+
+    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
+        """Class strategy: titles and authors come from elements matched by class fragment."""
+        limit = self._MAX_RESULTS
+        titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:limit]
+        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:limit]
+        return self._records(titles, authors, years)
+
+    def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
+        """
+        Link strategy: titles are the texts of <a> elements whose href matches the config regex.
+
+        Authors are still looked up by ``author_class``, as in the class strategy.
+        """
+        limit = self._MAX_RESULTS
+        href_pat = cfg.get("link_href_pattern", r"")
+        # The configured regex is wrapped in a non-capturing group so that a
+        # top-level alternation ("a|b") stays confined to the href attribute.
+        # The link text is read through a named group so that capture groups
+        # inside the configured regex cannot change what is extracted
+        # (re.findall would return tuples instead of strings in that case).
+        link_re = re.compile(
+            rf'<a[^>]+href="[^"]*(?:{href_pat})[^"]*"[^>]*>(?P<_link_text>[^<]{{3,120}})</a>'
+        )
+        titles = [m.group("_link_text") for m in link_re.finditer(html)][:limit]
+        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:limit]
+        return self._records(titles, authors, years)
+
+    def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
+        """Brief strategy: titles come from elements matched by ``brief_class``; no authors."""
+        titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[: self._MAX_RESULTS]
+        return self._records(titles, [], years)