Files
bookshelf/config/functions.default.yaml
Petr Polezhaev b8f82607f9 Fix archive plugins for НЭБ and Alib; add network integration tests
- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text
  strategy (Alib entries from <p><b>), Windows-1251 encoding support,
  _cls_inner_texts() helper that strips inner HTML tags
- rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL
  title:(words) AND author:(word) query format
- config: update rusneb (img_alt + correct author_class) and alib_web
  (encoding + bold_text) to match fixed plugin strategies
- tests: add tests/test_archives.py with network-marked tests for all six
  archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404)
- presubmit: exclude network tests from default run (-m "not network")

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 22:59:19 +03:00

104 lines
2.9 KiB
YAML

# Function configurations — dict per category (not lists).
# AI functions reference a model from models.*.yaml.
# Archive functions specify a type and optional config dict.
# Keys within each category serve as plugin_id; must be unique across all categories.
# Override individual functions in functions.user.yaml.
functions:
  # Layout: functions.<category>.<plugin_id>.<setting>. Each category is a
  # mapping (not a list); the key of every entry is its plugin_id.

  # ── Boundary detection: image → {boundaries: [...], confidence: 0.x}
  # ai_shelf_boundaries / ai_book_boundaries stored as {functionId: [fractions]} per entity.
  boundary_detectors:
    shelves:  # key = plugin_id = target; runs on cabinet images
      model: vl_detect_shelves
      max_image_px: 1600
      auto_queue: false
      rate_limit_seconds: 0
      timeout: 30
    books:  # key = plugin_id = target; runs on shelf images
      model: vl_detect_books
      max_image_px: 1600
      auto_queue: false
      rate_limit_seconds: 0
      timeout: 30

  # ── Text recognition: spine image → {raw_text, title, author, year, publisher, other}
  text_recognizers:
    recognize:
      model: vl_recognize
      max_image_px: 1600
      auto_queue: true
      rate_limit_seconds: 0
      timeout: 30

  # ── Book identification: raw_text → {title, author, year, isbn, publisher, confidence}
  book_identifiers:
    identify:
      model: ai_identify
      confidence_threshold: 0.8
      auto_queue: false
      rate_limit_seconds: 0
      timeout: 30

  # ── Archive searchers: query → [{source, title, author, year, isbn, publisher}, ...]
  # Each entry names a plugin `type`; type-specific options go in the optional
  # `config` mapping.
  archive_searchers:
    openlibrary:
      name: "OpenLibrary"
      type: openlibrary
      auto_queue: true
      rate_limit_seconds: 5
      timeout: 8

    rsl:
      name: "РГБ"
      type: rsl
      auto_queue: true
      rate_limit_seconds: 5
      timeout: 8

    rusneb:
      name: "НЭБ"
      type: html_scraper
      auto_queue: true
      rate_limit_seconds: 5
      timeout: 8
      config:
        url: "https://rusneb.ru/search/"
        search_param: q
        # img_alt strategy: titles are read from <img alt> attributes.
        img_alt: true
        author_class: "search-list__item_subtext"

    alib_web:
      name: "Alib (web)"
      type: html_scraper
      auto_queue: false
      rate_limit_seconds: 5
      timeout: 8
      config:
        url: "https://www.alib.ru/find3.php4"
        search_param: tfind
        # Values are quoted on purpose so they stay strings, not integers.
        extra_params: {f: "5", s: "0"}
        # The site serves Windows-1251 pages.
        encoding: "cp1251"
        # bold_text strategy: entries are read from <p><b> elements.
        bold_text: true

    nlr:
      name: "НЛР"
      type: sru_catalog
      auto_queue: false
      rate_limit_seconds: 5
      timeout: 8
      config:
        # NOTE(review): the network tests mark this archive xfail because the
        # endpoint was returning HTTP 404 — confirm the URL (and whether an
        # https:// variant exists) before enabling auto_queue.
        url: "http://www.nlr.ru/search/query"
        query_prefix: "title="

    shpl:
      name: "ШПИЛ"
      type: html_scraper
      auto_queue: false
      rate_limit_seconds: 5
      timeout: 8
      config:
        # NOTE(review): the network tests mark this archive xfail because the
        # endpoint was returning HTTP 404 — confirm the URL before enabling
        # auto_queue.
        url: "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
        search_param: S21ALL
        extra_params:
          C21COM: S
          I21DBN: BIBL
          P21DBN: BIBL
          S21FMT: briefWebRus
          Z21ID: ""
        brief_class: "brief"