Source code for veupath_chatbot.services.research.clients.preprint

"""Preprint site search client (bioRxiv, medRxiv)."""

import asyncio
import re
from typing import Literal

import httpx

from veupath_chatbot.platform.types import JSONObject, JSONValue
from veupath_chatbot.services.research.clients._base import (
    BaseClient,
    build_response,
    make_citation,
)
from veupath_chatbot.services.research.utils import (
    BROWSER_USER_AGENT,
    decode_ddg_redirect,
    fetch_page_summary,
    strip_tags,
)


class PreprintClient(BaseClient):
    """Client for preprint site searches via DuckDuckGo.

    Preprint search has a unique signature (``site``, ``source``,
    ``include_abstract``) and a post-processing step that fetches page
    summaries, so it keeps a custom ``search`` method.  Per-item parsing
    still goes through ``_parse_item`` / ``_build_results``.
    """

    # Source label read by ``_parse_item`` when it builds citations.
    # ``search`` overwrites it on every call; the class-level value is only
    # the default used if ``_parse_item`` runs before any search.
    _current_source: Literal["biorxiv", "medrxiv"] = "biorxiv"

    async def search(
        self,
        query: str,
        *,
        site: str,
        source: Literal["biorxiv", "medrxiv"],
        limit: int,
        include_abstract: bool,
        abstract_max_chars: int,
    ) -> JSONObject:
        """Search preprint sites using DuckDuckGo.

        :param query: Free-text query; it is combined with a ``site:`` filter.
        :param site: Domain the DuckDuckGo search is restricted to.
        :param source: Source label attached to the citations and the response.
        :param limit: Maximum number of search hits to collect.
        :param include_abstract: When true, each result page is fetched and its
            summary stored under both ``"abstract"`` and ``"snippet"``.
        :param abstract_max_chars: Forwarded to ``_build_results`` and used as
            ``max_chars`` for ``fetch_page_summary``.
        :returns: The object produced by ``build_response``.
        """
        raw_items = await self._fetch_raw(query, site=site, limit=limit)

        # Set the source right before the ``_build_results`` call, with no
        # ``await`` in between, so another ``search`` running on the same
        # instance cannot change it between the assignment and that call.
        self._current_source = source
        results, citations = self._build_results(
            raw_items, abstract_max_chars=abstract_max_chars
        )

        if include_abstract and results:
            # ``_parse_item`` stores ``None`` when a hit has no usable URL.
            # Such entries cannot be summarised, so they are skipped instead
            # of issuing a fetch that can only fail.
            targets = []
            for item in results:
                if not isinstance(item, dict):
                    continue
                url = item.get("url")
                if isinstance(url, str) and url:
                    targets.append((item, url))

            if targets:
                async with httpx.AsyncClient(
                    timeout=min(self._timeout, 15.0),
                    headers={
                        "User-Agent": BROWSER_USER_AGENT,
                        "Accept-Language": "en-US,en;q=0.9",
                    },
                ) as client:
                    # Best effort: a failed fetch comes back as an exception
                    # object and simply leaves that result without an abstract.
                    summaries = await asyncio.gather(
                        *[
                            fetch_page_summary(
                                client, url, max_chars=abstract_max_chars
                            )
                            for _, url in targets
                        ],
                        return_exceptions=True,
                    )

                for (item, _), summary in zip(targets, summaries, strict=True):
                    if not isinstance(summary, str):
                        continue
                    text = summary.strip()
                    if text:
                        item["abstract"] = text
                        item["snippet"] = text

        return build_response(
            query=query, source=source, results=results, citations=citations
        )

    # -- fetch -------------------------------------------------------------

    async def _fetch_raw(self, query: str, *, site: str, limit: int) -> list[JSONValue]:
        """Fetch DuckDuckGo HTML results for ``site:{site} {query}``.

        :param query: Free-text query.
        :param site: Domain used in the ``site:`` filter.
        :param limit: Maximum number of hits to return.
        :returns: A list of dicts with ``"_title"`` (tags stripped) and
            ``"_url"`` (the link passed through ``decode_ddg_redirect``).
        :raises httpx.HTTPStatusError: If the response status is an error
            (raised by ``raise_for_status``).
        """
        ddg_url = "https://duckduckgo.com/html/"
        params = {"q": f"site:{site} {query}"}
        headers = {"User-Agent": "pathfinder-planner/1.0"}
        async with httpx.AsyncClient(timeout=self._timeout, headers=headers) as client:
            resp = await client.get(ddg_url, params=params, follow_redirects=True)
            resp.raise_for_status()
            html = resp.text or ""

        items: list[JSONValue] = []
        # Each hit is an anchor carrying the ``result__a`` class; group 1 is
        # its href and group 2 its inner HTML (the title).
        for m in re.finditer(
            r'class="result__a"[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
            html,
            flags=re.IGNORECASE,
        ):
            if len(items) >= limit:
                break
            items.append(
                {
                    "_title": strip_tags(m.group(2)),
                    "_url": decode_ddg_redirect(m.group(1)),
                }
            )
        return items

    # -- parse -------------------------------------------------------------

    def _parse_item(
        self, raw: JSONValue, *, abstract_max_chars: int
    ) -> tuple[JSONObject, JSONObject] | None:
        """Turn one raw hit from ``_fetch_raw`` into a result and a citation.

        :param raw: A raw item; anything that is not a dict is rejected.
        :param abstract_max_chars: Unused here; presumably accepted because
            ``_build_results`` forwards it (confirm in ``BaseClient``).
        :returns: ``(result, citation)``, or ``None`` when ``raw`` is not a
            dict.  ``result["snippet"]`` starts as ``None``; ``search`` may
            fill it in later.
        """
        if not isinstance(raw, dict):
            return None

        title = str(raw.get("_title") or "")
        url_str = raw.get("_url")
        url_str = url_str if isinstance(url_str, str) else None
        source = self._current_source

        result: JSONObject = {"title": title, "url": url_str, "snippet": None}
        citation = make_citation(
            source=source,
            id_prefix=source,
            # Fall back to the URL, then to a generic label, when the hit
            # has no title.
            title=title or (url_str or f"{source} result"),
            url=url_str,
        )
        return result, citation