"""Preprint site search client (bioRxiv, medRxiv)."""
import asyncio
import re
from typing import Literal
import httpx
from veupath_chatbot.platform.types import JSONObject, JSONValue
from veupath_chatbot.services.research.clients._base import (
BaseClient,
build_response,
make_citation,
)
from veupath_chatbot.services.research.utils import (
BROWSER_USER_AGENT,
decode_ddg_redirect,
fetch_page_summary,
strip_tags,
)
[docs]
class PreprintClient(BaseClient):
    """Client for preprint site searches via DuckDuckGo.

    Preprint search has a unique signature (``site``, ``source``,
    ``include_abstract``) and a post-processing step that fetches page
    summaries, so it keeps a custom ``search`` method. Per-item parsing
    still goes through ``_parse_item`` / ``_build_results``.
    """

    # Source label for the search being parsed: ``search`` assigns it right
    # before calling ``_build_results`` and ``_parse_item`` reads it back.
    _current_source: Literal["biorxiv", "medrxiv"] = "biorxiv"

    async def search(
        self,
        query: str,
        *,
        site: str,
        source: Literal["biorxiv", "medrxiv"],
        limit: int,
        include_abstract: bool,
        abstract_max_chars: int,
    ) -> JSONObject:
        """Search a preprint site using DuckDuckGo.

        Args:
            query: Free-text search terms.
            site: Domain placed in the DuckDuckGo ``site:`` filter.
            source: Label used for the citations and the response.
            limit: Maximum number of result links to keep.
            include_abstract: When true, fetch a page summary for every
                result that has a URL and store it under both
                ``abstract`` and ``snippet``.
            abstract_max_chars: Forwarded to ``_build_results`` and used as
                ``max_chars`` for each page-summary fetch.

        Returns:
            The object built by ``build_response`` from the query, source,
            results and citations.

        Raises:
            httpx.HTTPStatusError: If the DuckDuckGo request returns an
                error status (raised inside ``_fetch_raw``).
        """
        raw_items = await self._fetch_raw(query, site=site, limit=limit)

        # ``_parse_item`` takes the source label from the instance, so it is
        # assigned immediately before the (non-awaited) ``_build_results``
        # call. NOTE(review): presumably ``_build_results`` is the base-class
        # hook that invokes ``_parse_item`` per raw item -- confirm.
        self._current_source = source
        results, citations = self._build_results(
            raw_items, abstract_max_chars=abstract_max_chars
        )

        if include_abstract and results:
            # Only results carrying a usable URL can be summarised;
            # ``_parse_item`` stores ``None`` when the raw URL is not a string.
            fetchable: list[tuple[JSONObject, str]] = []
            for item in results:
                if not isinstance(item, dict):
                    continue
                url = item.get("url")
                if isinstance(url, str) and url:
                    fetchable.append((item, url))

            if fetchable:
                async with httpx.AsyncClient(
                    timeout=min(self._timeout, 15.0),
                    headers={
                        "User-Agent": BROWSER_USER_AGENT,
                        "Accept-Language": "en-US,en;q=0.9",
                    },
                ) as client:
                    # Best effort: one failing page must not fail the whole
                    # search, so exceptions come back as values and are
                    # skipped by the ``isinstance`` check below.
                    summaries = await asyncio.gather(
                        *[
                            fetch_page_summary(
                                client, url, max_chars=abstract_max_chars
                            )
                            for _, url in fetchable
                        ],
                        return_exceptions=True,
                    )
                for (item, _), summary in zip(fetchable, summaries, strict=True):
                    if not isinstance(summary, str):
                        continue
                    text = summary.strip()
                    if text:
                        item["abstract"] = text
                        item["snippet"] = text

        return build_response(
            query=query, source=source, results=results, citations=citations
        )

    # -- fetch -------------------------------------------------------------

    async def _fetch_raw(self, query: str, *, site: str, limit: int) -> list[JSONValue]:
        """Fetch DuckDuckGo HTML results for ``query`` restricted to ``site``.

        Args:
            query: Free-text search terms.
            site: Domain placed in the ``site:`` filter of the query string.
            limit: Maximum number of result links to return.

        Returns:
            At most ``limit`` dicts, each with a ``_title`` (tags stripped)
            and a ``_url`` (passed through ``decode_ddg_redirect``).

        Raises:
            httpx.HTTPStatusError: If the response has an error status.
        """
        ddg_url = "https://duckduckgo.com/html/"
        params = {"q": f"site:{site} {query}"}
        headers = {"User-Agent": "pathfinder-planner/1.0"}
        async with httpx.AsyncClient(timeout=self._timeout, headers=headers) as client:
            resp = await client.get(ddg_url, params=params, follow_redirects=True)
            resp.raise_for_status()
            html = resp.text or ""

        items: list[JSONValue] = []
        # Group 1 is the link href, group 2 the anchor's inner HTML. DOTALL
        # lets an anchor whose title wraps across lines still match.
        for m in re.finditer(
            r'class="result__a"[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
            html,
            flags=re.IGNORECASE | re.DOTALL,
        ):
            if len(items) >= limit:
                break
            items.append(
                {
                    "_title": strip_tags(m.group(2)),
                    "_url": decode_ddg_redirect(m.group(1)),
                }
            )
        return items

    # -- parse -------------------------------------------------------------

    def _parse_item(
        self, raw: JSONValue, *, abstract_max_chars: int
    ) -> tuple[JSONObject, JSONObject] | None:
        """Convert one raw item from ``_fetch_raw`` into a result and citation.

        Args:
            raw: A raw item; anything that is not a dict is rejected.
            abstract_max_chars: Accepted but not used here. NOTE(review):
                presumably present because ``_build_results`` forwards its
                keyword arguments to this hook -- confirm against BaseClient.

        Returns:
            ``(result, citation)``, or ``None`` when ``raw`` is not a dict.
            ``result`` has ``title``, ``url`` (``None`` unless the raw URL is
            a string) and a ``snippet`` of ``None``.
        """
        if not isinstance(raw, dict):
            return None
        title = str(raw.get("_title") or "")
        url_str = raw.get("_url")
        url_str = url_str if isinstance(url_str, str) else None
        source = self._current_source
        result: JSONObject = {"title": title, "url": url_str, "snippet": None}
        citation = make_citation(
            source=source,
            id_prefix=source,
            # Fall back to the URL, then a generic label, when no title exists.
            title=title or (url_str or f"{source} result"),
            url=url_str,
        )
        return result, citation