Source code for veupath_chatbot.services.gene_lookup.site_search

"""Site-search gene fetching and document parsing."""

from typing import cast

from veupath_chatbot.integrations.veupathdb.site_search import (
    query_site_search,
    strip_html_tags,
)
from veupath_chatbot.platform.types import JSONObject

from .organism import normalize_organism
from .result import build_gene_result

SITE_SEARCH_FETCH_LIMIT = 50


[docs] def parse_site_search_docs(docs: list[object]) -> list[JSONObject]: """Convert raw site-search documents into standardised gene dicts.""" results: list[JSONObject] = [] for doc in docs: if not isinstance(doc, dict): continue gene_id = strip_html_tags(str(doc.get("wdkPrimaryKeyString", ""))).strip() if not gene_id: pk = doc.get("primaryKey") if isinstance(pk, list) and pk: gene_id = str(pk[0]).strip() if not gene_id: continue summary = doc.get("summaryFieldData") if not isinstance(summary, dict): summary = {} doc_organism = normalize_organism(summary.get("TEXT__gene_organism_full", "")) doc_product = strip_html_tags(summary.get("TEXT__gene_product", "")) doc_gene_name = strip_html_tags(summary.get("TEXT__gene_name", "")) doc_gene_type = strip_html_tags(summary.get("TEXT__gene_type", "")) hyperlink_name = strip_html_tags(str(doc.get("hyperlinkName", ""))).strip() display_name = hyperlink_name or doc_gene_name or doc_product if not doc_organism and doc.get("organism"): doc_organism = normalize_organism(str(doc.get("organism", ""))) found = doc.get("foundInFields") matched_fields: list[str] = [] if isinstance(found, dict): for field_key, field_values in found.items(): if isinstance(field_values, list) and field_values: clean_key = field_key.replace("MULTITEXT__", "").replace( "TEXT__", "" ) matched_fields.append(clean_key) results.append( build_gene_result( gene_id=gene_id, display_name=display_name, organism=doc_organism, product=doc_product, gene_name=doc_gene_name, gene_type=doc_gene_type, matched_fields=matched_fields, ) ) return results
[docs] async def fetch_site_search_genes( site_id: str, search_text: str, *, organisms: list[str] | None = None, limit: int = SITE_SEARCH_FETCH_LIMIT, ) -> tuple[list[JSONObject], list[str], int]: """Run a single site-search query and return parsed results. :returns: ``(gene_results, available_organisms, total_count)`` """ data = await query_site_search( site_id, search_text=search_text, document_type="gene", organisms=organisms, limit=limit, offset=0, ) data_dict = data if isinstance(data, dict) else {} sr_raw = data_dict.get("searchResults") sr = sr_raw if isinstance(sr_raw, dict) else {} total = sr.get("totalCount", 0) docs = sr.get("documents") doc_list = docs if isinstance(docs, list) else [] results = parse_site_search_docs(cast(list[object], doc_list)) org_counts = data_dict.get("organismCounts") orgs = sorted(org_counts.keys()) if isinstance(org_counts, dict) else [] return results, orgs, total if isinstance(total, int) else len(results)