Source code for veupath_chatbot.services.gene_lookup.enrich

"""Enrich sparse gene results with WDK metadata."""

from collections.abc import Callable

from veupath_chatbot.integrations.veupathdb.site_search import strip_html_tags
from veupath_chatbot.platform.logging import get_logger
from veupath_chatbot.platform.types import JSONObject

from .organism import normalize_organism
from .wdk import resolve_gene_ids

# Module-level logger, obtained from ``get_logger`` with this module's name.
logger = get_logger(__name__)

# Fields to fill from WDK metadata when absent in the original result.
# Each entry is (field_name, transform_fn).
#
# ``_merge_meta`` walks this list in order.  A field is filled only when it is
# falsy in the result and truthy in the metadata, and the metadata value is
# coerced with ``str()`` *before* being handed to ``transform_fn`` -- so every
# transform must accept a single ``str`` and return the value to store.
# ``str`` is used as a pass-through for fields that need no clean-up.
_ENRICHMENT_FIELDS: list[tuple[str, Callable[[str], str]]] = [
    ("organism", normalize_organism),
    ("product", strip_html_tags),
    ("geneName", strip_html_tags),
    ("geneType", str),
    ("location", str),
]


def _merge_meta(merged: JSONObject, meta: JSONObject) -> None:
    """Copy missing enrichment fields from *meta* into *merged*, in place.

    For every ``(field, transform)`` pair in ``_ENRICHMENT_FIELDS`` the field
    is filled only when *merged* holds a falsy value for it and *meta* holds a
    truthy one; the metadata value is stringified and then passed through the
    transform.  Afterwards, if ``displayName`` is still falsy, it is set to the
    first truthy value among ``merged["geneName"]``, ``merged["product"]``,
    ``meta["product"]`` and ``merged["geneId"]`` (stringified), or ``""`` when
    none of them is set.

    Nothing is returned; *merged* is mutated and *meta* is only read.
    """
    for key, convert in _ENRICHMENT_FIELDS:
        if merged.get(key):
            # The original result already carries this field; keep it.
            continue
        raw = meta.get(key)
        if raw:
            merged[key] = convert(str(raw))

    if merged.get("displayName"):
        return

    # Fallback chain for the label, in priority order.  The raw metadata
    # product is consulted after the (possibly just-filled) merged product.
    fallbacks = (
        (merged, "geneName"),
        (merged, "product"),
        (meta, "product"),
        (merged, "geneId"),
    )
    label: object = ""
    for source, key in fallbacks:
        candidate = source.get(key)
        if candidate:
            label = candidate
            break
    merged["displayName"] = str(label)


# Upper bound on the number of gene IDs sent to the WDK per enrichment call.
# NOTE(review): presumably chosen to keep the WDK request small -- confirm
# before raising it.
_MAX_ENRICH_IDS = 50


async def enrich_sparse_gene_results(
    site_id: str,
    results: list[JSONObject],
    limit: int,
) -> list[JSONObject]:
    """Enrich results that lack organism/product via WDK standard reporter.

    Site-search only returns ``summaryFieldData`` for fields where the query
    matched.  When a gene matches in literature (e.g. MULTIgene_PubMed),
    organism/product are absent.  We fetch full metadata from the WDK to fill
    the gaps.

    Enrichment is best-effort: if there is nothing to enrich, the WDK call
    raises, the response carries an ``error``, or it contains no usable
    ``records``, the original entries are returned unchanged.  In every case
    the returned list is a new list truncated to *limit* entries, so the
    result length does not depend on whether enrichment succeeded.

    Args:
        site_id: Site identifier forwarded to ``resolve_gene_ids``.
        results: Gene result objects.  Entries that are not dicts, or that
            have no ``geneId``, are passed through untouched.
        limit: Maximum number of entries to return (applied as a slice).

    Returns:
        The results in their original order.  Entries for which WDK metadata
        was found are shallow copies with missing fields filled in by
        ``_merge_meta``; the input dicts themselves are never mutated.
    """
    # Collect each sparse gene ID once, in first-seen order, so duplicates do
    # not consume slots under the per-call cap.  IDs are stripped so they
    # compare equal to the stripped keys built from the WDK records below.
    pending: dict[str, None] = {}
    for row in results:
        if not isinstance(row, dict) or not row.get("geneId"):
            continue
        if row.get("organism") and row.get("product"):
            continue
        gene_id = str(row["geneId"]).strip()
        if gene_id:
            pending[gene_id] = None
    ids_to_enrich = list(pending)

    if not ids_to_enrich:
        return results[:limit]

    try:
        resolved = await resolve_gene_ids(
            site_id,
            ids_to_enrich[:_MAX_ENRICH_IDS],
            record_type="transcript",
        )
    except Exception as exc:
        # Deliberately broad: enrichment is optional and must never break the
        # lookup that produced ``results``.
        logger.debug(
            "Gene enrichment via WDK skipped",
            site_id=site_id,
            count=len(ids_to_enrich),
            error=str(exc),
        )
        return results[:limit]

    if resolved.get("error"):
        return results[:limit]

    records = resolved.get("records")
    if not isinstance(records, list) or not records:
        return results[:limit]

    # Index metadata by gene ID; when several records share a gene ID the
    # last one wins.
    by_id: dict[str, JSONObject] = {
        str(rec["geneId"]).strip(): rec
        for rec in records
        if isinstance(rec, dict) and rec.get("geneId")
    }

    enriched: list[JSONObject] = []
    for row in results:
        gene_id = row.get("geneId") if isinstance(row, dict) else None
        meta = by_id.get(str(gene_id).strip()) if gene_id else None
        if not meta:
            enriched.append(row)
            continue
        # Work on a shallow copy so the caller's dict is left untouched.
        merged = dict(row)
        _merge_meta(merged, meta)
        enriched.append(merged)

    return enriched[:limit]