Source code for veupath_chatbot.services.gene_lookup.scoring

"""Gene-specific relevance scoring for text search results."""

from veupath_chatbot.platform.types import JSONObject
from veupath_chatbot.services.search_rerank import (
    score_field_quality,
    score_text_match,
)

from .organism import score_organism_match

_W_GENE_ID = 100.0
_W_GENE_NAME = 40.0
_W_ORGANISM = 30.0
_W_PRODUCT = 35.0
_W_DISPLAY_NAME = 25.0
_W_FIELD_QUALITY = 20.0
_EXACT_BONUS = 80.0



[docs]
def score_gene_relevance(query: str, result: JSONObject) -> float:
    """Score a gene result's relevance to *query*.

    Higher is better.  The score is an additive combination of how well
    the query matches the gene ID, gene name, organism, and product,
    plus a bonus/penalty based on which site-search fields matched.

    An extra bonus is awarded when the query exactly matches a descriptive
    field (product, displayName) so that exact hits always rank above
    incidental fuzzy overlap from shared tokens like "alpha" or "2".
    """
    gene_id = str(result.get("geneId", ""))
    gene_name = str(result.get("geneName", ""))
    display_name = str(result.get("displayName", ""))
    organism = str(result.get("organism", ""))
    product = str(result.get("product", ""))
    matched_fields = result.get("matchedFields")
    mf_list = matched_fields if isinstance(matched_fields, list) else []
    mf_list_str: list[str] = [x for x in mf_list if isinstance(x, str)]

    id_score = score_text_match(query, gene_id)
    name_score = score_text_match(query, gene_name)
    disp_score = score_text_match(query, display_name)
    prod_score = score_text_match(query, product)

    score = 0.0
    score += _W_GENE_ID * id_score
    score += _W_GENE_NAME * name_score
    score += _W_DISPLAY_NAME * disp_score
    score += _W_ORGANISM * score_organism_match(query, organism)
    score += _W_PRODUCT * prod_score
    score += _W_FIELD_QUALITY * score_field_quality(mf_list_str)

    # Exact/near-exact match bonus — ensures "alpha tubulin 2" beats
    # "casein kinase 2, alpha subunit" which only shares tokens.
    best_desc = max(prod_score, disp_score, name_score)
    if best_desc >= 0.95:
        score += _EXACT_BONUS * best_desc

    return score