Source code for veupath_chatbot.services.gene_lookup.scoring
"""Gene-specific relevance scoring for text search results."""
from veupath_chatbot.platform.types import JSONObject
from veupath_chatbot.services.search_rerank import (
score_field_quality,
score_text_match,
)
from .organism import score_organism_match
# Weights used by score_gene_relevance. Each one multiplies the match score
# computed for the corresponding field before it is added to the total, so a
# larger weight means that field contributes more to the final ranking.
_W_GENE_ID = 100.0  # query vs. the "geneId" field
_W_GENE_NAME = 40.0  # query vs. the "geneName" field
_W_ORGANISM = 30.0  # query vs. the "organism" field (score_organism_match)
_W_PRODUCT = 35.0  # query vs. the "product" field
_W_DISPLAY_NAME = 25.0  # query vs. the "displayName" field
_W_FIELD_QUALITY = 20.0  # score_field_quality over the "matchedFields" list
# Extra amount, scaled by the best of the product/displayName/geneName match
# scores, added only when that best score is at least 0.95.
_EXACT_BONUS = 80.0
[docs]
def _text_field(result: JSONObject, key: str) -> str:
    """Return ``result[key]`` as text, mapping a missing or null value to ``""``.

    A plain ``str(result.get(key, ""))`` would turn a JSON ``null`` into the
    literal text ``"None"``, which could then be matched against the query.
    """
    value = result.get(key)
    return "" if value is None else str(value)


def score_gene_relevance(query: str, result: JSONObject) -> float:
    """Score a gene result's relevance to *query*.

    Higher is better. The score is an additive, weighted combination of how
    well the query matches the gene ID, gene name, display name, organism,
    and product, plus a bonus/penalty based on which site-search fields
    matched (``matchedFields``).

    An extra bonus is awarded when the query exactly or near-exactly matches
    a descriptive field (product, displayName, or geneName) so that exact
    hits rank above incidental fuzzy overlap from shared tokens like
    "alpha" or "2".

    Args:
        query: The user's search text.
        result: A single gene search result. The keys read are ``geneId``,
            ``geneName``, ``displayName``, ``organism``, ``product`` and
            ``matchedFields``. Missing or null text fields are treated as
            empty strings; a ``matchedFields`` value that is not a list is
            treated as empty, and non-string entries in it are ignored.

    Returns:
        The combined relevance score.
    """
    gene_id = _text_field(result, "geneId")
    gene_name = _text_field(result, "geneName")
    display_name = _text_field(result, "displayName")
    organism = _text_field(result, "organism")
    product = _text_field(result, "product")

    # Only string entries of a list-valued "matchedFields" are passed on.
    matched_fields = result.get("matchedFields")
    mf_list = matched_fields if isinstance(matched_fields, list) else []
    mf_list_str: list[str] = [x for x in mf_list if isinstance(x, str)]

    id_score = score_text_match(query, gene_id)
    name_score = score_text_match(query, gene_name)
    disp_score = score_text_match(query, display_name)
    prod_score = score_text_match(query, product)

    score = 0.0
    score += _W_GENE_ID * id_score
    score += _W_GENE_NAME * name_score
    score += _W_DISPLAY_NAME * disp_score
    score += _W_ORGANISM * score_organism_match(query, organism)
    score += _W_PRODUCT * prod_score
    score += _W_FIELD_QUALITY * score_field_quality(mf_list_str)

    # Exact/near-exact match bonus — ensures "alpha tubulin 2" beats
    # "casein kinase 2, alpha subunit" which only shares tokens.
    best_desc = max(prod_score, disp_score, name_score)
    if best_desc >= 0.95:
        score += _EXACT_BONUS * best_desc
    return score