Source code for veupath_chatbot.services.gene_lookup.organism
"""Organism fuzzy matching for gene lookup."""
import re
from veupath_chatbot.integrations.veupathdb.site_search import strip_html_tags
[docs]
def score_organism_match(query: str, organism: str) -> float:
"""Score how well *query* matches *organism* (0.0 = no match, 1.0 = exact).
Handles exact match, substring, genus abbreviation (``P. falciparum``),
organism codes (``pf3d7``), and token-level overlap.
"""
q = query.strip().lower()
o = organism.strip().lower()
if not q or not o:
return 0.0
if q == o:
return 1.0
if q in o:
return 0.85
abbrev_match = re.match(r"^([a-z])\.?\s+(.+)$", q)
if abbrev_match:
genus_initial = abbrev_match.group(1)
rest = abbrev_match.group(2)
if o.startswith(genus_initial) and rest in o:
return 0.80
o_words = o.split()
if len(o_words) >= 2:
gs_initials = o_words[0][0] + o_words[1][0]
strain_part = "".join(o_words[2:])
compact = (gs_initials + strain_part).lower()
q_nospace = (
q.replace(" ", "")
.replace(".", "")
.replace("-", "")
.replace("_", "")
.rstrip("*")
)
if q_nospace == compact:
return 0.75
if q_nospace.startswith(compact) and len(q_nospace) <= len(compact) + 2:
return 0.72
if "_" in q:
prefix = q.split("_", 1)[0].replace(".", "").replace("-", "").lower()
if prefix == compact:
return 0.72
if prefix.startswith(compact) and len(prefix) <= len(compact) + 2:
return 0.68
q_tokens = set(q.split())
o_tokens = set(o.split())
if q_tokens and q_tokens.issubset(o_tokens):
return 0.65
if q_tokens and all(any(qt in ot for ot in o_tokens) for qt in q_tokens):
return 0.55
return 0.0
[docs]
def suggest_organisms(
query: str,
available: list[str],
*,
max_suggestions: int = 5,
min_score: float = 0.40,
) -> list[str]:
"""Return organism names from *available* that fuzzy-match *query*.
:param query: User's organism input.
:param available: List of canonical organism names (from site-search).
:param max_suggestions: Maximum suggestions to return.
:param min_score: Minimum match score to include.
:returns: Suggested organism names, best match first.
"""
if not query or not available:
return []
scored: list[tuple[float, str]] = []
for org in available:
s = score_organism_match(query, org)
if s >= min_score:
scored.append((s, org))
scored.sort(key=lambda x: (-x[0], x[1]))
return [name for _, name in scored[:max_suggestions]]
[docs]
def normalize_organism(raw: str) -> str:
"""Clean organism string; handle JSON array format from site-search."""
s = strip_html_tags(raw or "")
if not s:
return ""
s = s.strip()
if s.startswith("[") and s.endswith("]"):
try:
import json
parsed = json.loads(s)
if isinstance(parsed, list) and parsed:
return strip_html_tags(str(parsed[0])).strip()
except ValueError, TypeError:
pass
return s