Source code for veupath_chatbot.services.experiment.classification
"""Gene record classification by experiment category membership.
Classifies WDK result records as TP / FP / FN / TN based on
whether their gene ID appears in the experiment's curated gene sets.
Handles WDK transcript ID version suffixes (e.g. "GENE.1" -> "GENE").
"""
from veupath_chatbot.platform.types import JSONObject
from veupath_chatbot.services.wdk.helpers import extract_pk
[docs]
def classify_records(
records: list[JSONObject],
tp_ids: set[str],
fp_ids: set[str],
fn_ids: set[str],
tn_ids: set[str],
) -> list[JSONObject]:
"""Add ``_classification`` field to records based on gene ID membership.
For each record, extracts the primary key and checks membership in
the four gene-set categories. WDK transcript IDs may include a
version suffix (e.g. ``"PF3D7_0100100.1"``); the function also
checks the base ID with the suffix stripped.
:param records: WDK answer records (list of dicts).
:param tp_ids: True-positive gene IDs.
:param fp_ids: False-positive gene IDs.
:param fn_ids: False-negative gene IDs.
:param tn_ids: True-negative gene IDs.
:returns: New list of records, each with a ``_classification`` field.
"""
classified: list[JSONObject] = []
for rec in records:
if not isinstance(rec, dict):
continue
gene_id = extract_pk(rec)
classification = _classify_gene_id(gene_id, tp_ids, fp_ids, fn_ids, tn_ids)
classified.append({**rec, "_classification": classification})
return classified
def _classify_gene_id(
gene_id: str | None,
tp_ids: set[str],
fp_ids: set[str],
fn_ids: set[str],
tn_ids: set[str],
) -> str | None:
"""Return the classification label for a single gene ID, or ``None``."""
if not gene_id:
return None
# WDK transcript IDs include a version suffix (e.g. ".1").
# Experiment gene sets store the base gene ID without it.
candidates = [gene_id]
dot = gene_id.rfind(".")
if dot > 0:
candidates.append(gene_id[:dot])
for gid in candidates:
if gid in tp_ids:
return "TP"
if gid in fp_ids:
return "FP"
if gid in fn_ids:
return "FN"
if gid in tn_ids:
return "TN"
return None