Source code for veupath_chatbot.services.gene_sets.ensemble
"""Ensemble gene scoring — frequency across multiple gene sets."""
from collections import Counter
from typing import TypedDict
[docs]
class EnsembleScore(TypedDict):
"""A single gene's ensemble score."""
geneId: str
frequency: float
count: int
total: int
inPositives: bool
[docs]
def compute_ensemble_scores(
gene_sets: list[list[str]],
positive_controls: list[str] | None = None,
) -> list[EnsembleScore]:
"""Score genes by how frequently they appear across gene sets.
Returns a list of EnsembleScore dicts sorted by frequency (desc),
then gene ID (asc).
"""
if not gene_sets:
return []
total = len(gene_sets)
counts: Counter[str] = Counter()
for gs in gene_sets:
counts.update(gs)
positives = set(positive_controls) if positive_controls else set()
scores: list[EnsembleScore] = [
EnsembleScore(
geneId=gene_id,
frequency=count / total,
count=count,
total=total,
inPositives=gene_id in positives,
)
for gene_id, count in counts.items()
]
scores.sort(key=lambda r: (-r["frequency"], r["geneId"]))
return scores