Source code for veupath_chatbot.ai.tools.query_validation

from typing import cast

from veupath_chatbot.platform.types import JSONArray, JSONObject

# Generic words, in singular and plural form, that carry no domain-specific
# meaning on their own. record_type_query_error() rejects a query whose tokens
# all come from this set.
VAGUE_RECORD_TYPE_TOKENS = {
    stem + suffix
    for stem in ("gene", "transcript", "record", "type", "feature")
    for suffix in ("", "s")
}


def tokenize_query(text: str) -> list[str]:
    """Split *text* into lowercase keyword tokens.

    A token starts with an ASCII letter or digit and continues with at least
    two more characters drawn from letters, digits, ``.``, ``_`` or ``-``, so
    every token is three or more characters long. Shorter words are dropped.
    Falsy input (``None`` or ``""``) yields an empty list.

    :param text: Raw query string.
    :returns: Tokens in order of appearance.
    """
    import re

    lowered = text.lower() if text else ""
    token_pattern = re.compile(r"[A-Za-z0-9][A-Za-z0-9._-]{2,}")
    return [match.group(0) for match in token_pattern.finditer(lowered)]
def record_type_query_error(query: str) -> JSONObject | None:
    """Check whether a ``get_record_types`` query is specific enough.

    An empty or whitespace-only query is not treated as an error. Otherwise
    the query must yield at least two tokens from ``tokenize_query``, and at
    least one of them must not be in ``VAGUE_RECORD_TYPE_TOKENS``.

    :param query: User query string.
    :returns: Error dict describing why the query is too vague, or None when
        the query is acceptable.
    """
    cleaned = query.strip() if query else ""
    if cleaned == "":
        return None

    keywords = tokenize_query(cleaned)

    if len(keywords) >= 2:
        has_specific_keyword = any(
            keyword not in VAGUE_RECORD_TYPE_TOKENS for keyword in keywords
        )
        if has_specific_keyword:
            return None
        # Every token is generic (e.g. "gene transcript").
        return {
            "error": "query_too_vague",
            "message": "Query is too generic; include at least one domain-specific keyword (not only 'gene'/'transcript').",
            "query": cleaned,
            "tokens": cast(JSONArray, keywords),
        }

    # Fewer than two usable tokens survived tokenization.
    return {
        "error": "query_too_vague",
        "message": "get_record_types(query=...) requires 2+ specific keywords; one-word queries are rejected.",
        "query": cleaned,
        "examples": [
            "gametocyte RNA-seq",
            "single cell atlas",
            "vector salivary gland",
            "metabolic pathway",
        ],
        "avoid": ["gene", "transcript", "record type"],
    }
def search_query_error(query: str, *, has_keywords: bool = False) -> JSONObject | None:
    """Check whether a ``search_for_searches`` query is usable.

    Without keyword hints the query must be non-empty and must yield at least
    two tokens from ``tokenize_query``. When ``has_keywords`` is True, any
    query text (including an empty one) is accepted because the keywords
    provide the specificity.

    :param query: User query string.
    :param has_keywords: Whether the caller also provided keyword hints.
    :returns: Error dict if the query is missing or too vague, otherwise None.
    """
    cleaned = query.strip() if query else ""

    if has_keywords:
        # Keyword hints supply the specificity, so no check on the text applies.
        return None

    if not cleaned:
        return {
            "error": "query_required",
            "message": "search_for_searches(query=...) requires a non-empty query.",
        }

    if len(tokenize_query(cleaned)) >= 2:
        return None

    return {
        "error": "query_too_vague",
        "message": "search_for_searches(query=...) requires 2+ specific keywords; one-word/vague queries are rejected.",
        "query": cleaned,
        "examples": [
            "vector salivary gland",
            "gametocyte RNA-seq",
            "drug resistance markers",
            "liver stage expression",
        ],
    }