## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbart Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Insititue.
"""phenoscore module
This module contains functions for calculating relative phenotypes from CRISPR screens
datasets.
"""
import numpy as np
import anndata as ad
import pandas as pd
from .delta import (
compareByReplicates, compareByTargetGroup,
getPhenotypeData,
calculateDelta,
getBestTargetByTSS,
generatePseudoGeneAnnData
)
from .deseq import runDESeq, extractDESeqResults
from ._annotate import annotateScoreTable
from .phenostat import matrixStat, multipleTestsCorrection
[docs]def runPhenoScore(adata, cond_ref, cond_test, score_level,
var_names='target', collapse_var=False,
test='ttest', growth_rate=1, n_reps='auto', keep_top_n = None,
num_pseudogenes='auto', pseudogene_size='auto',
count_layer=None, count_filter_type='mean', count_filter_threshold=40,
ctrl_label='negative_control'
):
"""Calculate phenotype score and p-values when comparing `cond_test` vs `cond_ref`.
Args:
adata (AnnData): AnnData object
cond_ref (str): condition reference
cond_test (str): condition test
score_level (str): score level
var_names (str): variable names to use as index in the result dataframe
collapse_var (str): variable to use for `getBestTargetByTSS` function, default is False
test (str): test to use for calculating p-value ('MW': Mann-Whitney U rank; 'ttest' : t-test)
growth_rate (int): growth rate
n_reps (int): number of replicates
keep_top_n (int): number of top guides to keep per target
num_pseudogenes (int): number of pseudogenes to generate
pseudogene_size (int): number of sgRNA elements in each pseudogene
count_layer (str): count layer to use for calculating score, default is None (use default count layer in adata.X)
count_filter_type (str): filter type for counts, default is 'mean'
count_filter_threshold (int): filter threshold for counts, default is 40
ctrl_label (str): control label, default is 'negative_control'
Returns:
str: result name
pd.DataFrame: result dataframe
"""
adat = adata.copy()
if 'condition' not in adat.obs.columns:
raise ValueError("The AnnData object must have a 'condition' column in its obs.")
# format result name
result_name = f'{cond_test}_vs_{cond_ref}'
print(f'\t{cond_test} vs {cond_ref}')
# set n_reps if not provided
if n_reps == 'auto':
n_reps = min(
adat.obs.query(f'condition=="{cond_ref}"').shape[0],
adat.obs.query(f'condition=="{cond_test}"').shape[0]
)
# check if count_layer exists
if count_layer is None:
pass
elif count_layer not in adat.layers.keys():
raise ValueError(f"Layer '{count_layer}' not found in adata.layers.keys().")
elif count_layer in adat.layers.keys():
adat.X = adat.layers[count_layer].copy()
# calc phenotype score and p-value
if score_level in ['compare_reps']:
# prep counts for phenoScore calculation
df_cond_ref = adat[adat.obs.query(f'condition=="{cond_ref}"').index[:n_reps],].to_df(count_layer).T
df_cond_test = adat[adat.obs.query(f'condition=="{cond_test}"').index[:n_reps],].to_df(count_layer).T
result = compareByReplicates(
adata=adat,
df_cond_ref=df_cond_ref,
df_cond_test=df_cond_test,
var_names=var_names,
test=test,
ctrl_label=ctrl_label,
growth_rate=growth_rate,
filter_type=count_filter_type,
filter_threshold=count_filter_threshold
)
elif score_level in ['compare_guides']:
# prep counts for phenoScore calculation
df_cond_ref = adat[adat.obs.query(f'condition=="{cond_ref}"').index].to_df().T
df_cond_test = adat[adat.obs.query(f'condition=="{cond_test}"').index].to_df().T
del df_cond_ref, df_cond_test
adat_pseudo = generatePseudoGeneAnnData(adat, num_pseudogenes=num_pseudogenes, pseudogene_size=pseudogene_size, ctrl_label=ctrl_label)
if 'transcript' in var_names: adat_pseudo.var['transcript'] = 'na'
adat_test = ad.concat([adat[:,~adat.var.targetType.eq(ctrl_label)], adat_pseudo], axis=1)
adat_test.obs = adat.obs.copy()
# prep counts for phenoScore calculation
df_cond_ref = adat_test[adat_test.obs.query(f'condition=="{cond_ref}"').index].to_df().T
df_cond_test = adat_test[adat_test.obs.query(f'condition=="{cond_test}"').index].to_df().T
result = compareByTargetGroup(
adata=adat_test,
df_cond_ref=df_cond_ref,
df_cond_test=df_cond_test,
keep_top_n=keep_top_n,
var_names=var_names,
test=test,
ctrl_label=ctrl_label,
growth_rate=growth_rate,
filter_type=count_filter_type,
filter_threshold=count_filter_threshold
)
# get the best transcript as lowest p-value for each target
if collapse_var not in [False, None]:
if collapse_var not in result.columns:
raise ValueError(f'collapse_var "{collapse_var}" not found in result columns.')
else:
result = getBestTargetByTSS(
score_df=result, target_col=collapse_var, pvalue_col=f'{test} pvalue'
)
result.index.name = None
# change target name to control label if it is a pseudo gene
result['target'] = result['target'].apply(lambda x: ctrl_label if 'pseudo' in x else x).to_list()
else:
raise ValueError(f'score_level "{score_level}" not recognized. Currently, "compare_reps" and "compare_guides" are supported.')
return result_name, result