Source code for screenpro.load

## Copyright (c) 2022-2024 ScreenPro2 Development Team.
## All rights reserved.
## Gilbert Lab, UCSF / Arc Institute.
## Multi-Omics Tech Center, Arc Institute.

"""Load module

Functions to load screen datasets and sgRNA library tables.
"""

import pickle
import pandas as pd


def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, protospacer_length=19, verbose=True, **args):
    """Load a Cas9 sgRNA library table for a single- or dual-guide design.

    The table is read with ``pandas.read_csv``, alternative column names are
    normalised, protospacers are upper-cased and trimmed to
    ``protospacer_length``, and only the columns needed downstream are kept.

    Parameters:
        library_path: path (or file-like object) passed to ``pandas.read_csv``.
        library_type (str): ``'single_guide_design'`` or ``'dual_guide_design'``.
        sep (str): field separator passed to ``pandas.read_csv``.
        index_col: index column(s) passed to ``pandas.read_csv``.
        protospacer_length (int): expected protospacer length; longer
            protospacers are trimmed at their 5' end to this length.
        verbose (bool): print progress messages.
        **args: extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: for a single-guide design the columns are
        ``target, sgID, protospacer, sequence``; for a dual-guide design they
        are ``target, sgID_AB, sgID_A, protospacer_A, sgID_B, protospacer_B,
        sequence``. ``sequence`` is always regenerated from the (trimmed)
        protospacer column(s).

    Raises:
        ValueError: if ``library_type`` is not recognised, a required column
            is missing, protospacer lengths are not uniform, or protospacers
            are shorter than ``protospacer_length``.
    """
    library = pd.read_csv(library_path, sep=sep, index_col=index_col, **args)

    # TODO: Add option to keep sublibrary column!
    # Collect the column renames for the requested design; they are applied
    # in one non-mutating ``rename`` call below.
    renames = {}
    if 'gene' in library.columns:
        # downstream code expects the target column to be named 'target'
        renames['gene'] = 'target'

    if library_type == "single_guide_design":
        eval_columns = ['target', 'sgID', 'protospacer', 'sequence']
        protospacer_cols = ['protospacer']
        if 'sequence' in library.columns and 'protospacer' not in library.columns:
            renames['sequence'] = 'protospacer'
        if 'sgId' in library.columns:
            renames['sgId'] = 'sgID'
    elif library_type == "dual_guide_design":
        eval_columns = [
            'target',
            'sgID_AB',
            'sgID_A', 'protospacer_A',
            'sgID_B', 'protospacer_B',
            'sequence',
        ]
        protospacer_cols = ['protospacer_A', 'protospacer_B']
    else:
        raise ValueError(f"Invalid library type: {library_type}. Please choose 'single_guide_design' or 'dual_guide_design'.")

    library = library.rename(columns=renames)

    # Validate *before* touching any column so that a missing column yields
    # the descriptive ValueError rather than a bare pandas KeyError.
    # 'sequence' is excluded because it is generated below.
    for col in eval_columns:
        if col != 'sequence' and col not in library.columns:
            raise ValueError(f"Column '{col}' not found in library table.")

    # TODO: Enable trimming of protospacer sequences through command line arguments.
    for protospacer_col in protospacer_cols:
        # Upper case protospacer sequences
        library[protospacer_col] = library[protospacer_col].str.upper()
        library = _fit_protospacer_length(
            library, protospacer_col, protospacer_length, verbose
        )

    # Write `sequence` from the protospacer column(s) after trimming:
    # `protospacer` for single guides, `protospacer_A;protospacer_B` for dual.
    sequence = library[protospacer_cols[0]]
    for protospacer_col in protospacer_cols[1:]:
        sequence = sequence + ';' + library[protospacer_col]
    library['sequence'] = sequence

    library = library[eval_columns]

    if verbose:
        print("Library table successfully loaded.")

    return library


def _fit_protospacer_length(library, protospacer_col, protospacer_length, verbose):
    """Trim one protospacer column at the 5' end to ``protospacer_length``.

    Raises ValueError when the column's protospacers are shorter than
    ``protospacer_length``; returns the (possibly trimmed) library table.
    """
    in_length = _check_protospacer_length(library, protospacer_col)

    if in_length < protospacer_length:
        raise ValueError(
            f"Input protospacer length for '{protospacer_col}' is less than {protospacer_length}"
        )

    if in_length > protospacer_length:
        if verbose:
            print(f"Trimming protospacer sequences in '{protospacer_col}' column.")
        library = _trim_protospacer(
            library, protospacer_col,
            '5prime',
            in_length - protospacer_length
        )

    return library
def loadScreenProcessingData(experimentName, collapsedToTranscripts=True, premergedCounts=False):
    """Read the tab-separated tables written by ScreenProcessing.

    See the original implementation
    `here <https://github.com/mhorlbeck/ScreenProcessing/blob/master/screen_analysis.py#L70>`__.

    Files always read (``experimentName`` is used as the path prefix):
        * `*_librarytable.txt` => library table
        * `*_mergedcountstable.txt` => merged counts table
        * `*_phenotypetable.txt` => phenotype table

    Files read depending on the flags:
        * `*_rawcountstable.txt` when ``premergedCounts`` is true
        * `*_genetable.txt` and `*_genetable_collapsed.txt` when
          ``collapsedToTranscripts`` is true, otherwise only `*_genetable.txt`

    Parameters:
        experimentName (str): name of the experiment
        collapsedToTranscripts (bool): whether the gene scores are collapsed to transcripts
        premergedCounts (bool): whether the counts are premerged

    Returns:
        dict: dictionary of dataframes keyed by ``'library'``, ``'counts'``,
        ``'phenotypes'``, ``'gene scores'`` and, depending on the flags,
        ``'premerged counts'`` and ``'transcript scores'``.
    """
    def read_table(suffix, n_header_rows, n_index_cols):
        # Tables with multi-row headers: the first rows form the column
        # levels and the first columns form the row index.
        return pd.read_csv(
            experimentName + suffix,
            sep='\t',
            header=list(range(n_header_rows)),
            index_col=list(range(n_index_cols)),
        )

    tables = {}

    # The library table is the only one with a plain single-row header.
    tables['library'] = pd.read_csv(
        experimentName + '_librarytable.txt',
        sep='\t',
        header=0,
        index_col=0,
    )
    tables['counts'] = read_table('_mergedcountstable.txt', 2, 1)
    tables['phenotypes'] = read_table('_phenotypetable.txt', 2, 1)

    if premergedCounts:
        tables['premerged counts'] = read_table('_rawcountstable.txt', 3, 1)

    if not collapsedToTranscripts:
        # gene table is already at gene level
        tables['gene scores'] = read_table('_genetable.txt', 3, 1)
        return tables

    # gene table holds transcript-level scores; the collapsed table holds genes
    tables['transcript scores'] = read_table('_genetable.txt', 3, 2)
    tables['gene scores'] = read_table('_genetable_collapsed.txt', 3, 1)
    return tables
def _check_protospacer_length(library, protospacer_col):
    """Return the single protospacer length shared by every row of a column.

    Parameters:
        library (pandas.DataFrame): library table
        protospacer_col (str): name of the column holding protospacer strings

    Returns:
        int: the common protospacer length

    Raises:
        ValueError: if the table has no rows, the column has missing values,
            or the protospacer lengths are not uniform.
    """
    column = library[protospacer_col]

    if len(column) == 0:
        # previously surfaced as an opaque IndexError on `lengths[0]`
        raise ValueError(
            f"Cannot determine protospacer length: column '{protospacer_col}' is empty"
        )

    per_row = column.str.len()
    if per_row.isna().any():
        raise ValueError(
            f"Column '{protospacer_col}' contains missing or non-string protospacers"
        )

    # sorted so the error message is deterministic
    lengths = sorted(int(n) for n in per_row.unique())
    if len(lengths) > 1:
        raise ValueError(f"Protospacer lengths are not uniform: {lengths}")

    return lengths[0]


def _trim_protospacer(library, protospacer_col, trim_side, trim_len):
    """Remove ``trim_len`` bases from one end of a protospacer column.

    The column is overwritten in ``library`` (the table is modified in place
    and also returned) and the result is upper-cased.

    Parameters:
        library (pandas.DataFrame): library table
        protospacer_col (str): name of the column to trim
        trim_side (str): ``'5prime'`` drops leading bases, ``'3prime'`` drops
            trailing bases
        trim_len (int): number of bases to remove

    Returns:
        pandas.DataFrame: the same ``library`` object with the trimmed column

    Raises:
        ValueError: if ``trim_side`` is neither ``'5prime'`` nor ``'3prime'``.
    """
    if trim_side == '5prime':
        trimmed = library[protospacer_col].str[trim_len:]
    elif trim_side == '3prime':
        # `[:-0]` is `[:0]` and would blank every sequence, so a zero-length
        # trim must use an open-ended slice instead.
        stop = -trim_len if trim_len else None
        trimmed = library[protospacer_col].str[:stop]
    else:
        raise ValueError(
            f"Invalid trim side: {trim_side}. Please choose '5prime' or '3prime'."
        )

    library[protospacer_col] = trimmed.str.upper()

    return library


def _write_screen_pkl(screen, name):
    """Write a ScreenPro object to a pickle file.

    Parameters:
        screen (object): ScreenPro object to save
        name (str): name of the output file (.pkl extension will be added)
    """
    file_name = f'{name}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(screen, file)
    print(f'Object successfully saved to "{file_name}"')


def _read_screen_pkl(name):
    """Read a ScreenPro object from a pickle file.

    Parameters:
        name (str): name of the input file (.pkl extension will be added)

    Returns:
        object: the unpickled object
    """
    file_name = f'{name}.pkl'
    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source.
    with open(file_name, 'rb') as f:
        screen = pickle.load(f)
    return screen