Source code for croco.KojakPercolator

# -*- coding: utf-8 -*-

"""
Functions to read Percolator processed Kojak data.
"""

import numpy as np
import pandas as pd

if __name__ == '__main__':
    import HelperFunctions as hf
    import KojakFunctions as kj
else:
    from . import HelperFunctions as hf
    from . import KojakFunctions as kj

def _kojak_path_from_percolator_input(unperc_file, percolator_string):
    """Derive the Kojak result path (``*.kojak.txt``) from a Percolator input path."""
    cut = unperc_file.find(percolator_string)
    if cut == -1:
        # str.find signals "not found" with -1; slicing with it would
        # silently chop the last character and yield a bogus file name.
        raise Exception(
            "Could not derive the Kojak file name: '%s' does not contain "
            "the percolator string '%s'." % (unperc_file, percolator_string))
    return unperc_file[:cut] + '.kojak.txt'


def Read(perc_files,
         rawfile=None,
         validated_string='.validated',
         percolator_string='.perc',
         decoy_string='decoy',
         compact=False,
         col_order=None):
    """
    Collects unprocessed and percolated results and returns an xtable data
    array.

    For every percolated file three tables are read and left-joined: the
    Percolator output, the Percolator input (same path with
    ``validated_string`` removed) and the Kojak result file (Percolator input
    path truncated at ``percolator_string`` plus ``.kojak.txt``).

    Args:
        perc_files (str or list): path or list/tuple of paths to percolated
            Kojak file(s)
        rawfile (str): name of the corresponding rawfile
        validated_string (str): user-defined string appended to the
            percolated filenames
        percolator_string (str): user-defined string appended to the file
            prepared for percolating
        decoy_string (optional): string used in kojak to label decoys
        compact (bool): Whether to compact the xTable to only those columns
            listed in col_order
        col_order (list): List of xTable column titles that are used to sort
            and compress the resulting datatable

    Returns:
        pandas.DataFrame: xtable data table

    Raises:
        ValueError: if no percolator file is provided.
        Exception: if one of the required files cannot be found, if a
            percolated file contains no rows, or if the Kojak file name
            cannot be derived because ``percolator_string`` is missing from
            the Percolator input path.
    """

    # Accept a single path as well as a list/tuple of paths
    if isinstance(perc_files, (list, tuple)):
        perc_files = list(perc_files)
    else:
        perc_files = [perc_files]

    # Fail early with a clear message instead of letting pd.concat complain
    # about having nothing to concatenate further below.
    if not perc_files:
        raise ValueError("No percolator files were provided to Read().")

    allData = list()

    # Nullable integer dtypes so that missing values survive the import
    kojak_dtypes = {'Scan Number': pd.Int64Dtype(),
                    'Charge': pd.Int64Dtype(),
                    'Link #1': pd.Int64Dtype(),
                    'Link #2': pd.Int64Dtype(),
                    'Score': float
                    }

    for p_file in perc_files:
        ### Collect data and convert to pandas format

        print('[Kojak Perc Read] Reading Percolator-file: ' + p_file)

        try:
            percolated = pd.read_csv(hf.compatible_path(p_file),
                                     delimiter='\t',
                                     usecols=range(5),
                                     index_col=False,  # avoid taking the first col as index
                                     engine='python')
        except FileNotFoundError as err:
            raise Exception("Could not find the percolated file %s."
                            % p_file) from err

        if len(percolated) == 0:
            raise Exception("The file {} seems to be empty and cannot be converted".format(p_file))

        percolated.rename(columns={'PSMId': 'SpecId'}, inplace=True)

        # The Percolator input shares the name of the percolated file minus
        # the validated-suffix
        unperc_file = p_file.replace(validated_string, '')

        print('[Kojak Perc Read] Reading Percolator input: ' + unperc_file)

        try:
            unpercolated = pd.read_csv(hf.compatible_path(unperc_file),
                                       delimiter='\t',
                                       usecols=range(10),
                                       engine='python',
                                       index_col=False)
        except FileNotFoundError as err:
            raise Exception("Could not find the unpercolated file %s. Please move it into the same directory as the percolator files!" % unperc_file) from err

        # Merge with left join (only keys that are in the percolated DF will
        # be retained)
        xtable = pd.merge(percolated, unpercolated, on='SpecId', how='left')

        xtable = xtable.rename(columns={'score': 'percolator_score'})

        # Reading the Kojak-file is required to get additional information on
        # the matches such as the corresponding protein names
        kojak_file = _kojak_path_from_percolator_input(unperc_file,
                                                       percolator_string)

        print('Reading Kojak-file: ' + kojak_file)

        try:
            kojak = pd.read_csv(hf.compatible_path(kojak_file),
                                skiprows=1,  # skip the Kojak version
                                dtype=kojak_dtypes,
                                na_values='-',
                                delimiter='\t')
        except FileNotFoundError as err:
            raise Exception("Could not find the kojak_file %s. Please move it into the same directory as the percolator files!"
                            % kojak_file) from err

        kojak.rename(columns={'Scan Number': 'scannr'}, inplace=True)

        s = pd.merge(xtable,
                     kojak,
                     on=['scannr', 'Charge', 'dScore', 'Score'],
                     how='left')

        allData.append(s)

    xtable = pd.concat(allData, sort=False)

    # split ambiguous concatenated protein names
    xtable = hf.split_concatenated_lists(xtable,
                                         where=['Protein #1', 'Protein #2'])

    print('[Kojak Perc Read] Splitted concatenated lists')

    ### Process the data to comply to xTable format
    xtable = xtable.rename(columns={'scannr': 'scanno',
                                    'Charge': 'prec_ch',
                                    'Link #1': 'xlink1',
                                    'Link #2': 'xlink2',
                                    'Score': 'score'
                                    })

    print('[Kojak Perc Read] Renamed columns')

    # Extract peptide sequence, modification mass and position from the
    # Peptide #1 and Peptide #2 entries
    xtable = kj.extract_peptide(xtable)

    print('[Kojak Perc Read] Extracted peptides')

    # transform unset xlinks to np.nan
    xtable[['xlink1', 'xlink2']] = xtable[['xlink1', 'xlink2']].replace(-1, np.nan)

    # extract protein name and relative cross-link position from the Protein #
    # entries
    xtable = kj.extract_protein(xtable)

    print('[Kojak Perc Read] Extracted Proteins')

    # calculate absolute position of first AA of peptide
    # ignoring errors avoids raising error in case on NaN -> returns NaN
    # as pos
    # Must be calculated as float as NaN is not implemented in int
    xtable['pos1'] =\
        xtable['xpos1'].astype(float, errors='ignore') - \
        xtable['xlink1'].astype(float, errors='ignore') + 1
    xtable['pos2'] =\
        xtable['xpos2'].astype(float, errors='ignore') - \
        xtable['xlink2'].astype(float, errors='ignore') + 1

    # Calculate if a cross link is of inter or of loop type
    # Refine the inter type into inter/intra/homomultimeric
    # Generate ID for the xlinks
    xtable = kj.assign_ID_and_type(xtable)

    print('[Kojak Perc Read] Calculated Positions and assigned IDs')

    # sets the column decoy based on whether the decoy string is present in
    # the protein name or not
    xtable = kj.set_decoy(xtable, decoy_string)

    # set the rawfile name for xtable (None if not provided by call)
    xtable['rawfile'] = rawfile

    xtable['xtype'] = np.nan

    xtable['search_engine'] = 'Kojak and Percolator'

    xtable = hf.order_columns(xtable, col_order, compact)

    return xtable
if __name__ == '__main__':
    # Percolator result files used when this module is run as a script.
    # The loop-link file is currently disabled.
    perc_file = [
        r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\kojak_perc\20180615_KS_CL_9_msconvert.perc.intra.validated.txt',
        #r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\kojak_perc\20180615_KS_CL_9_msconvert.perc.loop.validated.txt',
        r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\kojak_perc\20180615_KS_CL_9_msconvert.perc.single.validated.txt',
    ]

    # Column headers required for xtable output.
    # NOTE(review): col_order is defined here but is not passed to Read()
    # below -- confirm whether that is intended.
    col_order = [
        'rawfile', 'scanno', 'prec_ch',
        'pepseq1', 'xlink1',
        'pepseq2', 'xlink2', 'xtype',
        'modmass1', 'modpos1', 'mod1',
        'modmass2', 'modpos2', 'mod2',
        'prot1', 'xpos1', 'prot2',
        'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy',
    ]

    xtable = Read(perc_file)