Source code for croco.KojakPercolator

# -*- coding: utf-8 -*-

"""
Functions to read Percolator processed Kojak data.
"""

import numpy as np
import pandas as pd

if __name__ == '__main__':
    import HelperFunctions as hf
    import KojakFunctions as kj
else:
    from . import HelperFunctions as hf
    from . import KojakFunctions as kj

def Read(perc_files, rawfile=None, validated_string='.validated', percolator_string='.perc', decoy_string='decoy', compact=False, col_order=None):
    """
    Collect unprocessed and percolated Kojak results and return an xTable.

    For every percolated file two companion files are derived from its path:
    the Percolator input file (the path with ``validated_string`` removed) and
    the Kojak result file (the Percolator input path cut at the first
    occurrence of ``percolator_string`` with ``.kojak.txt`` appended). The
    three tables are merged per file and the merged tables of all files are
    concatenated before being converted to the xTable layout.

    Args:
        perc_files (str or list of str): path or list/tuple of paths to
            percolated Kojak file(s)
        rawfile (str): name of the corresponding rawfile
        validated_string (str): user-defined string appended to the percolated
            filenames
        percolator_string (str): user-defined string appended to the file
            prepared for percolating
        decoy_string (optional): string used in kojak to label decoys
        compact (bool): Whether to compact the xTable to only those columns
            listed in col_order
        col_order (list): List of xTable column titles that are used to sort
            and compress the resulting datatable

    Returns:
        pandas.DataFrame: xtable data table

    Raises:
        ValueError: if no input file was given or if ``percolator_string``
            does not occur in the derived Percolator input path
        Exception: if one of the three input files cannot be found or if a
            percolated file contains no rows
    """
    # accept a single path as well as a list or tuple of paths
    if not isinstance(perc_files, (list, tuple)):
        perc_files = [perc_files]

    # fail early with a clear message instead of deep inside pd.concat
    if len(perc_files) == 0:
        raise ValueError("No percolated Kojak file was provided")

    allData = list()

    kojak_dtypes = {'Scan Number': pd.Int64Dtype(),
                    'Charge': pd.Int64Dtype(),
                    'Link #1': pd.Int64Dtype(),
                    'Link #2': pd.Int64Dtype(),
                    'Score': float
                    }

    for p_file in perc_files:
        # Derive the companion paths before any file is read so that a wrong
        # suffix setting is reported immediately
        unperc_file = p_file.replace(validated_string, '')

        # str.find returns -1 if the string is missing; slicing with that
        # value would silently drop the last character of the path
        perc_idx = unperc_file.find(percolator_string)
        if perc_idx == -1:
            raise ValueError(
                "Could not find the percolator string '{}' in {}; cannot "
                "derive the name of the Kojak file".format(percolator_string,
                                                           unperc_file))
        kojak_file = unperc_file[:perc_idx] + '.kojak.txt'

        ### Collect data and convert to pandas format

        print('[Kojak Perc Read] Reading Percolator-file: ' + p_file)

        try:
            percolated = pd.read_csv(hf.compatible_path(p_file),
                                     delimiter='\t',
                                     usecols=range(5),
                                     index_col=False, # avoid taking the first col as index
                                     engine='python')
        except FileNotFoundError as err:
            raise Exception("Could not find the percolated file %s." % p_file) from err

        if len(percolated) == 0:
            raise Exception("The file {} seems to be empty and cannot be converted".format(p_file))

        percolated.rename(columns={'PSMId': 'SpecId'}, inplace=True)

        print('[Kojak Perc Read] Reading Percolator input: ' + unperc_file)

        try:
            unpercolated = pd.read_csv(hf.compatible_path(unperc_file),
                                       delimiter='\t',
                                       usecols=range(10),
                                       engine='python',
                                       index_col=False)
        except FileNotFoundError as err:
            raise Exception("Could not find the unpercolated file %s. Please move it into the same directory as the percolator files!" % unperc_file) from err

        # Merge with left join (only keys that are in the percolated DF will
        # be retained)
        xtable = pd.merge(percolated, unpercolated, on='SpecId', how='left')

        xtable = xtable.rename(columns={'score': 'percolator_score'})

        # Reading the Kojak-file is required to get additional information on the
        # matches such as the corresponding protein names
        print('Reading Kojak-file: ' + kojak_file)

        try:
            kojak = pd.read_csv(hf.compatible_path(kojak_file),
                                skiprows=1, # skip the Kojak version
                                dtype=kojak_dtypes,
                                na_values='-',
                                delimiter='\t')
        except FileNotFoundError as err:
            raise Exception("Could not find the kojak_file %s. Please move it into the same directory as the percolator files!" % kojak_file) from err

        kojak.rename(columns={'Scan Number': 'scannr'}, inplace=True)

        s = pd.merge(xtable, kojak, on=['scannr', 'Charge', 'dScore', 'Score'], how='left')

        allData.append(s)

    xtable = pd.concat(allData, sort=False)

    # split ambiguous concatenated protein names
    xtable = hf.split_concatenated_lists(xtable, where=['Protein #1', 'Protein #2'])

    print('[Kojak Perc Read] Splitted concatenated lists')

    ### Process the data to comply to xTable format
    xtable = xtable.rename(columns={'scannr': 'scanno',
                                    'Charge': 'prec_ch',
                                    'Link #1': 'xlink1',
                                    'Link #2': 'xlink2',
                                    'Score': 'score'
                                    })

    print('[Kojak Perc Read] Renamed columns')

    # Extract peptide sequence, modification mass and position from the
    # Peptide #1 and Peptide #2 entries
    xtable = kj.extract_peptide(xtable)

    print('[Kojak Perc Read] Extracted peptides')

    # transform unset xlinks to np.nan
    xtable[['xlink1', 'xlink2']] = xtable[['xlink1', 'xlink2']].replace(-1, np.nan)

    # extract protein name and relative cross-link position from the Protein #
    # entries
    xtable = kj.extract_protein(xtable)

    print('[Kojak Perc Read] Extracted Proteins')

    # calculate absolute position of first AA of peptide
    # ignoring errors avoids raising error in case on NaN -> returns NaN
    # as pos
    # Must be calculated as float as NaN is not implemented in int
    xtable['pos1'] =\
        xtable['xpos1'].astype(float, errors='ignore') - \
        xtable['xlink1'].astype(float, errors='ignore') + 1
    xtable['pos2'] =\
        xtable['xpos2'].astype(float, errors='ignore') - \
        xtable['xlink2'].astype(float, errors='ignore') + 1

    # Calculate if a cross link is of inter or of loop type
    # Refine the inter type into inter/intra/homomultimeric
    # Generate ID for the xlinks
    xtable = kj.assign_ID_and_type(xtable)

    print('[Kojak Perc Read] Calculated Positions and assigned IDs')

    # sets the column decoy based on whether the decoy string is present in the
    # protein name or not
    xtable = kj.set_decoy(xtable, decoy_string)

    # set the rawfile name for xtable (None if not provided by call)
    xtable['rawfile'] = rawfile

    xtable['xtype'] = np.nan

    xtable['search_engine'] = 'Kojak and Percolator'

    xtable = hf.order_columns(xtable, col_order, compact)

    return xtable

if __name__ == '__main__':
    # Manual test run against local example data.

    # Column headers required for xtable output.
    # NOTE(review): this list is defined but not handed to Read() below.
    col_order = [
        'rawfile', 'scanno', 'prec_ch',
        'pepseq1', 'xlink1',
        'pepseq2', 'xlink2', 'xtype',
        'modmass1', 'modpos1', 'mod1',
        'modmass2', 'modpos2', 'mod2',
        'prot1', 'xpos1', 'prot2',
        'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy',
    ]

    intra_validated = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\kojak_perc\20180615_KS_CL_9_msconvert.perc.intra.validated.txt'
    # The matching loop-link file is currently excluded from the run:
    # r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\kojak_perc\20180615_KS_CL_9_msconvert.perc.loop.validated.txt'
    single_validated = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\kojak_perc\20180615_KS_CL_9_msconvert.perc.single.validated.txt'

    validated_files = [intra_validated, single_validated]

    xtable = Read(validated_files)