Source code for croco.Kojak

# -*- coding: utf-8 -*-

"""
Functions to read and process data generated with the Kojak cross-link
search engine.
"""

import numpy as np
import pandas as pd

if __name__ == '__main__':
    import HelperFunctions as hf
    import KojakFunctions as kj
else:
    from . import HelperFunctions as hf
    from . import KojakFunctions as kj

def Read(kojak_files, rawfile=None, decoy_string='decoy', col_order=None, compact=False):
    """
    Read Kojak results file(s), calculate and process missing values required
    for xTable and return the xTable.

    Args:
        kojak_files (list): path or paths to Kojak results file(s)
        rawfile (str): name of the corresponding rawfile
        decoy_string (optional): string used in kojak to label decoys
        col_order (list): List of xTable column titles that are used to sort
            and compress the resulting datatable
        compact (bool): Compact the xTable to only the columns given in
            col_order or not

    Returns:
        pandas.DataFrame: xtable data table

    Raises:
        ValueError: if kojak_files is an empty list
        Exception: if one of the files cannot be read; the underlying error
            is chained as the cause of the raised exception
    """

    # convert to list if the input is only a single path
    if not isinstance(kojak_files, list):
        kojak_files = [kojak_files]

    # fail early with a clear message instead of letting pd.concat raise an
    # opaque "No objects to concatenate" further down
    if not kojak_files:
        raise ValueError('[xTable Read] No Kojak file provided')

    allData = list()

    # nullable integer dtypes are used for the integer columns as these may
    # contain missing values (see na_values below)
    kojak_dtypes = {'Scan Number': pd.Int64Dtype(),
                    'Charge': pd.Int64Dtype(),
                    'Link #1': pd.Int64Dtype(),
                    'Link #2': pd.Int64Dtype(),
                    'Score': float
                    }

    for file in kojak_files:

        print('Reading Kojak-file: ' + file)

        try:
            s = pd.read_csv(hf.compatible_path(file),
                            skiprows=1,  # skip the Kojak version
                            dtype=kojak_dtypes,
                            na_values='-',
                            delimiter='\t')
        except Exception as e:
            # chain the original error so the real reason for the failure
            # stays visible in the traceback
            raise Exception('[xTable Read] Failed opening file: {}'.format(file)) from e

        allData.append(s)

    xtable = pd.concat(allData)

    # remove lines containing non-identified PSMs (marked with '-' in both
    # Link columns)
    xtable.dropna(axis=0, how='all', subset=['Link #1', 'Link #2'], inplace=True)

    # dropping lines causes fragmented index --> regenerate the index
    xtable.reset_index(drop=True, inplace=True)

    # split into multiple rows if multiple candidate proteins were found to
    # match an experimental spectrum
    xtable = hf.split_concatenated_lists(xtable, where=['Protein #1', 'Protein #2'])

    ### Process the data to comply to xTable format
    xtable = xtable.rename(columns={'Scan Number': 'scanno',
                                    'Charge': 'prec_ch',
                                    'Link #1': 'xlink1',
                                    'Link #2': 'xlink2',
                                    'Score': 'score'
                                    })

    # Extract peptide sequence, modification mass and position from the
    # Peptide #1 and Peptide #2 entries
    xtable = kj.extract_peptide(xtable)

    # transform unset xlinks to np.nan
    xtable[['xlink1', 'xlink2']] = xtable[['xlink1', 'xlink2']].replace(-1, np.nan)

    # extract protein name and relative cross-link position from the Protein #
    # entries
    xtable = kj.extract_protein(xtable)

    # calculate absolute position of first AA of peptide
    # ignoring errors avoids raising error in case on NaN -> returns NaN
    # as pos
    # Must be calculated as float as NaN is not implemented in int
    xtable['pos1'] =\
        xtable['xpos1'].astype(float, errors='ignore') - \
        xtable['xlink1'].astype(float, errors='ignore') + 1
    xtable['pos2'] =\
        xtable['xpos2'].astype(float, errors='ignore') - \
        xtable['xlink2'].astype(float, errors='ignore') + 1

    # Calculate if a cross link is of inter or of loop type
    # Refine the inter type into inter/intra/homomultimeric
    # Generate ID for the xlinks
    xtable = kj.assign_ID_and_type(xtable)

    # sets the column decoy based on whether the decoy string is present in the
    # protein name or not
    xtable = kj.set_decoy(xtable, decoy_string)

    # set the rawfile name for xtable (None if not provided by call)
    xtable['rawfile'] = rawfile

    xtable['xtype'] = np.nan

    xtable['search_engine'] = 'Kojak'

    xtable = hf.order_columns(xtable, col_order, compact)

    ### return xtable df
    return xtable
if __name__ == '__main__':
    # Manual run against a local test file: reads a single Kojak result and
    # orders the resulting xTable by the columns listed below.
    kojak_file = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\input\kojak\20180615_KS_CL_9_msconvert.kojak.txt'

    col_order = [
        # spectrum
        'rawfile', 'scanno', 'prec_ch',
        # peptides and link positions
        'pepseq1', 'xlink1',
        'pepseq2', 'xlink2',
        'xtype',
        # modifications
        'modmass1', 'modpos1', 'mod1',
        'modmass2', 'modpos2', 'mod2',
        # proteins and cross-link positions
        'prot1', 'xpos1',
        'prot2', 'xpos2',
        # classification and scoring
        'type', 'score', 'ID',
        'pos1', 'pos2',
        'decoy',
    ]

    xtable = Read(
        kojak_file,
        rawfile='20180615_KS_CL_9',
        col_order=col_order,
    )