# Source code for croco.KojakFunctions

# -*- coding: utf-8 -*-
"""
Functions that are collectively used by croco.Kojak and croco.KojakPercolator.
"""

import re
import numpy as np
import pandas as pd


if __name__ == '__main__' or __name__ == 'KojakFunctions':
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf

def _split_kojak_peptide_column(xtable, source_column, suffix):
    """
    Parse one Kojak peptide column into modmass, modpos and pepseq columns.

    The table is modified in place: the columns ``modmass<suffix>``,
    ``modpos<suffix>`` and ``pepseq<suffix>`` are created or overwritten.

    Args:
        xtable (pandas.DataFrame): table containing ``source_column``
        source_column (str): name of the column holding Kojak peptide strings
        suffix (str): suffix appended to the names of the generated columns
    """
    target_columns = ['modmass' + suffix, 'modpos' + suffix, 'pepseq' + suffix]
    peptides = xtable.loc[xtable[source_column].notnull(), source_column]

    if peptides.empty:
        # without any peptide the parsed frame would have no columns and
        # could not be assigned to the three targets, so fill them with NaN
        for column in target_columns:
            xtable[column] = np.nan
        return

    # the parsed frame carries the row labels of the non-null peptides, so
    # the values can directly be inserted at the right rows
    xtable[target_columns] = pd.DataFrame(
        peptides.apply(process_kojak_peptide).tolist(),
        index=peptides.index)


def extract_peptide(xtable):
    """
    Extract peptide sequence, modification mass and position from the
    Peptide #1 and Peptide #2 entries

    The table is modified in place and also returned. If a peptide column
    contains no entry at all, its derived columns are filled with NaN.

    Args:
        xtable (pandas.DataFrame): xTable data structure with "Peptide #1"
            and "Peptide #2" columns

    Returns:
        pandas.DataFrame: xTable with modmass, modpos, pepseq and mod
        columns for both peptides
    """
    _split_kojak_peptide_column(xtable, 'Peptide #1', '1')
    _split_kojak_peptide_column(xtable, 'Peptide #2', '2')

    def masses_to_labels(masses):
        # entries for which hf.isnan is true are passed through unchanged,
        # all others are lists of masses that become lists of strings
        return masses if hf.isnan(masses) else [str(mass) for mass in masses]

    # use the modification masses as labels
    xtable['mod1'] = xtable['modmass1'].apply(masses_to_labels)
    xtable['mod2'] = xtable['modmass2'].apply(masses_to_labels)

    return xtable
def _parse_kojak_protein_column(xtable, source_column, suffix):
    """
    Split one Kojak protein column into a name and a position column.

    The table is modified in place: ``prot<suffix>`` receives the protein
    name and ``xpos<suffix>`` the number given in brackets as nullable
    integer. Entries that do not match the pattern yield missing values.

    Args:
        xtable (pandas.DataFrame): table containing ``source_column``
        source_column (str): name of the column holding Kojak protein strings
        suffix (str): suffix appended to the names of the generated columns
    """
    # casting to object keeps the .str accessor usable even if the column
    # holds nothing but NaN and was therefore read as a float column
    parsed = xtable[source_column].astype(object).str.extract(
        r'^(\w+)(?:\((.*?)\))?;$')

    xtable['prot' + suffix] = parsed[0]
    # positions are extracted as strings; convert them to numbers first and
    # then to the nullable integer type so that missing values survive
    xtable['xpos' + suffix] = pd.to_numeric(parsed[1]).astype(pd.Int64Dtype())


def extract_protein(xtable):
    """
    Extract protein name and relative cross-link position from the Protein #
    entries

    The table is modified in place and also returned. Protein names are
    matched with ``\\w+``, i.e. they may only consist of word characters.

    Args:
        xtable (pandas.DataFrame): xTable data structure with "Protein #1",
            "Protein #2", xlink1 and xlink2 columns

    Returns:
        pandas.DataFrame: xTable with prot1, prot2 and the nullable integer
        columns xpos1 and xpos2
    """
    _parse_kojak_protein_column(xtable, 'Protein #1', '1')
    _parse_kojak_protein_column(xtable, 'Protein #2', '2')

    # xpos2 for loop links is not directly stored but can be inferred
    # from xpos1 and the two link positions
    is_loop_link = xtable['prot1'].notnull() & xtable['prot2'].isnull()
    if is_loop_link.any():
        # calculate in plain floats so that a missing operand results in a
        # missing xpos2 instead of being silently treated as zero
        operands = xtable.loc[
            is_loop_link, ['xpos1', 'xlink1', 'xlink2']].astype('float64')
        inferred = operands['xpos1'] + operands['xlink2'] - operands['xlink1']
        xtable.loc[is_loop_link, 'xpos2'] = inferred.astype(pd.Int64Dtype())

    return xtable
def assign_ID_and_type(xtable):
    """
    Calculate if a cross link is of inter or of loop type
    Refine the inter type into the categories returned by
    hf.categorize_inter_peptides
    Generate ID for the xlinks

    The table is modified in place and also returned.

    Args:
        xtable (pandas.DataFrame): Table data structure with the columns
            prot1, prot2, xlink1, xlink2, pos1, pos2, pepseq1, pepseq2,
            xpos1 and xpos2

    Returns:
        pandas.DataFrame: xTable with type and ID
    """
    has_prot2 = xtable['prot2'].notnull()
    has_xlink1 = xtable['xlink1'].notnull()
    has_xlink2 = xtable['xlink2'].notnull()

    # assign categories of cross-links based on identification of prot1 and
    # prot2; the order matters as later assignments overwrite earlier ones
    xtable.loc[has_prot2, 'type'] = 'inter'
    xtable.loc[~has_prot2 & has_xlink2, 'type'] = 'loop'
    # Kojak does not generate monolinked peptides but peptides modified
    # with the hydrolysed xlinker mass
    xtable.loc[~has_xlink1 & ~has_xlink2, 'type'] = 'linear or mono'

    # Reassign the type for inter xlink to inter/intra/homomultimeric
    is_inter_link = xtable['type'] == 'inter'

    # only perform if the selection is not all false
    if is_inter_link.any():
        inter_rows = xtable.loc[is_inter_link]
        categorize = np.vectorize(hf.categorize_inter_peptides)
        # NOTE(review): pos1 and pos2 are read here while the ID below uses
        # xpos1 and xpos2 - confirm that both sets of columns are intended
        xtable.loc[is_inter_link, 'type'] = categorize(
            inter_rows['prot1'], inter_rows['pos1'], inter_rows['pepseq1'],
            inter_rows['prot2'], inter_rows['pos2'], inter_rows['pepseq2'])

    # only keep the ID for those lines whose type could be identified
    type_identified = xtable['type'].notna()

    # generate an ID for every crosslink position within the protein(s);
    # the ID is computed for all rows and the string 'nan' is mapped back
    # to a missing value
    generate = np.vectorize(hf.generate_id, otypes=['object'])
    all_ids = pd.Series(
        generate(xtable['type'], xtable['prot1'], xtable['xpos1'],
                 xtable['prot2'], xtable['xpos2']),
        index=xtable.index).replace('nan', np.nan)
    xtable.loc[type_identified, 'ID'] = all_ids

    return xtable
def set_decoy(xtable, decoy_string):
    """
    sets the column decoy based on whether the decoy string is present in the
    protein name or not

    The table is modified in place and also returned. Rows without a protein
    name are never marked as decoy. The decoy string is handed to
    pandas.Series.str.contains with its default settings, i.e. it is
    interpreted as a regular expression.

    Args:
        xtable (pandas.DataFrame): xTable with "prot1" and "prot2" columns
        decoy_string (str): Kojak decoy string

    Returns:
        pandas.DataFrame: xTable with boolean decoy column
    """
    # na=False makes missing protein names count as "not a decoy"; without
    # it they yield NaN, which is truthy when converted to a boolean
    is_decoy = xtable['prot1'].str.contains(decoy_string, na=False)

    # prot2 may be entirely null (e.g. in tables containing only loop links)
    # and is skipped in that case
    if not xtable['prot2'].isnull().all():
        # set a decoy indicator where at least one protein is reversed
        is_decoy = is_decoy | xtable['prot2'].str.contains(decoy_string,
                                                           na=False)

    xtable['decoy'] = is_decoy

    return xtable
def process_kojak_peptide(peptide_string):
    """
    Return Modifications, their localisation and the peptide sequence
    from a Kojak sequence string such as M[15.99]TDSKYFTTNK.

    Modification masses are given in square brackets directly after the
    modified residue. The position of a modification is the number of
    sequence letters read before the bracket, i.e. it is 1-based.
    Characters outside of brackets that are not letters are ignored. An
    opening bracket that is never closed is dropped silently.

    If no modifications are found within a peptide string, the function
    returns np.nan for the masses, an empty list for the positions and the
    sequence.

    Args:
        peptide_string (str): a Kojak peptide string

    Returns:
        list of float or np.nan: list of modification masses
        list of int: list of modification positions within the peptide
        str: peptide sequence without modifications

    Raises:
        ValueError: if a closing bracket has no matching opening bracket or
            if the text inside a bracket pair is not a number
    """
    modmasses = []
    modposns = []
    residues = []
    # characters of the modification currently being read; None while the
    # parser is outside of a bracket pair
    mod_chars = None

    for char in peptide_string:
        if char == '[':
            mod_chars = []
        elif char == ']':
            if mod_chars is None:
                raise ValueError(
                    'Unmatched "]" in Kojak peptide string: {!r}'.format(
                        peptide_string))
            modmasses.append(float(''.join(mod_chars)))
            # the modification belongs to the last residue read so far
            modposns.append(len(residues))
            mod_chars = None
        elif mod_chars is not None:
            mod_chars.append(char)
        elif char.isalpha():
            residues.append(char)

    if not modmasses:
        modmasses = np.nan

    return modmasses, modposns, ''.join(residues)
# def process_kojak_protein(protein_string):
#     """
#     Return protein name and absolute cross-link position from
#     a kojak string such as
#     sp|P07340|AT1B1_RAT Sodium/potassium-transporting ATPase subunit beta-1 OS=Rattus norvegicus GN=Atp1(13);
#
#     Args:
#         protein_string(str): a kojak protein string
#
#     Returns:
#         str or np.nan: protein name
#         int or np.nan: position
#     """
#     # RE: group1: everything until the first (lazy) brackets
#     # group2 (optional) everything inside the brackets
#     pattern = re.compile('^([^\(]+?)(?:\((\d*)\))?;')
#     if pattern.match(protein_string):
#         match = pattern.match(protein_string)
#         prot, xpos = match.groups()
#         if xpos == None: # re.match returns None (not NaN) if a substring doesn't match
#             return prot, np.nan
#         else:
#             return prot, int(xpos)
#     else:
#         return np.nan, np.nan