# -*- coding: utf-8 -*-
"""
Functions to read StavroX processed crosslink data.
"""
import numpy as np
import pandas as pd
import os, re
if __name__ == '__main__':
import HelperFunctions as hf
else:
from . import HelperFunctions as hf
def _type_from_proteins(protein1_string, protein2_string):
"""
Decide whether its a mono, loop, intra or inter-protein link
based on the fields Protein 1 and Protein 2 of a StavroX results
table
Args:
protein1_string (str): A Protein1 string from StavroX
protein2_string (str): A Protein2 string from StavroX
Returns:
str: type of cross-link (mono, loop, intra, inter)
"""
if protein2_string == 'H2O':
return 'mono'
elif protein2_string == 'intrapeptidal':
return 'loop'
elif protein1_string == protein2_string:
return 'intra'
else:
return 'inter'
def _clear_protname(protname_string):
"""
Clear fasta-header and return the cleaned string. Also remove StavroX
specific protein2-terms like H20 and intrapeptidal
Args:
protname_string (str): A StavroX-Protein string
Returns:
str: cleaned protein string (my be empty string)
"""
if protname_string.startswith('>'):
protname_string = protname_string.split(' ')[0]
return protname_string[1:]
elif protname_string == 'H2O':
return ''
elif protname_string == 'intrapeptidal':
return ''
else:
return protname_string
def _clear_xlink(xlink_string):
"""
Removes all non-numerical characters from a string.
Args:
xlink_string (str): Contains the xlink position with the linked AA
Returns:
int: the xlink position without the amino acid label
"""
xlink_string = re.sub('\D+', '', xlink_string)
return int(xlink_string)
def _calc_xpos2(t, pos1, pos2, xlink2):
"""
Calculate the absolute position of a beta-peptide cross-link
based on what type the xlink is
Args:
t(str): type of the crosslink (mono, loop, inter/intra/homomultimeric)
pos1(int): absolute position of the first AA of the alpha-peptide
pos2(int): absolute position of the first AA of the beta-peptide
xlink2(int): relative position of the cross-link in the beta-peptide
Returns:
int: absolute position of the cross-link in the beta-peptide
"""
try:
if t == 'mono':
return np.nan
elif t == 'loop':
return int(pos1) + int(xlink2) - 1
else:
return int(pos2) + int(xlink2) - 1
except:
return np.nan
def _mods_and_sequences_from_peptides(peptide1, peptide2, mod_dict):
"""
Extract modification position and name from two StavroX peptide strings.
Set peptide2 to np.nan for monolinks and to peptide1 for loop-links
Args:
peptide1 (str): Entry of the StavroX Peptide 1 column
peptide2 (str): Entry of the StavroX Peptide 2 column
mod_dict (dict): dict mapping modification abbreviations to lists of [Modified AA, Modification name, Modification mass]
Returns:
list: name of the modification(s) of peptide1
list: position of the modification(s) of peptide1
list: mass(es) of the modification(s) of peptide1
list: name of the modification(s) of peptide2
list: position of the modification(s) of peptide2
list: mass(es) of the modification(s) of peptide2
"""
if str(peptide2) == '0': # mono-link
peptide2 = np.nan
(mod1, modpos1, modmass1, sequence1), (mod2, modpos2, modmass2, sequence2) =\
_calculate_mod_modpos_modmass(peptide1), ([], [], [])
elif str(peptide2) == '1': # loop-link
(mod1, modpos1, modmass1, sequence1), (mod2, modpos2, modmass2, sequence2) =\
_calculate_mod_modpos_modmass(peptide1, mod_dict), _calculate_mod_modpos_modmass(peptide1, mod_dict)
else: # regular inter-peptide link
(mod1, modpos1, modmass1, sequence1), (mod2, modpos2, modmass2, sequence2) =\
_calculate_mod_modpos_modmass(peptide1, mod_dict), _calculate_mod_modpos_modmass(peptide2, mod_dict)
return mod1, modpos1, modmass1, sequence1, mod2, modpos2, modmass2, sequence2
def _calculate_mod_modpos_modmass(peptide_string, mod_dict):
"""
Extraction modification, name of the modification and modification mass
from a StavroX peptide string
Args:
peptide_string (str): Entry of the StavroX Peptide 1/2 column
mod_dict (dict): dict mapping modification abbreviations to lists of [Modified AA, Modification name, Modification mass]
Returns:
list: name of the modification(s)
list: position of the modification(s)
list: mass(es) of the modification(s)
str: sequence without modification(s)
"""
# if there is no peptide, there are no modifications
try:
a_float = float(peptide_string)
if np.isnan(a_float):
return [], [], []
else:
raise Exception('Unusual peptide string recognised: {}'.format(a_float))
except:
modpos = []
modmass= []
mod = []
sequence = ''
for idx, char in enumerate(peptide_string):
if char in mod_dict.keys():
# avoid setting N- and C-terminal ends as modification
mod.append(mod_dict[char][1])
modmass.append(mod_dict[char][2])
modpos.append(idx)
char = mod_dict[char][0]
if char in '[]{}':
continue
sequence += char
return mod, modpos, modmass, sequence
def _parse_ssf(ssf_file):
"""
Parses a StavroX properties.ssf file to extract modification identifiers
and masses
Args:
ssf_file (str): path to the properties.ssf file from StavroX
Returns:
dict: Dict mapping modification identifiers a list of [modified AA, name of modification, modmass]
"""
ssf_dict = {}
current_list = None
with open(ssf_file, 'r') as f:
line = f.readline()
while line:
if 'ELEMENTS' in line:
current_list = 'elements'
ssf_dict[current_list] = []
elif 'AMINOACIDS' in line:
current_list = 'AAs'
ssf_dict[current_list] = []
elif 'VARMODIFICATION' in line:
current_list = 'varmods'
ssf_dict[current_list] = []
elif 'STATMODIFICATION' in line:
current_list = 'fixedmods'
ssf_dict[current_list] = []
elif 'END' in line:
current_list = None
elif current_list != None:
ssf_dict[current_list].append(line.strip().split(';'))
line = f.readline()
elements2mass = dict((x, y) for x, y in ssf_dict['elements'])
AA2formula = dict((y,z) for x,y,z in ssf_dict['AAs'])
AA2name = dict((y,x) for x,y,z in ssf_dict['AAs'])
mod_dict = dict()
for x,y,z in ssf_dict['varmods']:
mod_dict[y] = [x,]
for x,y in ssf_dict['fixedmods']:
mod_dict[y] = [x,]
for symbol in mod_dict.keys():
modformula = AA2formula[symbol]
elements, stoichiometries = _elemental_composition(modformula, elements2mass)
modmass = 0
for element, amount in zip(elements, stoichiometries):
modmass += int(amount) * float(elements2mass[element])
unmodformula = AA2formula[mod_dict[symbol][0]]
elements, stoichiometries = _elemental_composition(unmodformula, elements2mass)
unmodmass = 0
for element, amount in zip(elements, stoichiometries):
unmodmass += int(amount) * float(elements2mass[element])
mod_dict[symbol].append(AA2name[symbol])
mod_dict[symbol].append(modmass-unmodmass)
#
# mod2mass = {}
# mod2name = {}
#
# for AA in AA2formula:
# if AA not in 'RHKDESTNQCUGPAVILMFYW':
# formula = AA2formula[AA]
# elements, stoichiometries = _elemental_composition(formula, elements2mass)
# mass = 0
# for element, amount in zip(elements, stoichiometries):
# mass += int(amount) * float(elements2mass[element])
#
# mod2mass[AA] = mass
# mod2name[AA] = AA2name[AA]
return mod_dict
def _elemental_composition(formula, elements2mass):
"""
Compute two lists with elements and amount of atoms from
a chemical cormula
Args:
formula (string): a chemical formula like C6H12N2O
elements2mass (dict): a dict mapping elements to mass
Returns:
elements: a list of the elements in the formula
stoichiometries: a list of the amounts of these elements
"""
elements = []
this_stoichiometrie = None
stoichiometries = []
collected_stoichiometry = True # set initial to true to avoid
# setting element 1 to 1
# formula is e.g. C6H12N2O
for char in formula:
# check if character is an element
if char in elements2mass.keys():
elements.append(char)
# only enter if this_stoichiometrie has been filled
if this_stoichiometrie:
# if stoichiometrie is not given as 1
stoichiometries.append(this_stoichiometrie)
this_stoichiometrie = ''
else:
this_stoichiometrie = ''
if collected_stoichiometry is False:
stoichiometries.append(1)
collected_stoichiometry = False
else:
this_stoichiometrie += char
collected_stoichiometry = True
if collected_stoichiometry is False:
stoichiometries.append(1)
else:
stoichiometries.append(this_stoichiometrie)
return elements, stoichiometries
[docs]def Read(stavrox_files, ssf_file, col_order=None, compact=False):
"""
Collect data from StavroX spectrum search and return an xtable data array.
Args:
stavrox_files: path or list of paths to StavroX output file(s)
ssf_file: properties.ssf to load modification IDs and masses
col_order (list): List of xTable column titles that are used to sort and compress the resulting datatable
compact (bool): Whether to compact the xTable to only those columns listed in col_order
Returns:
pandas.DataFrame: xtable data table
"""
# convert to list if the input is only a single path
if not isinstance(stavrox_files, list):
stavrox_files = [stavrox_files]
allData = list()
stavrox_dtypes = {'Scan number': str,
'Charge': 'int16',
'Protein 1 From': pd.Int64Dtype(),
'Protein 2 From': pd.Int64Dtype(),
'Protein 1': str,
'Protein 2': str,
'Peptide 1': str,
'Peptide 2': str,
'best linkage position peptide 1': str,
'best linkage position peptide 2': str,
'Score': float}
for file in stavrox_files:
print('Reading StavroX-file: {}'.format(file))
try:
with open(hf.compatible_path(file), 'r') as f:
firstline = f.readline()
headers = list()
for element in firstline.split(';'):
if element not in ['From', 'To']:
# there is a typo in the StavroX output files
if element == 'Peptide2':
headers.append('Peptide 2')
last_saved = 'Peptide 2'
else:
headers.append(element)
last_saved = element
else:
headers.append(last_saved + ' ' + element)
# the spectrum UUID column contains a semicolon delimiter! This messes up the whole csv reading
headers.insert(-1, 'Spectrum UUID2')
print(headers)
# Reassign the column headers to avoid duplicate From and To fields
s = pd.read_csv(hf.compatible_path(file),
delimiter=';',
header=0,
index_col = False,
dtype=stavrox_dtypes,
names = headers)
allData.append(s)
except:
raise Exception('[StavroX Read] Failed opening file: {}'.format(file))
xtable = pd.concat(allData)
### Process the data to comply to xTable format
xtable = xtable.rename(columns={'Protein 1 From': 'pos1',
'Protein 2 From': 'pos2',
'Score': 'score'
})
# the field Scan number contains the mgf file header. Use Regex to extract
# scan no
xtable[['rawfile', 'scanno', 'prec_ch']] = xtable['Scan number'].str.extract(hf.regexDict['mgfTITLE'])
print('[StavroX Read] Parsed MGF title')
# calculate the type of line (i.e. mono, loop, intra or inter)
xtable['type'] = np.vectorize(_type_from_proteins)(xtable['Protein 1'], xtable['Protein 2'])
print('[StavroX Read] inferred type')
# Set pos1 to 1 if Nterminal cross-link
xtable['pos1'] = xtable['pos1'].replace(0, 1)
xtable['pos2'] = xtable['pos2'].replace(0, 1)
print('[StavroX Read] changed xlink positions')
# remove for example preceding > in UniProt headers
xtable['prot1'] = xtable['Protein 1'].apply(_clear_protname)
xtable['prot2'] = xtable['Protein 2'].apply(_clear_protname)
print('[StavroX Read] Cleared Protein names')
# Best linkage position also contains the linked AA (that is already given
# by sequence and link position)
# --> xtract only the numerical part
# StavroX treats the N-terminus as 0th position --> replace by 1
xtable['xlink1'] = xtable['best linkage position peptide 1'].apply(_clear_xlink).replace(0, 1)
xtable['xlink2'] = xtable['best linkage position peptide 2'].apply(_clear_xlink).replace(0, 1)
print('[StavroX Read] Found xlink position')
# calculate absolute position of xlink as sum of start of peptide
# and relative position of the xlink
xtable['xpos1'] = xtable['xlink1'].astype(int) + xtable['pos1'].astype(int) - 1
# xpos2 has to be calculated separately for inter/intra, loop and mono-peptides
xtable['xpos2'] =\
np.vectorize(_calc_xpos2)(xtable['type'], xtable['pos1'], xtable['pos2'], xtable['xlink2'])
print('[StavroX Read] Generated xpos')
mod_dict = _parse_ssf(hf.compatible_path(ssf_file))
print('[StavroX Read] parsed SSF')
# Extract the modification mass and position from the peptide string
xtable[['mod1', 'modpos1', 'modmass1', 'pepseq1', 'mod2', 'modpos2', 'modmass2', 'pepseq2']] =\
pd.DataFrame(xtable[['Peptide 1', 'Peptide 2']].apply(\
lambda row: _mods_and_sequences_from_peptides(row['Peptide 1'],
row['Peptide 2'],
mod_dict),
axis=1).tolist(), index=xtable.index)
print('[StavroX Read] Extracted modifications and sequences')
# generate an ID for every crosslink position within the protein(s)
xtable['ID'] =\
pd.Series(np.vectorize(hf.generate_id,
otypes=['object'])(xtable['type'],
xtable['prot1'],
xtable['xpos1'],
xtable['prot2'],
xtable['xpos2']),
index=xtable.index).replace('nan', np.nan)
print('[StavroX Read] Generated ID')
# Stavrox does not run on isotope-labeled xlinkers
xtable['xtype'] = np.nan
if len(xtable[xtable['type'] == 'inter']) > 0:
# Reassign the type for inter xlink to inter/intra/homomultimeric
onlyInter = xtable['type'] == 'inter'
xtable.loc[onlyInter, 'type'] =\
np.vectorize(hf.categorize_inter_peptides)(xtable[onlyInter]['prot1'],
xtable[onlyInter]['pos1'],
xtable[onlyInter]['pepseq1'],
xtable[onlyInter]['prot2'],
xtable[onlyInter]['pos2'],
xtable[onlyInter]['pepseq1'])
print('[StavroX Read] categorized inter peptides')
else:
print('[StavroX Read] skipped inter peptide categorization')
# StavroX already filters decoys
xtable['decoy'] = False
# reassign dtypes for every element in the df
# errors ignore leaves the dtype as object for every
# non-numeric element
xtable = xtable.apply(pd.to_numeric, errors = 'ignore')
xtable['search_engine'] = 'StavroX'
xtable = hf.order_columns(xtable, col_order, compact)
return xtable
if __name__ == '__main__':
# defines the column headers required for xtable output
col_order = [ 'rawfile', 'scanno', 'prec_ch',
'pepseq1', 'xlink1',
'pepseq2', 'xlink2', 'xtype',
'modmass1', 'modpos1', 'mod1',
'modmass2', 'modpos2', 'mod2',
'prot1', 'xpos1', 'prot2',
'xpos2',
'type', 'score',
'ID',
'pos1', 'pos2', 'decoy']
stavrox_files = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\stavrox\20180615_KS_CL_9_pXtract.csv'
ssf_file = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\stavrox\StavroxSettings.ssf'
xtable = Read(stavrox_files, ssf_file, col_order=col_order, compact=False)