# -*- coding: utf-8 -*-
"""
Functions to read Xi processed crosslink data filtered with xiFDR.
This script is part of the CroCo cross-link converter project
"""
import numpy as np
import pandas as pd
import os, re
from collections import defaultdict
if __name__ == '__main__':
import HelperFunctions as hf
import Xi as xi
else:
from . import Xi as xi
from . import HelperFunctions as hf
def _assign_type(row):
"""
Assign mono, loop, inter and intra link
based on prot1, prot2, xlink1 and xlink2 entries
Args:
row (Series): a series or list containing prot1, prot2, xlink1, xlink2
Returns:
str or np.nan: type of cross-link (inter, intra, loop, mono)
"""
prot1, prot2, xlink1, xlink2 = row
prot1 = str(prot1)
prot2 = str(prot2)
xlink1 = str(xlink1)
xlink2 = str(xlink2)
if prot2 != 'nan' and prot1 == prot2:
t = 'intra'
elif prot2 != 'nan':
t = 'inter'
elif prot2 == 'nan' and xlink2 != 'nan':
t = 'loop'
elif prot1 != 'nan' and prot2 == 'nan' and xlink1 != 'nan':
t = 'mono'
else:
t = np.nan
return t
def _modifications_from_sequence(sequence, moddict):
"""
Extract a modification name and its position from a sequence containing
the modifications as lowercase characters
Args:
sequence (str): a sequence to be parsed
moddict (dict): a dictionary mapping symbols for modified amino acids to tuples of corresponding amino acids and their masses
Returns:
str: sequence without the modification characters
list of str: modification names
list of int: modification positions within the peptide
"""
mods = []
modposns = []
modmasses = []
# list containign the symbal, start position, end position of the
# symbol in the original sequence
found = list()
for symbol in moddict.keys():
if symbol in sequence:
for m in re.finditer(symbol, sequence):
found.append((symbol, m.start(), m.end()))
sequence = re.sub(symbol, moddict[symbol][0], sequence)
# sort in place
found.sort(key=lambda x: x[1])
bias = 0
for match in found:
mods.append(match[0])
modmasses.append(moddict[match[0]][1])
modposns.append(1+match[1]-bias)
bias += len(match[0]) - len(moddict[match[0]][0])
return sequence, mods, modposns, modmasses
def _mods_from_xi_config(xi_config):
"""
Extract a modifications dictionary from a xi config file
Args:
xi_config (str): path to xi_config file
Returns:
dict: dictionary mapping modification symbols to a list of their unmodified counterpart and the modified mass
"""
moddict = dict()
modification_w_mass_pattern = re.compile(r'modification:\w+::SYMBOL:(\w+);MODIFIED:(\w+);MASS:(\d+(?:\.\d+))')
modification_w_deltamass_pattern = re.compile(r'modification:\w+::SYMBOLEXT:(\w+);MODIFIED:([\w,]+);DELTAMASS:(\d+(?:\.\d+))')
# dict with aa masses to calculate exact masses from deltamasses
aa2mass = {'G': 57.02147,
'A': 71.03712,
'S': 87.03203,
'P': 97.05277,
'V': 99.06842,
'T': 101.04768,
'C': 103.00919,
'I': 113.08407,
'L': 113.08407,
'N': 114.04293,
'D': 115.02695,
'Q': 128.05858,
'K': 128.09497,
'E': 129.0426,
'M': 131.04049,
'H': 137.05891,
'F': 147.06842,
'R': 156.10112,
'Y': 163.06333,
'W': 186.07932}
with open(xi_config) as xcfg:
for line in xcfg.readlines():
if line.startswith('#'):
continue
# amino acids can contain modifications
if line.startswith('modification:'):
# does this line look like a modification line with exact mass
if modification_w_mass_pattern.match(line):
print('Found mass instead of deltamass')
match = modification_w_mass_pattern.match(line)
symbol, aa, mass = match.groups()
# calculate the deltamass based on amino acids masses
deltamass = float(mass) - aa2mass[aa]
moddict[symbol] = aa, float(deltamass)
# or does it look like a deltamass modline
elif modification_w_deltamass_pattern.match(line):
match = modification_w_deltamass_pattern.match(line)
symbolext, aas, deltamass = match.groups()
# deltamass lines can contain multiple aminoacids to be modified
for aa in aas.split(','):
symbol = aa + symbolext
moddict[symbol] = aa, float(deltamass)
# or monolinked cross-linkers
elif line.startswith('crosslinker:SymetricSingleAminoAcidRestrictedCrossLinker:'):
# at pos 57 the string "crosslinker:SymetricSingleAminoAcidRestrictedCrossLinker:" ends
for element in line[57:].split(';'):
element_key, element_val = element.split(':')
if element_key == 'Name':
xl_name = element_val
elif element_key == 'MASS':
mass = float(element_val)
elif element_key == 'LINKEDAMINOACIDS':
aas = element_val
elif element_key == 'MODIFICATIONS':
mod_symbols = list()
mod_masses = list()
for i, e in enumerate(element_val.split(',')):
if i%2 == 0: #even indices
mod_symbols.append(e)
else:
mod_masses.append(e)
for aa in aas.split(','):
for idx, symbolext in enumerate(mod_symbols):
symbol = aa + xl_name.lower() + symbolext.lower()
moddict[symbol] = aa, float(mass) + float(mod_masses[idx])
return moddict
[docs]def Read(xifdr_files, xi_config, col_order=None, compact=False):
"""
Collects data from Xi spectrum search filtered by xiFDR and returns an xtable data array.
Args:
xifdr_files: path or list of paths to xiFDR file(s)
xi_config: path to corresponding xi_config file
col_order (list): List of xTable column titles that are used to sort and compress the resulting datatable
compact (bool): Whether to keep the columns of the original dataframe or not
Returns:
xtable: xtable data table
"""
# convert to list if the input is only a single path
if not isinstance(xifdr_files, list):
xifdr_files = [xifdr_files]
allData = list()
xifdr_dtypes = {'scan': pd.Int64Dtype(),
'exp charge': pd.Int64Dtype(),
'PepSeq1': str,
'PepSeq2': str,
'LinkPos1': pd.Int16Dtype(),
'LinkPos2': pd.Int16Dtype(),
'Protein1': str,
'Protein2': str,
'ProteinLinkPos1': pd.Int16Dtype(),
'ProteinLinkPos2': pd.Int16Dtype(),
'PepPos1': pd.Int16Dtype(),
'PepPos2': pd.Int16Dtype(),
}
for file in xifdr_files:
print('Reading xiFDR-file: {}'.format(file))
try:
s = pd.read_csv(hf.compatible_path(file), delimiter=',', dtype=xifdr_dtypes)
allData.append(s)
except:
raise Exception('[xTable Read] Failed opening file: {}'.format(file))
xtable = pd.concat(allData)
### Process the data to comply to xTable format
xtable = xtable.rename(columns={#rawfile
'exp charge': 'prec_ch',
'LinkPos1': 'xlink1',
'LinkPos2': 'xlink2',
'Protein1': 'prot1',
'ProteinLinkPos1': 'xpos1',
'Protein2': 'prot2',
'ProteinLinkPos2': 'xpos2',
'PepPos1': 'pos1',
'PepPos2': 'pos2',
'Score': 'score'
})
# split the run column from Xi into two columns: rawfile and scanno
xtable['rawfile'], xtable['scanno'] = xtable['run'].str.split('.', 1).str
moddict = _mods_from_xi_config(xi_config)
# Extract clean sequence and modificiations from the sequence string
xtable[['pepseq1', 'mod1', 'modpos1', 'modmass1']] =\
pd.DataFrame(xtable['PepSeq1'].apply(lambda x: _modifications_from_sequence(x, moddict)).tolist(), index=xtable.index)
xtable[['pepseq2', 'mod2', 'modpos2', 'modmass2']] =\
pd.DataFrame(xtable['PepSeq2'].apply(lambda x: _modifications_from_sequence(x, moddict)).tolist(), index=xtable.index)
# assign cateogries of cross-links based on identification of prot1 and prot2
xtable['type'] = xtable[['prot1', 'prot2', 'xlink1', 'xlink2']].apply(\
_assign_type, axis=1)
if len(xtable[xtable['type'] == 'inter']) > 0:
# Reassign the type for inter xlink to inter/intra/homomultimeric
onlyInter = xtable['type'] == 'inter'
xtable.loc[onlyInter, 'type'] =\
np.vectorize(hf.categorize_inter_peptides)(xtable[onlyInter]['prot1'],
xtable[onlyInter]['pos1'],
xtable[onlyInter]['pepseq1'],
xtable[onlyInter]['prot2'],
xtable[onlyInter]['pos2'],
xtable[onlyInter]['pepseq1'])
print('[xiFDR Read] categorized inter peptides')
else:
print('[xiFDR Read] skipped inter peptide categorization')
# generate an ID for every crosslink position within the protein(s)
xtable['ID'] =\
pd.Series(np.vectorize(hf.generate_id,
otypes=['object'])(xtable['type'],
xtable['prot1'],
xtable['xpos1'],
xtable['prot2'],
xtable['xpos2']),
index=xtable.index).replace('nan', np.nan)
xtable['decoy'] = xtable['Decoy1'] | xtable['Decoy2']
xtable['xtype'] = np.nan
xtable['search_engine'] = 'XiSearchFDR'
xtable = hf.order_columns(xtable, col_order, compact)
return xtable
if __name__ == '__main__':
# defines the column headers required for xtable output
col_order = [ 'rawfile', 'scanno', 'prec_ch',
'pepseq1', 'xlink1',
'pepseq2', 'xlink2', 'xtype',
'modmass1', 'modpos1', 'mod1',
'modmass2', 'modpos2', 'mod2',
'prot1', 'xpos1', 'prot2',
'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy']
os.chdir(r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\input\xi_xiFDR')
xifdr_files = r'XiFDR_5_FDR_PSM_PSM_xiFDR1.0.22.csv'
xi_config = r'xi_config.conf'
xtable = Read(xifdr_files, xi_config, compact=False)