# -*- coding: utf-8 -*-
"""
Functions to write data as input for xWalk.
"""
import pandas as pd
import numpy as np
import os
if __name__ == '__main__':
import HelperFunctions as hf
else:
from . import HelperFunctions as hf
def _aminoacid_from_sequence(pepseq, xlink):
"""
Return the 3-character amino acid label of the cross-linked AA
from a peptide sequence
and the relative position of the cross-linker in the sequence
Args:
pepseq (str): peptide sequence
xlink (int): position of the cross-link within the sequence
Returns:
str: 3-letter amino acid code for the cross-linked amino acid
"""
aa_dict = {'R': 'ARG',
'H': 'HIS',
'K': 'LYS',
'D': 'ASP',
'E': 'GLU',
'S': 'SER',
'T': 'THR',
'N': 'ASN',
'Q': 'GLN',
'C': 'CYS',
'U': 'SEC',
'G': 'GLY',
'P': 'PRO',
'A': 'ALA',
'V': 'VAL',
'I': 'ILE',
'L': 'LEU',
'M': 'MET',
'F': 'PHE',
'Y': 'TYR',
'W': 'TRP'
}
try:
AA = aa_dict[pepseq[int(xlink)-1].upper()]
except:
raise Exception('[xWalk] Could not translate amino acid {} from the sequence {} into 3-letter code. Please manually correct the sequence.'.format(pepseq[int(xlink)-1].upper(), pepseq))
return AA
[docs]def Write(xtable, outpath, pdb, offset, chains, atom):
"""
Convert xTable into a list format that can be used as
input for the xWalk standalone programme.
Format is:
index \t pdb-file \t RESIDUE-NO--ATOM \t RESIDUE-NO--ATOM
As xWalk can only validate one protein at a time, the function
generates several oouput files for all intra-protein cross-links
Args:
xtable (pandas.DataFrame): data table structure
pdb (str): PDB-file name
offset (list or str): shift between PDB AA indices and the xTable
chains: (dict or str) comma separated list protein:chain allocations
atom (str): Atom identifier (e.g. CB)
outpath (str): path to write file
"""
pdbBase = os.path.basename(pdb)
if not pdbBase.endswith('.pdb'):
raise Exception('Please provide a valid PDB file')
# drop duplicates
xtable.drop_duplicates(inplace=True,
keep='first',
subset='ID')
xtable['File name'] = pdbBase
if len(atom.strip()) > 4:
raise Exception('[xWalk Write] Please provide PDB atom code with at most 4 characters')
xtable['atom'] = str(atom).upper()
chainDict = dict()
if not isinstance(chains, dict):
if isinstance(chains, str):
chains = [x.strip() for x in chains.split(',')]
try:
for annotation in chains:
if annotation != '':
protein, chain = annotation.strip().split(':')
# by generating a list of the string, all characters will be represented
# as single chain identifiers
chainDict[protein] = list(chain.upper())
except:
raise Exception('[xWalk Write] Please specify protein:chain in an comma-separated list from the GUI or as a dict')
# drop duplicates on the cross-link position as only the absolute position
# is relevant to xWalk
xtable.drop_duplicates(subset=['xpos1', 'xpos2'], keep='first', inplace=True)
# remove rows that contain NaN in prot1 or prot2 i.e. monolinks
xtable.dropna(subset=['prot1', 'prot2'], inplace=True)
# set the 3-character code for the cross-linked amino acids
xtable['linked_aa1'] = np.vectorize(_aminoacid_from_sequence)\
(xtable['pepseq1'],
xtable['xlink1'])
xtable['linked_aa2'] = np.vectorize(_aminoacid_from_sequence)\
(xtable['pepseq2'],
xtable['xlink2'])
allChainTables = list()
for proteinA in chainDict.keys():
for chainA in chainDict[proteinA]:
for proteinB in chainDict.keys():
for chainB in chainDict[proteinB]:
thisXTable = xtable[(xtable['prot1'] == proteinA) &\
(xtable['prot2'] == proteinB)][['File name',
'atom',
'pepseq1',
'pepseq2',
'xpos1',
'xpos2',
'prot1',
'prot2',
'linked_aa1',
'linked_aa2']]
thisXTable['chain1'] = chainA
thisXTable['chain2'] = chainB
allChainTables.append(thisXTable)
xWalkTable = pd.concat(allChainTables)
# to assign offsets to every protein, a single integer (one for all) or a
# dict mapping protein names to offsets is required
if not isinstance(offset, dict):
# convert the offset user-input into an integer as requried for pandas below
try:
# the input is a single integer
offset = int(offset)
except:
# the input is a list of protein:offset pair strings
try:
offset = [x.strip() for x in offset.split(',')]
except:
raise Exception('[xWalk Write] Please provide an integer offset for all chains or a list of protein:offset assignments!')
# if the offset is an integer, use it for all protein positions
if isinstance(offset, int):
xWalkTable['xpos1'] += offset
xWalkTable['xpos2'] += offset
else:
# if it is a list (see above) the protein:offset pairs are parted
if isinstance(offset, list):
offsetDict = dict()
try:
for annotation in offset:
protein, offset = annotation.strip().split(':')
offsetDict[protein] = int(offset)
except:
raise Exception('[xWalk Write] Please specify protein:offset in an comma-separated list from the GUI or as a dict')
# if it si a dict, it can directly be used
if isinstance(offset, dict):
offsetDict = offset
try:
print(offsetDict)
for pr, of in offsetDict.items():
xWalkTable.loc[xWalkTable['prot1'] == pr, 'xpos1'] += of
xWalkTable.loc[xWalkTable['prot2'] == pr, 'xpos2'] += of
except:
raise Exception('[xWalk Write] error during assignment of offsets to proteins')
atomInfo1 = list()
atomInfo2 = list()
for idx, row in xWalkTable.iterrows():
atomInfo1.append('-'.join([str(row['linked_aa1']),
str(int(row['xpos1'])),
str(row['chain1']),
str(row['atom'])]))
atomInfo2.append('-'.join([str(row['linked_aa2']),
str(int(row['xpos2'])),
str(row['chain2']),
str(row['atom'])]))
xWalkTable['Atom Info 1'] = atomInfo1
xWalkTable['Atom Info 2'] = atomInfo2
# Remove those amino acids interacting with itself (distance = 0)
xWalkTable = xWalkTable[xWalkTable['Atom Info 1'] != xWalkTable['Atom Info 2']]
xWalkTable.reset_index(inplace=True)
# increase df index by 1
xWalkTable.index = range(1,len(xWalkTable)+1)
xWalkTable.loc[:, ['File name', 'Atom Info 1', 'Atom Info 2']]\
.to_csv('{}_{}.tsv'.format(hf.compatible_path(outpath), 'xWalk'),
header=False,
index = True,
index_label = 'Index',
sep='\t')
if __name__ == '__main__':
from xTable import Read
pdb = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\1pkn.pdb'
atom = 'CB'
out = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\output\xTable_to_vis\xWalk'
chains = 'P11974:A'
offset = 'P11974:-1'
xtable = Read(r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\output\all_merged_xTable_intra.xlsx')
xtable = Write(xtable=xtable, outpath=out, pdb=pdb, offset=offset, chains=chains, atom=atom)