# Source code for croco.xWalk

# -*- coding: utf-8 -*-

"""
Functions to write data as input for xWalk.

"""

import pandas as pd
import numpy as np
import os

if __name__ == '__main__':
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf

def _aminoacid_from_sequence(pepseq, xlink):
    """
    Return the 3-character amino acid label of the cross-linked AA
    from a peptide sequence
    and the relative position of the cross-linker in the sequence
    
    Args:
        pepseq (str): peptide sequence
        xlink (int): position of the cross-link within the sequence
    Returns:
        str: 3-letter amino acid code for the cross-linked amino acid
    """

    aa_dict = {'R': 'ARG',
           'H': 'HIS',
           'K': 'LYS',
           'D': 'ASP',
           'E': 'GLU',
           'S': 'SER',
           'T': 'THR',
           'N': 'ASN',
           'Q': 'GLN',
           'C': 'CYS',
           'U': 'SEC',
           'G': 'GLY',
           'P': 'PRO',
           'A': 'ALA',
           'V': 'VAL',
           'I': 'ILE',
           'L': 'LEU',
           'M': 'MET',
           'F': 'PHE',
           'Y': 'TYR',
           'W': 'TRP'
           }

    try:
        AA = aa_dict[pepseq[int(xlink)-1].upper()]
    except:
        raise Exception('[xWalk] Could not translate amino acid {} from the sequence {} into 3-letter code. Please manually correct the sequence.'.format(pepseq[int(xlink)-1].upper(), pepseq))

    return AA


[docs]def Write(xtable, outpath, pdb, offset, chains, atom):
    """
    Convert xTable into a list format that can be used as
    input for the xWalk standalone programme.

    Format is:

    index \t pdb-file \t RESIDUE-NO--ATOM \t RESIDUE-NO--ATOM

    As xWalk can only validate one protein at a time, the function
    generates several oouput files for all intra-protein cross-links

    Args:
        xtable (pandas.DataFrame): data table structure
        pdb (str): PDB-file name
        offset (list or str): shift between PDB AA indices and the xTable
        chains: (dict or str) comma separated list protein:chain allocations
        atom (str): Atom identifier (e.g. CB)
        outpath (str): path to write file
    """

    pdbBase = os.path.basename(pdb)

    if not pdbBase.endswith('.pdb'):
        raise Exception('Please provide a valid PDB file')

    # drop duplicates
    xtable.drop_duplicates(inplace=True,
                           keep='first',
                           subset='ID')


    xtable['File name'] = pdbBase

    if len(atom.strip()) > 4:
        raise Exception('[xWalk Write] Please provide PDB atom code with at most 4 characters')

    xtable['atom'] = str(atom).upper()

    chainDict = dict()
    
    if not isinstance(chains, dict):
        if isinstance(chains, str):
            chains = [x.strip() for x in chains.split(',')]
        try:
            for annotation in chains:
                if annotation != '':
                    protein, chain = annotation.strip().split(':')
                    # by generating a list of the string, all characters will be represented
                    # as single chain identifiers
                    chainDict[protein] = list(chain.upper())
        except:
            raise Exception('[xWalk Write] Please specify protein:chain in an comma-separated list from the GUI or as a dict')

    # drop duplicates on the cross-link position as only the absolute position
    # is relevant to xWalk
    xtable.drop_duplicates(subset=['xpos1', 'xpos2'], keep='first', inplace=True)

    # remove rows that contain NaN in prot1 or prot2 i.e. monolinks
    xtable.dropna(subset=['prot1', 'prot2'], inplace=True)

    # set the 3-character code for the cross-linked amino acids
    xtable['linked_aa1'] = np.vectorize(_aminoacid_from_sequence)\
        (xtable['pepseq1'],
         xtable['xlink1'])

    xtable['linked_aa2'] = np.vectorize(_aminoacid_from_sequence)\
        (xtable['pepseq2'],
         xtable['xlink2'])

    allChainTables = list()

    for proteinA in chainDict.keys():
        for chainA in chainDict[proteinA]:
            for proteinB in chainDict.keys():
                for chainB in chainDict[proteinB]:
                    thisXTable = xtable[(xtable['prot1'] == proteinA) &\
                                        (xtable['prot2'] == proteinB)][['File name',
                                                                        'atom',
                                                                        'pepseq1',
                                                                        'pepseq2',
                                                                        'xpos1',
                                                                        'xpos2',
                                                                        'prot1',
                                                                        'prot2',
                                                                        'linked_aa1',
                                                                        'linked_aa2']]
                    thisXTable['chain1'] = chainA
                    thisXTable['chain2'] = chainB

                    allChainTables.append(thisXTable)

    xWalkTable = pd.concat(allChainTables)

    # to assign offsets to every protein, a single integer (one for all) or a 
    # dict mapping protein names to offsets is required
    if not isinstance(offset, dict):
        # convert the offset user-input into an integer as requried for pandas below
        try:
            # the input is a single integer
            offset = int(offset)
        except:
            # the input is a list of protein:offset pair strings
            try:
                offset = [x.strip() for x in offset.split(',')]
            except:
                raise Exception('[xWalk Write] Please provide an integer offset for all chains or a list of protein:offset assignments!')
    
    # if the offset is an integer, use it for all protein positions
    if isinstance(offset, int):
        xWalkTable['xpos1'] += offset
        xWalkTable['xpos2'] += offset
    else:
        # if it is a list (see above) the protein:offset pairs are parted
        if isinstance(offset, list):
            offsetDict = dict()
            try:
                for annotation in offset:
                    protein, offset = annotation.strip().split(':')
                    offsetDict[protein] = int(offset)
            except:
                raise Exception('[xWalk Write] Please specify protein:offset in an comma-separated list from the GUI or as a dict')
        # if it si a dict, it can directly be used
        if isinstance(offset, dict):
            offsetDict = offset
    
        try:
            print(offsetDict)
            for pr, of in offsetDict.items():
                xWalkTable.loc[xWalkTable['prot1'] == pr, 'xpos1'] += of
                xWalkTable.loc[xWalkTable['prot2'] == pr, 'xpos2'] += of
        except:
            raise Exception('[xWalk Write] error during assignment of offsets to proteins')

    atomInfo1 = list()
    atomInfo2 = list()

    for idx, row in xWalkTable.iterrows():
        atomInfo1.append('-'.join([str(row['linked_aa1']),
                                   str(int(row['xpos1'])),
                                   str(row['chain1']),
                                   str(row['atom'])]))

        atomInfo2.append('-'.join([str(row['linked_aa2']),
                                   str(int(row['xpos2'])),
                                   str(row['chain2']),
                                   str(row['atom'])]))

    xWalkTable['Atom Info 1'] = atomInfo1
    xWalkTable['Atom Info 2'] = atomInfo2

    # Remove those amino acids interacting with itself (distance = 0)
    xWalkTable = xWalkTable[xWalkTable['Atom Info 1'] != xWalkTable['Atom Info 2']]

    xWalkTable.reset_index(inplace=True)
    # increase df index by 1
    xWalkTable.index = range(1,len(xWalkTable)+1)

    xWalkTable.loc[:, ['File name', 'Atom Info 1', 'Atom Info 2']]\
        .to_csv('{}_{}.tsv'.format(hf.compatible_path(outpath), 'xWalk'),
                                   header=False,
                                   index = True,
                                   index_label = 'Index',
                                   sep='\t')

if __name__ == '__main__':
    from xTable import Read

    pdb = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\1pkn.pdb'
    atom = 'CB'
    out = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\output\xTable_to_vis\xWalk'
    chains = 'P11974:A'
    offset = 'P11974:-1'

    xtable = Read(r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\output\all_merged_xTable_intra.xlsx')

    xtable = Write(xtable=xtable, outpath=out, pdb=pdb, offset=offset, chains=chains, atom=atom)