Source code for croco.XiSearchFDR

# -*- coding: utf-8 -*-

"""
Functions to read Xi processed crosslink data filtered with xiFDR.

This script is part of the CroCo cross-link converter project
"""

import numpy as np
import pandas as pd

import os, re

from collections import defaultdict

if __name__ == '__main__':
    import HelperFunctions as hf
    import Xi as xi
else:
    from . import Xi as xi
    from . import HelperFunctions as hf

def _assign_type(row):
    """
    Assign mono, loop, inter and intra link
    based on prot1, prot2, xlink1 and xlink2 entries

    Args:
        row (Series): a series or list containing prot1, prot2, xlink1, xlink2
    Returns:
        str or np.nan: type of cross-link (inter, intra, loop, mono)
    """
    prot1, prot2, xlink1, xlink2 = row

    prot1 = str(prot1)
    prot2 = str(prot2)
    xlink1 = str(xlink1)
    xlink2 = str(xlink2)

    if prot2 != 'nan' and prot1 == prot2:
        t = 'intra'
    elif prot2 != 'nan':
        t = 'inter'
    elif prot2 == 'nan' and xlink2 != 'nan':
        t = 'loop'
    elif prot1 != 'nan' and prot2 == 'nan' and xlink1 != 'nan':
        t = 'mono'
    else:
        t = np.nan
    return t

def _modifications_from_sequence(sequence, moddict):
    """
    Extract a modification name and its position from a sequence containing
    the modifications as lowercase characters
    
    Args:
        sequence (str): a sequence to be parsed
        moddict (dict): a dictionary mapping symbols for modified amino acids to tuples of corresponding amino acids and their masses
    Returns:
        str: sequence without the modification characters
        list of str: modification names
        list of int: modification positions within the peptide
    """
    mods = []
    modposns = []
    modmasses = []
       
    # list containign the symbal, start position, end position of the
    # symbol in the original sequence
    found = list()
    for symbol in moddict.keys():
        if symbol in sequence:
            for m in re.finditer(symbol, sequence):
                found.append((symbol, m.start(), m.end()))
            sequence = re.sub(symbol, moddict[symbol][0], sequence)

    # sort in place
    found.sort(key=lambda x: x[1])
    
    bias = 0
    for match in found:
        mods.append(match[0])
        modmasses.append(moddict[match[0]][1])
        modposns.append(1+match[1]-bias)
        bias += len(match[0]) - len(moddict[match[0]][0])

    return sequence, mods, modposns, modmasses

def _mods_from_xi_config(xi_config):
    """
    Extract a modifications dictionary from a xi config file
    
    Args:
        xi_config (str): path to xi_config file
    
    Returns:
        dict: dictionary mapping modification symbols to a list of their unmodified counterpart and the modified mass
    """
    
    moddict = dict()
    
    modification_w_mass_pattern = re.compile(r'modification:\w+::SYMBOL:(\w+);MODIFIED:(\w+);MASS:(\d+(?:\.\d+))')
    modification_w_deltamass_pattern = re.compile(r'modification:\w+::SYMBOLEXT:(\w+);MODIFIED:([\w,]+);DELTAMASS:(\d+(?:\.\d+))')
    
    # dict with aa masses to calculate exact masses from deltamasses
    aa2mass = {'G': 57.02147,
               'A': 71.03712,
               'S': 87.03203,
               'P': 97.05277,
               'V': 99.06842,
               'T': 101.04768,
               'C': 103.00919,
               'I': 113.08407,
               'L': 113.08407,
               'N': 114.04293,
               'D': 115.02695,
               'Q': 128.05858,
               'K': 128.09497,
               'E': 129.0426,
               'M': 131.04049,
               'H': 137.05891,
               'F': 147.06842,
               'R': 156.10112,
               'Y': 163.06333,
               'W': 186.07932}
    
    with open(xi_config) as xcfg:
        for line in xcfg.readlines():
            if line.startswith('#'):
                continue
            # amino acids can contain modifications
            if line.startswith('modification:'):
                # does this line look like a modification line with exact mass
                if modification_w_mass_pattern.match(line):
                    print('Found mass instead of deltamass')
                    match = modification_w_mass_pattern.match(line)
                    symbol, aa, mass = match.groups()
                    # calculate the deltamass based on amino acids masses
                    deltamass = float(mass) - aa2mass[aa]
                    moddict[symbol] = aa, float(deltamass)
                # or does it look like a deltamass modline
                elif modification_w_deltamass_pattern.match(line):
                    match = modification_w_deltamass_pattern.match(line)
                    symbolext, aas, deltamass = match.groups()
                    # deltamass lines can contain multiple aminoacids to be modified
                    for aa in aas.split(','):
                        symbol = aa + symbolext
                        moddict[symbol] = aa, float(deltamass)
            # or monolinked cross-linkers
            elif line.startswith('crosslinker:SymetricSingleAminoAcidRestrictedCrossLinker:'):
                # at pos 57 the string "crosslinker:SymetricSingleAminoAcidRestrictedCrossLinker:" ends
                for element in line[57:].split(';'): 
                    element_key, element_val = element.split(':')
                    if element_key == 'Name':
                        xl_name = element_val
                    elif element_key == 'MASS':
                        mass = float(element_val)
                    elif element_key == 'LINKEDAMINOACIDS':
                        aas = element_val
                    elif element_key == 'MODIFICATIONS':
                        mod_symbols = list()
                        mod_masses = list()
                        for i, e in enumerate(element_val.split(',')):
                            if i%2 == 0: #even indices
                                mod_symbols.append(e)
                            else:
                                mod_masses.append(e)
                for aa in aas.split(','):
                    for idx, symbolext in enumerate(mod_symbols):
                        symbol = aa + xl_name.lower() + symbolext.lower()
                        moddict[symbol] = aa, float(mass) + float(mod_masses[idx])
                
    return moddict
        

[docs]def Read(xifdr_files, xi_config, col_order=None, compact=False):
    """
    Collects data from Xi spectrum search filtered by xiFDR and returns an xtable data array.

    Args:
        xifdr_files: path or list of paths to xiFDR file(s)
        xi_config: path to corresponding xi_config file
        col_order (list): List of xTable column titles that are used to sort and compress the resulting datatable
        compact (bool): Whether to keep the columns of the original dataframe or not

    Returns:
        xtable: xtable data table
    """
    # convert to list if the input is only a single path
    if not isinstance(xifdr_files, list):
        xifdr_files = [xifdr_files]

    allData = list()

    xifdr_dtypes = {'scan': pd.Int64Dtype(),
                   'exp charge': pd.Int64Dtype(),
                   'PepSeq1': str,
                   'PepSeq2': str,
                   'LinkPos1': pd.Int16Dtype(),
                   'LinkPos2': pd.Int16Dtype(),
                   'Protein1': str,
                   'Protein2': str,
                   'ProteinLinkPos1': pd.Int16Dtype(),
                   'ProteinLinkPos2': pd.Int16Dtype(),
                   'PepPos1': pd.Int16Dtype(),
                   'PepPos2': pd.Int16Dtype(),
                   }

    for file in xifdr_files:

        print('Reading xiFDR-file: {}'.format(file))
        try:
            s = pd.read_csv(hf.compatible_path(file), delimiter=',', dtype=xifdr_dtypes)
            allData.append(s)
        except:
            raise Exception('[xTable Read] Failed opening file: {}'.format(file))

    xtable = pd.concat(allData)

    ### Process the data to comply to xTable format

    xtable = xtable.rename(columns={#rawfile
                                    'exp charge': 'prec_ch',
                                    'LinkPos1': 'xlink1',
                                    'LinkPos2': 'xlink2',
                                    'Protein1': 'prot1',
                                    'ProteinLinkPos1': 'xpos1',
                                    'Protein2': 'prot2',
                                    'ProteinLinkPos2': 'xpos2',
                                    'PepPos1': 'pos1',
                                    'PepPos2': 'pos2',
                                    'Score': 'score'
                                    })

    # split the run column from Xi into two columns: rawfile and scanno
    xtable['rawfile'], xtable['scanno'] = xtable['run'].str.split('.', 1).str

    moddict = _mods_from_xi_config(xi_config)

    # Extract clean sequence and modificiations from the sequence string
    xtable[['pepseq1', 'mod1', 'modpos1', 'modmass1']] =\
        pd.DataFrame(xtable['PepSeq1'].apply(lambda x: _modifications_from_sequence(x, moddict)).tolist(), index=xtable.index)
    xtable[['pepseq2', 'mod2', 'modpos2', 'modmass2']] =\
        pd.DataFrame(xtable['PepSeq2'].apply(lambda x: _modifications_from_sequence(x, moddict)).tolist(), index=xtable.index)

    # assign cateogries of cross-links based on identification of prot1 and prot2
    xtable['type'] = xtable[['prot1', 'prot2', 'xlink1', 'xlink2']].apply(\
          _assign_type, axis=1)
    
    if len(xtable[xtable['type'] == 'inter']) > 0:
        # Reassign the type for inter xlink to inter/intra/homomultimeric
        onlyInter = xtable['type'] == 'inter'
        xtable.loc[onlyInter, 'type'] =\
            np.vectorize(hf.categorize_inter_peptides)(xtable[onlyInter]['prot1'],
                                                     xtable[onlyInter]['pos1'],
                                                     xtable[onlyInter]['pepseq1'],
                                                     xtable[onlyInter]['prot2'],
                                                     xtable[onlyInter]['pos2'],
                                                     xtable[onlyInter]['pepseq1'])
        print('[xiFDR Read] categorized inter peptides')
    else:
        print('[xiFDR Read] skipped inter peptide categorization')    
 
    
    # generate an ID for every crosslink position within the protein(s)
    xtable['ID'] =\
        pd.Series(np.vectorize(hf.generate_id,
                               otypes=['object'])(xtable['type'],
                                                  xtable['prot1'],
                                                  xtable['xpos1'],
                                                  xtable['prot2'],
                                                  xtable['xpos2']),
                index=xtable.index).replace('nan', np.nan)

    xtable['decoy'] = xtable['Decoy1'] | xtable['Decoy2']

    xtable['xtype'] = np.nan

    xtable['search_engine'] = 'XiSearchFDR'

    xtable = hf.order_columns(xtable, col_order, compact)
    
    return xtable

if __name__ == '__main__':

    # defines the column headers required for xtable output
    col_order = [ 'rawfile', 'scanno', 'prec_ch',
                  'pepseq1', 'xlink1',
                  'pepseq2', 'xlink2', 'xtype',
                  'modmass1', 'modpos1', 'mod1',
                  'modmass2', 'modpos2', 'mod2',
                  'prot1', 'xpos1', 'prot2',
                  'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy']

    os.chdir(r'C:\Users\User\Documents\03_software\python\CroCo\testdata\final\input\xi_xiFDR')
    xifdr_files = r'XiFDR_5_FDR_PSM_PSM_xiFDR1.0.22.csv'
    xi_config = r'xi_config.conf'
    xtable = Read(xifdr_files, xi_config, compact=False)