Source code for croco.StavroX

# -*- coding: utf-8 -*-

"""
Functions to read StavroX processed crosslink data.
"""

import numpy as np
import pandas as pd

import os, re
if __name__ == '__main__':
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf


def _type_from_proteins(protein1_string, protein2_string):
    """
    Decide whether its a mono, loop, intra or inter-protein link
    based on the fields Protein 1 and Protein 2 of a StavroX results
    table

    Args:
        protein1_string (str): A Protein1 string from StavroX
        protein2_string (str): A Protein2 string from StavroX
    Returns:
        str: type of cross-link (mono, loop, intra, inter)
    """

    if protein2_string == 'H2O':
        return 'mono'
    elif protein2_string == 'intrapeptidal':
        return 'loop'
    elif protein1_string == protein2_string:
        return 'intra'
    else:
        return 'inter'

def _clear_protname(protname_string):
    """
    Clear fasta-header and return the cleaned string. Also remove StavroX
    specific protein2-terms like H20 and intrapeptidal

    Args:
        protname_string (str): A StavroX-Protein string

    Returns:
        str: cleaned protein string (my be empty string)
    """
    if protname_string.startswith('>'):
        protname_string = protname_string.split(' ')[0]
        return protname_string[1:]
    elif protname_string == 'H2O':
        return ''
    elif protname_string == 'intrapeptidal':
        return ''
    else:
        return protname_string

def _clear_xlink(xlink_string):
    """
    Removes all non-numerical characters from a string.

    Args:
        xlink_string (str): Contains the xlink position with the linked AA
    Returns:
        int: the xlink position without the amino acid label
    """
    xlink_string = re.sub('\D+', '', xlink_string)

    return int(xlink_string)

def _calc_xpos2(t, pos1, pos2, xlink2):
    """
    Calculate the absolute position of a beta-peptide cross-link
    based on what type the xlink is

    Args:
        t(str): type of the crosslink (mono, loop, inter/intra/homomultimeric)
        pos1(int): absolute position of the first AA of the alpha-peptide
        pos2(int): absolute position of the first AA of the beta-peptide
        xlink2(int): relative position of the cross-link in the beta-peptide
    Returns:
        int: absolute position of the cross-link in the beta-peptide
    """

    try:
        if t == 'mono':
            return np.nan
        elif t == 'loop':
            return int(pos1) + int(xlink2) - 1
        else:
            return int(pos2) + int(xlink2) - 1
    except:
        return np.nan

def _mods_and_sequences_from_peptides(peptide1, peptide2, mod_dict):
    """
    Extract modification position and name from two StavroX peptide strings.
    Set peptide2 to np.nan for monolinks and to peptide1 for loop-links

    Args:
        peptide1 (str): Entry of the StavroX Peptide 1 column
        peptide2 (str): Entry of the StavroX Peptide 2 column
        mod_dict (dict): dict mapping modification abbreviations to lists of [Modified AA, Modification name, Modification mass]
    Returns:
        list: name of the modification(s) of peptide1
        list: position of the modification(s) of peptide1
        list: mass(es) of the modification(s) of peptide1
        list: name of the modification(s) of peptide2
        list: position of the modification(s) of peptide2
        list: mass(es) of the modification(s) of peptide2
    """

    if str(peptide2) == '0': # mono-link
        peptide2 = np.nan
        (mod1, modpos1, modmass1, sequence1), (mod2, modpos2, modmass2, sequence2) =\
            _calculate_mod_modpos_modmass(peptide1), ([], [], [])
    elif str(peptide2) == '1': # loop-link
        (mod1, modpos1, modmass1, sequence1), (mod2, modpos2, modmass2, sequence2) =\
            _calculate_mod_modpos_modmass(peptide1, mod_dict), _calculate_mod_modpos_modmass(peptide1, mod_dict)
    else: # regular inter-peptide link
        (mod1, modpos1, modmass1, sequence1), (mod2, modpos2, modmass2, sequence2) =\
            _calculate_mod_modpos_modmass(peptide1, mod_dict), _calculate_mod_modpos_modmass(peptide2, mod_dict)

    return mod1, modpos1, modmass1, sequence1, mod2, modpos2, modmass2, sequence2

def _calculate_mod_modpos_modmass(peptide_string, mod_dict):
    """
    Extraction modification, name of the modification and modification mass
    from a StavroX peptide string

    Args:
        peptide_string (str): Entry of the StavroX Peptide 1/2 column
        mod_dict (dict): dict mapping modification abbreviations to lists of [Modified AA, Modification name, Modification mass]

    Returns:
        list: name of the modification(s)
        list: position of the modification(s)
        list: mass(es) of the modification(s)
        str: sequence without modification(s)
    """

    # if there is no peptide, there are no modifications
    try:
        a_float = float(peptide_string)
        if np.isnan(a_float):
            return [], [], []
        else:
            raise Exception('Unusual peptide string recognised: {}'.format(a_float))
    except:
        modpos = []
        modmass= []
        mod = []

        sequence = ''
        for idx, char in enumerate(peptide_string):
            if char in mod_dict.keys():
                # avoid setting N- and C-terminal ends as modification
                mod.append(mod_dict[char][1])
                modmass.append(mod_dict[char][2])
                modpos.append(idx)
                char = mod_dict[char][0]
            if char in '[]{}':
                continue
            sequence += char

        return mod, modpos, modmass, sequence


def _parse_ssf(ssf_file):
    """
    Parses a StavroX properties.ssf file to extract modification identifiers
    and masses

    Args:
        ssf_file (str): path to the properties.ssf file from StavroX

    Returns:
        dict: Dict mapping modification identifiers a list of [modified AA, name of modification, modmass]
    """
    ssf_dict = {}
    current_list = None

    with open(ssf_file, 'r') as f:
        line = f.readline()
        while line:
            if 'ELEMENTS' in line:
                current_list = 'elements'
                ssf_dict[current_list] = []
            elif 'AMINOACIDS' in line:
                current_list = 'AAs'
                ssf_dict[current_list] = []
            elif 'VARMODIFICATION' in line:
                current_list = 'varmods'
                ssf_dict[current_list] = []
            elif 'STATMODIFICATION' in line:
                current_list = 'fixedmods'
                ssf_dict[current_list] = []
            elif 'END' in line:
                current_list = None
            elif current_list != None:
                ssf_dict[current_list].append(line.strip().split(';'))
            line = f.readline()

    elements2mass = dict((x, y) for x, y in ssf_dict['elements'])

    AA2formula = dict((y,z) for x,y,z in ssf_dict['AAs'])
    AA2name = dict((y,x) for x,y,z in ssf_dict['AAs'])
    
    mod_dict = dict()
    for x,y,z in ssf_dict['varmods']:
        mod_dict[y] = [x,]
    for x,y in ssf_dict['fixedmods']:
        mod_dict[y] = [x,]

    for symbol in mod_dict.keys():
        modformula = AA2formula[symbol]
        elements, stoichiometries = _elemental_composition(modformula, elements2mass)
        modmass = 0
        for element, amount in zip(elements, stoichiometries):
            modmass += int(amount) * float(elements2mass[element])
            
        unmodformula = AA2formula[mod_dict[symbol][0]]
        elements, stoichiometries = _elemental_composition(unmodformula, elements2mass)
        unmodmass = 0
        for element, amount in zip(elements, stoichiometries):
            unmodmass += int(amount) * float(elements2mass[element])
        
        mod_dict[symbol].append(AA2name[symbol])
        mod_dict[symbol].append(modmass-unmodmass)

#
#    mod2mass = {}
#    mod2name = {}
#
#    for AA in AA2formula:
#        if AA not in 'RHKDESTNQCUGPAVILMFYW':
#            formula = AA2formula[AA]
#            elements, stoichiometries = _elemental_composition(formula, elements2mass)
#            mass = 0
#            for element, amount in zip(elements, stoichiometries):
#                mass += int(amount) * float(elements2mass[element])
#
#            mod2mass[AA] = mass
#            mod2name[AA] = AA2name[AA]

    return mod_dict


def _elemental_composition(formula, elements2mass):
    """
    Compute two lists with elements and amount of atoms from
    a chemical cormula

    Args:
        formula (string): a chemical formula like C6H12N2O
        elements2mass (dict): a dict mapping elements to mass

    Returns:
        elements: a list of the elements in the formula
        stoichiometries: a list of the amounts of these elements
    """

    elements = []
    this_stoichiometrie = None
    stoichiometries = []
    collected_stoichiometry = True # set initial to true to avoid
    # setting element 1 to 1

    # formula is e.g. C6H12N2O
    for char in formula:
        # check if character is an element
        if char in elements2mass.keys():
            elements.append(char)
            # only enter if this_stoichiometrie has been filled
            if this_stoichiometrie:
                # if stoichiometrie is not given as 1
                stoichiometries.append(this_stoichiometrie)
                this_stoichiometrie = ''
            else:
                this_stoichiometrie = ''
                if collected_stoichiometry is False:
                    stoichiometries.append(1)
            collected_stoichiometry = False
        else:
            this_stoichiometrie += char
            collected_stoichiometry = True
    if collected_stoichiometry is False:
        stoichiometries.append(1)
    else:
        stoichiometries.append(this_stoichiometrie)

    return elements, stoichiometries

[docs]def Read(stavrox_files, ssf_file, col_order=None, compact=False):
    """
    Collect data from StavroX spectrum search and return an xtable data array.

    Args:
        stavrox_files: path or list of paths to StavroX output file(s)
        ssf_file: properties.ssf to load modification IDs and masses
        col_order (list): List of xTable column titles that are used to sort and compress the resulting datatable
        compact (bool): Whether to compact the xTable to only those columns listed in col_order
    Returns:
        pandas.DataFrame: xtable data table
    """

    # convert to list if the input is only a single path
    if not isinstance(stavrox_files, list):
        stavrox_files = [stavrox_files]

    allData = list()

    stavrox_dtypes = {'Scan number': str,
                      'Charge': 'int16',
                      'Protein 1 From': pd.Int64Dtype(),
                      'Protein 2 From': pd.Int64Dtype(),
                      'Protein 1': str,
                      'Protein 2': str,
                      'Peptide 1': str,
                      'Peptide 2': str,
                      'best linkage position peptide 1': str,
                      'best linkage position peptide 2': str,
                      'Score': float}

    for file in stavrox_files:

        print('Reading StavroX-file: {}'.format(file))

        try:
            with open(hf.compatible_path(file), 'r') as f:
                firstline = f.readline()

            headers = list()

            for element in firstline.split(';'):
                if element not in ['From', 'To']:
                    # there is a typo in the StavroX output files
                    if element == 'Peptide2':
                        headers.append('Peptide 2')
                        last_saved = 'Peptide 2'
                    else:
                        headers.append(element)
                        last_saved = element
                else:
                    headers.append(last_saved + ' ' + element)
            # the spectrum UUID column contains a semicolon delimiter! This messes up the whole csv reading
            headers.insert(-1, 'Spectrum UUID2')

            print(headers)

            # Reassign the column headers to avoid duplicate From and To fields
            s = pd.read_csv(hf.compatible_path(file),
                            delimiter=';',
                            header=0,
                            index_col = False,
                            dtype=stavrox_dtypes,
                            names = headers)

            allData.append(s)
        except:
            raise Exception('[StavroX Read] Failed opening file: {}'.format(file))

    xtable = pd.concat(allData)

    ### Process the data to comply to xTable format
    xtable = xtable.rename(columns={'Protein 1 From': 'pos1',
                                    'Protein 2 From': 'pos2',
                                    'Score': 'score'
                                    })

    # the field Scan number contains the mgf file header. Use Regex to extract
    # scan no
    xtable[['rawfile', 'scanno', 'prec_ch']] = xtable['Scan number'].str.extract(hf.regexDict['mgfTITLE'])

    print('[StavroX Read] Parsed MGF title')

    # calculate the type of line (i.e. mono, loop, intra or inter)
    xtable['type'] = np.vectorize(_type_from_proteins)(xtable['Protein 1'], xtable['Protein 2'])

    print('[StavroX Read] inferred type')

    # Set pos1 to 1 if Nterminal cross-link
    xtable['pos1'] = xtable['pos1'].replace(0, 1)
    xtable['pos2'] = xtable['pos2'].replace(0, 1)

    print('[StavroX Read] changed xlink positions')

    # remove for example preceding > in UniProt headers
    xtable['prot1'] = xtable['Protein 1'].apply(_clear_protname)
    xtable['prot2'] = xtable['Protein 2'].apply(_clear_protname)

    print('[StavroX Read] Cleared Protein names')

    # Best linkage position also contains the linked AA (that is already given
    # by sequence and link position)
    # --> xtract only the numerical part
    # StavroX treats the N-terminus as 0th position --> replace by 1
    xtable['xlink1'] = xtable['best linkage position peptide 1'].apply(_clear_xlink).replace(0, 1)
    xtable['xlink2'] = xtable['best linkage position peptide 2'].apply(_clear_xlink).replace(0, 1)

    print('[StavroX Read] Found xlink position')

    # calculate absolute position of xlink as sum of start of peptide
    # and relative position of the xlink
    xtable['xpos1'] = xtable['xlink1'].astype(int) + xtable['pos1'].astype(int) - 1

    # xpos2 has to be calculated separately for inter/intra, loop and mono-peptides
    xtable['xpos2'] =\
        np.vectorize(_calc_xpos2)(xtable['type'], xtable['pos1'], xtable['pos2'], xtable['xlink2'])

    print('[StavroX Read] Generated xpos')

    mod_dict = _parse_ssf(hf.compatible_path(ssf_file))

    print('[StavroX Read] parsed SSF')

    # Extract the modification mass and position from the peptide string
    xtable[['mod1', 'modpos1', 'modmass1', 'pepseq1', 'mod2', 'modpos2', 'modmass2', 'pepseq2']] =\
        pd.DataFrame(xtable[['Peptide 1', 'Peptide 2']].apply(\
            lambda row: _mods_and_sequences_from_peptides(row['Peptide 1'],
                                           row['Peptide 2'],
                                           mod_dict),
                    axis=1).tolist(), index=xtable.index)

    print('[StavroX Read] Extracted modifications and sequences')

    # generate an ID for every crosslink position within the protein(s)
    xtable['ID'] =\
        pd.Series(np.vectorize(hf.generate_id,
                               otypes=['object'])(xtable['type'],
                                                  xtable['prot1'],
                                                  xtable['xpos1'],
                                                  xtable['prot2'],
                                                  xtable['xpos2']),
                 index=xtable.index).replace('nan', np.nan)

    print('[StavroX Read] Generated ID')

    # Stavrox does not run on isotope-labeled xlinkers
    xtable['xtype'] = np.nan

    if len(xtable[xtable['type'] == 'inter']) > 0:
        # Reassign the type for inter xlink to inter/intra/homomultimeric
        onlyInter = xtable['type'] == 'inter'
        xtable.loc[onlyInter, 'type'] =\
            np.vectorize(hf.categorize_inter_peptides)(xtable[onlyInter]['prot1'],
                                                     xtable[onlyInter]['pos1'],
                                                     xtable[onlyInter]['pepseq1'],
                                                     xtable[onlyInter]['prot2'],
                                                     xtable[onlyInter]['pos2'],
                                                     xtable[onlyInter]['pepseq1'])
        print('[StavroX Read] categorized inter peptides')
    else:
        print('[StavroX Read] skipped inter peptide categorization')

    # StavroX already filters decoys
    xtable['decoy'] = False

    # reassign dtypes for every element in the df
    # errors ignore leaves the dtype as object for every
    # non-numeric element
    xtable = xtable.apply(pd.to_numeric, errors = 'ignore')

    xtable['search_engine'] = 'StavroX'

    xtable = hf.order_columns(xtable, col_order, compact)

    return xtable

if __name__ == '__main__':

    # defines the column headers required for xtable output
    col_order = [ 'rawfile', 'scanno', 'prec_ch',
                  'pepseq1', 'xlink1',
                  'pepseq2', 'xlink2', 'xtype',
                  'modmass1', 'modpos1', 'mod1',
                  'modmass2', 'modpos2', 'mod2',
                  'prot1', 'xpos1', 'prot2',
                  'xpos2',
                  'type', 'score',
                  'ID',
                  'pos1', 'pos2', 'decoy']

    stavrox_files = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\stavrox\20180615_KS_CL_9_pXtract.csv'

    ssf_file = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\stavrox\StavroxSettings.ssf'
    xtable = Read(stavrox_files, ssf_file, col_order=col_order, compact=False)