Source code for croco.pLink2

# -*- coding: utf-8 -*-
"""
Functions to read pLink2 files
"""

import pandas as pd
import os, sys
import re
import numpy as np

if __name__ == '__main__':
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf

def _plink2_peptide2pandas(filepath):
    """
    Read a pLink peptide results file and return a pandas dictionary

    Args:
        filepath (str): Path to a pLink results file e.g. filtered_cross-linked_peptides.csv

    Returns:
        pandas.DataFrame
    """

    with open(filepath, 'r') as fh:

        # read the first header-line into list
        headers1 = fh.readline().strip().split(',')
        # remove potential empty strings due to trailing comma in plink csv file
        headers1 = [x for x in headers1 if x != '']
        # read the second header-line and remove it from list
        # avoid the first entry as it is only the line-indicator
        headers2 = fh.readline().strip().split(',')[1:]

        data = {} # init of data dict for pandas
        for h in headers1 + headers2:
            data[h] = []

        # read the first line
        line = fh.readline()
        while line:

            # avoid reading in an empty line
            if line == '':
                continue

            if line[0].isdigit(): # indicates a headers1 line
                # save the line and use it when printing all following lines
                # corresponding to that title-line
                line1_data = line.strip().split(',')

            else:
                # the first element of line2_data is empty
                line2_data = line.strip().split(',')[1:]

                # raise Ecception if e.g. the protein name contains a comma
                if (len(line1_data) != len(headers1)) or (len(line2_data) != len(headers2)):
                    raise Exception('Opening {:s} element {:s}: Number of elements in line does not correspond to number of header elements!'.format(filepath, line1_data[0]))

                # once the second line is reached, all elements are appended
                for i in range(len(line1_data)):
                    # iterate through the data and link them to their resp
                    # header by index
                    data[headers1[i]].append(line1_data[i])
                for i in range(len(line2_data)):
                    data[headers2[i]].append(line2_data[i])

            # read the next line
            line=fh.readline()

        # remove empty keys from dict
        if '' in data:
            del data['']

        try:
            data = pd.DataFrame.from_dict(data)
            if len(data) > 0:
                return data
            else:
                raise Exception('Generated xtable had a length of 0!')
        except:
            raise Exception('Could not generate xtable. Please check file at: {}'.format(filepath))  
    
def _plink2_process_title(spec_string):
    """
    Extract rawfile name, precursor charge and scan no from pLink sequence
    string such as 20180518_JB_jb05a_u50.11998.11998.3.dta

    Args:
        spec_string: pLink2 spectrum string

    Returns:
        list: [rawfile, scanno, prec_ch]
    """
    # the pattern of the title string is 20171215_JB04_Sec06.10959.10959.2
    # in pLink 2.3 and 20171215_JB04_Sec06.10959.10959.2.0 in pLink 2.1
    mgfTITLE_pattern = re.compile(hf.regexDict['mgfTITLE'])
    if mgfTITLE_pattern.match(spec_string):
        match = mgfTITLE_pattern.match(spec_string)
        rawfile, scanno, prec_ch = match.groups()
        return str(rawfile), int(scanno), int(prec_ch)
    else:
        return np.nan
    
def _plink2_process_sequence(row):
    """
    Extract peptide sequences and cross-link positions from
    pLink sequence string e.g. YVPTAGKLTVVILEAK(7)-LTVVILEAK(2):1
    Can differentiate between mono, loop and cross-link information

    Args:
        row (object): a row of a dataframe with headers 'Peptide' and 'type'

    Returns:
        list: [pepseq1, pepseq2, xpos1, xpos2, xtype]
    """

    #TODO: How to recognize heavy labels?

    xtype = 0 # all cross-links are light

    if row['type'] == 'inter':
        pattern = re.compile(r'(\w+)\((\d+)\)-(\w*)\((\d*)\)')
        match = pattern.match(row['Peptide'])
        pepseq1, xlink1, pepseq2, xlink2 = match.groups()
        return str(pepseq1), int(xlink1), str(pepseq2), int(xlink2), xtype

    elif row['type'] == 'loop':
        pattern = re.compile(r'(\w+)\((\d+)\)\((\d*)\)')
        match = pattern.match(row['Peptide'])
        pepseq1, xlink1, xlink2 = match.groups()
        return str(pepseq1), int(xlink1), np.nan, int(xlink2), xtype

    elif row['type'] == 'mono':
        pattern = re.compile(r'(\w+)\((\d*)\)')
        match = pattern.match(row['Peptide'])
        pepseq1, xlink1 = match.groups()
        return str(pepseq1), int(xlink1), np.nan, np.nan, xtype

    else:
        return np.nan, np.nan, np.nan, np.nan, xtype
    
def _plink2_process_protname(row):
    """
    Extract protein name and absolute cross-link position from
    pLink protein string e.g.
    sp|P63045|VAMP2_RAT(79)-sp|P63045|VAMP2_RAT(59)/
    or (worst-case)
    Stx1A(1-262)(259)-Stx1A(1-262)(259)/
    
    Args:
        row (object): a row of a dataframe with headers 'Proteins'

    Returns:
        list: [prot1, xpos1, pepseq2, xpos2]

    """

    if row['type'] == 'inter':
        pattern = re.compile('(.+?)\((\d+)\)-(.+?)\((\d*)\)/')
        match = pattern.match(row['Proteins'])
        prot1, xpos1, prot2, xpos2 = match.groups()

        xpos1 = int(xpos1)
        xpos2 = int(xpos2)
        prot1 = str(prot1.strip())
        prot2 = str(prot2.strip())

    elif row['type'] == 'loop':
        pattern = re.compile(r'(.+?)\((\d+)\)\((\d*)\)/')
        match = pattern.match(row['Proteins'])
        prot1, xpos1, xpos2 = match.groups()
        xpos1 = int(xpos1)
        xpos2 = int(xpos2)
        prot1 = str(prot1.strip())
        prot2 = prot1

    elif row['type'] == 'mono':
        pattern = re.compile(r'(.+?)\((\d+)\)/')
        match = pattern.match(row['Proteins'])
        prot1, xpos1 = match.groups()
        xpos1 = int(xpos1)
        xpos2, prot2 = [np.nan] * 2
        prot1 = str(prot1.strip())

    else:
        prot1, xpos1, pepseq2, xpos2 = [np.nan] * 4

    return prot1, xpos1, prot2, xpos2
    
def _plink2_assign_type(plinkType):
    if plinkType == 'Cross-Linked':
        return 'inter'
    elif plinkType == 'Loop-Linked':
        return 'loop'
    elif plinkType == 'Mono-Linked':
        return 'mono'
    
def _calculate_abs_pos(row):
    """
    Return the absolute position of the first AA of both peptides.
    If only one peptide present (mono-link) return NaN for the second position
    
    Args:
        row (object): a row of a dataframe with headers xlink(1/2) and xpos(1/2)

    Returns:
        list: [pos1, pos2] 
    
    """

    pos1 = int(row['xpos1']) - int(row['xlink1']) + 1

    if np.isnan(float(row['xpos2'])):
        pos2 = np.nan
    else:
        pos2 = int(row['xpos2']) - int(row['xlink2']) + 1

    return pos1, pos2
    
def _plink2_read_modifications(filepath):
    """
    Open a pLink modification.ini file and extract all modifications with
    their names as dict.

    Args:
        filepath (str): Path to modifications.ini

    Returns:
        dict: mod_dict mapping pLink modification names to masses
    """

    pattern = re.compile(r'^(.*)=\w+ \w+ (-?[0-9]\d*\.\d+)? -?[0-9]\d*\.\d+')
    mod_dict = {}

    with open(filepath, 'r') as f:
        for line in f:
            if pattern.match(line):
                match = pattern.match(line)
                name, mass = match.groups()
                mod_dict[name] = mass

    return mod_dict



[docs]def Read(plinkdirs, col_order=None, compact=False):
    """
    Read pLink2 report dir and return an xtabel data array.

    Args:
        plinkdirs: plink2 reports subdir (reports)
        col_order (list): List of xTable column titles that are used to sort and compress the resulting datatable
        compact (bool): Whether to compact the xTable to only those columns listed in col_order

    Returns:
        pandas.DataFrame: data table
    """
    print('[pLink2 Read] This is pLink2 Reader')


    # convert to list if the input is only a single path
    if not isinstance(plinkdirs, list):
        plinkdirs = [plinkdirs]
    
    allData = list()

    plink_dtypes = {'Title': str,
                    'Peptide_Type': str,
                    'Peptide': str,
                    'Proteins': str,
                    'Score': float,
                    'Linker': str,
                    'Peptide_Order': int,
                    'Spectrum_Order': int}

    for file in plinkdirs:

        ### Collect data, convert to pandas format and merge
        plinkResultFiles = os.listdir(hf.compatible_path(file))
    
        frames = []
    
        foundPeptidesFile = False
        foundSpectraFile = False
        for xTypeStr in ['filtered_cross-linked', 'filtered_loop-linked', 'filtered_mono-linked']:
            dataFiles = [x for x in plinkResultFiles if xTypeStr in x]
            for f in dataFiles:
                if '_peptides.csv' in f:
                    peptidesFile = f
                    foundPeptidesFile = True
                    
                    print('Reading pLink peptide file: ' + peptidesFile)
                    peptide_df = _plink2_peptide2pandas(hf.compatible_path(os.path.join(file, peptidesFile)))
                    
                if '_spectra.csv' in f:
                    spectraFile = f
                    foundSpectraFile = True               
     
                    print('Reading pLink spectra file: ' + spectraFile)
                    spectra_df = pd.read_csv(hf.compatible_path(os.path.join(file, spectraFile)))
    
            if foundPeptidesFile and foundSpectraFile:
                merge_df = pd.merge(peptide_df[['Title', 'Spectrum_Order', 'Peptide_Order']],
                                    spectra_df,
                                    on='Title')
            else:
                if foundPeptidesFile:
                    raise Exception('[pLink2 Read] Could not find spectra file.')
                elif foundSpectraFile:
                    raise Exception('[pLink2 Read] Could not find peptide file')
                else:
                    raise Exception('[pLink2 Read] Couldnt find a pLink file. Did you provide the right path?')
                
            frames.append(merge_df)
    
        s = pd.concat(frames)
    
        allData.append(s)

    # establish a read-csv like behaviour of dtype argument for astype
    # astype does not accept if there are more columns supplied than found in
    # the data
    xtable = pd.concat(allData).astype(dtype=plink_dtypes)
    ### Convert data inside pandas df

    # split title column into three
    xtable[['rawfile', 'scanno', 'prec_ch']] =\
        pd.DataFrame(xtable['Title'].apply(_plink2_process_title).tolist(), index=xtable.index)

    # assign the type
    xtable['type'] = xtable['Peptide_Type'].apply(_plink2_assign_type)

    # Directly assign the re group matches into new columns
    xtable[['pepseq1', 'xlink1', 'pepseq2', 'xlink2', 'xtype']] =\
        pd.DataFrame(xtable.apply(_plink2_process_sequence, axis=1).tolist(), index=xtable.index)

    xtable[['prot1', 'xpos1', 'prot2', 'xpos2']] =\
        pd.DataFrame(xtable.apply(_plink2_process_protname, axis=1).tolist(), index=xtable.index)

    xtable['score'] = xtable['Score']
    
    xtable['xlinker'] = xtable['Linker']

    # generate an ID for every crosslink position within the protein(s)
    xtable['ID'] =\
        pd.Series(np.vectorize(hf.generate_id,
                               otypes=['object'])(xtable['type'],
                                                  xtable['prot1'],
                                                  xtable['xpos1'],
                                                  xtable['prot2'],
                                                  xtable['xpos2']),
                 index=xtable.index).replace('nan', np.nan)

    # calculate absolute position of first AA of peptide
    xtable[['pos1', 'pos2']] =\
        pd.DataFrame(xtable.apply(_calculate_abs_pos, axis=1).tolist(), index=xtable.index)

    # add a label referring to the ordering in the pLink results table
    xtable['Order'] = xtable[['Peptide_Order', 'Spectrum_Order']].astype(str).apply(lambda x: ','.join(x), axis=1)

    # set the sequence of loop links to be the same as the corresponding pepseq1
    xtable.loc[xtable['type'] == 'loop', 'pepseq2'] = xtable[xtable['type'] == 'loop']['pepseq1']

    # manually set decoy to reverse as pLink hat its own internal target-decoy
    # algorithm
    xtable['decoy'] = False

    if len(xtable[xtable['type'] == 'inter']) > 0:
        # Reassign the type for intra and inter xlink to inter/intra/homomultimeric
        intraAndInter = (xtable['type'] == 'inter') | (xtable['type'] == 'intra')
        xtable.loc[intraAndInter, 'type'] =\
            np.vectorize(hf.categorize_inter_peptides)(xtable[intraAndInter]['prot1'],
                                                     xtable[intraAndInter]['pos1'],
                                                     xtable[intraAndInter]['pepseq1'],
                                                     xtable[intraAndInter]['prot2'],
                                                     xtable[intraAndInter]['pos2'],
                                                     xtable[intraAndInter]['pepseq2'])
        print('[pLink2 Read] categorized inter peptides')
    else:
        print('[pLink2 Read] skipped inter peptide categorization')

    ## generate the mod_dict linking pLink modification names to masses
    # in case of calling croco from the source folder structure...
    file_dir, file_name = os.path.split(__file__)
    print(file_dir, file_name)
    if os.path.exists(os.path.join(file_dir,
                                   './data/modification.ini')):
        modifi_dir = os.path.abspath(os.path.join(file_dir,
                                                  './data/modification.ini'))
    # ... or calling from within a single bundled exe-file
    else:
        try:
            # PyInstaller creates a temp folder and stores its path in _MEIPASS
            base_path = sys._MEIPASS
            modifi_dir =  os.path.abspath(\
                os.path.join(base_path, './data/modification.ini'))
        # ... or something went wrong
        except:
            raise Exception('Modifications.ini not found')

    # load pLink modifications.ini from data-folder
    mod_dict = _plink2_read_modifications(os.path.abspath(modifi_dir))

    # extract modification information
    pattern = re.compile(r'(.*)\((\d+)\)')

    pepseq1 = xtable['pepseq1'].tolist()
    Modifications = xtable['Modifications'].tolist()

    if len(pepseq1) == len(Modifications):
        print('[pLink2 Read] Len of pepseq1 and Modification match!')
    else:
        raise Exception('[pLink2 Read] Len of pepseq1 and Modification dont match!')

    modmass1 = []
    mod1 = []
    modpos1 = []
    modmass2 = []
    mod2 = []
    modpos2 = []

    # iterate over all lines in the input file
    for idx, modstr in enumerate(Modifications):

        this_modmass1 = []
        this_mod1 = []
        this_modmass2 = []
        this_mod2 = []
        this_modpos1 = []
        this_modpos2 = []

        # unmodified peptides
        if hf.isnan(modstr):
            this_modmass1 = ''
            this_mod1 = ''
            this_modpos1 = ''
            
            this_modmass2 = ''
            this_mod2 = ''
            this_modpos2 = ''
        else:
            # Extract annotations from every item in the modstring
            for mod in modstr.split(';'):
    
                if pattern.match(mod):
                    match = pattern.match(mod)
                    mod, modpos = match.groups()
    
                    # transform modification names to masses
                    try:
                        mass = mod_dict[mod]
                    except:
                        # use the input string if no subsitution found
                        mass = mod
    
                    seqlen1 = len(pepseq1[idx])
                    # pLink assigns additional modification position to the C-term
                    # of the first peptide, the xlinker and the N-term of the
                    # second peptide
                    if int(modpos) > (seqlen1 + 3):
                        this_mod2.append(mod)
                        this_modpos2.append(int(modpos) - (seqlen1 + 3))
                        this_modmass2.append(mass)
                    # C-term of first peptide
                    elif int(modpos) == (seqlen1 + 1):
                        this_mod2.append(mod)
                        this_modpos2.append(int(modpos) - 1)
                        this_modmass2.append(mass)
                    # cannot assign modifications to xlinker in xTable
                    elif int(modpos) == (seqlen1 + 2):
                        pass
                    # Modification on N-term of second peptide
                    elif int(modpos) == (seqlen1 + 3):
                        this_mod2.append(mod)
                        this_modpos2.append(1)
                        this_modmass2.append(mass)
                    else:
                        this_mod1.append(mod)
                        this_modpos1.append(modpos)
                        this_modmass1.append(mass)

        # multiple modifications of one peptide are stored as lists
        modmass1.append(this_modmass1)
        mod1.append(this_mod1)
        modpos1.append(this_modpos1)
        modmass2.append(this_modmass2)
        mod2.append(this_mod2)
        modpos2.append(this_modpos2)

    xtable['modmass1'] = modmass1
    xtable['mod1'] = mod1
    xtable['modpos1'] = modpos1
    xtable['modmass2'] = modmass2
    xtable['mod2'] = mod2
    xtable['modpos2'] = modpos2

    xtable['search_engine'] = 'pLink2'

    xtable = hf.order_columns(xtable, col_order, compact)

    ### return xtable df
    return xtable

if __name__ == '__main__':
    """
    For testing purposes only
    """

    col_order = [ 'rawfile', 'scanno', 'prec_ch',
                  'pepseq1', 'xlink1',
                  'pepseq2', 'xlink2', 'xtype',
                  'modmass1', 'modpos1', 'mod1',
                  'modmass2', 'modpos2', 'mod2',
                  'prot1', 'xpos1', 'prot2',
                  'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy']

    xtable = Read(r'C:\Users\User\Documents\02_experiments\10_p38_ag_behrens\20190415_p38_bs2g_insolution01\20190415_p38_bs2g_insolution', col_order=col_order)