Source code for croco.pLink2

# -*- coding: utf-8 -*-
"""
Functions to read pLink2 files
"""

import pandas as pd
import os, sys
import re
import numpy as np

if __name__ == '__main__':
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf

def _plink2_peptide2pandas(filepath):
    """
    Read a pLink peptide results file and return a pandas dictionary

    Args:
        filepath (str): Path to a pLink results file e.g. filtered_cross-linked_peptides.csv

    Returns:
        pandas.DataFrame
    """

    with open(filepath, 'r') as fh:

        # read the first header-line into list
        headers1 = fh.readline().strip().split(',')
        # remove potential empty strings due to trailing comma in plink csv file
        headers1 = [x for x in headers1 if x != '']
        # read the second header-line and remove it from list
        # avoid the first entry as it is only the line-indicator
        headers2 = fh.readline().strip().split(',')[1:]

        data = {} # init of data dict for pandas
        for h in headers1 + headers2:
            data[h] = []

        # read the first line
        line = fh.readline()
        while line:

            # avoid reading in an empty line
            if line == '':
                continue

            if line[0].isdigit(): # indicates a headers1 line
                # save the line and use it when printing all following lines
                # corresponding to that title-line
                line1_data = line.strip().split(',')

            else:
                # the first element of line2_data is empty
                line2_data = line.strip().split(',')[1:]

                # raise Ecception if e.g. the protein name contains a comma
                if (len(line1_data) != len(headers1)) or (len(line2_data) != len(headers2)):
                    raise Exception('Opening {:s} element {:s}: Number of elements in line does not correspond to number of header elements!'.format(filepath, line1_data[0]))

                # once the second line is reached, all elements are appended
                for i in range(len(line1_data)):
                    # iterate through the data and link them to their resp
                    # header by index
                    data[headers1[i]].append(line1_data[i])
                for i in range(len(line2_data)):
                    data[headers2[i]].append(line2_data[i])

            # read the next line
            line=fh.readline()

        # remove empty keys from dict
        if '' in data:
            del data['']

        try:
            data = pd.DataFrame.from_dict(data)
            if len(data) > 0:
                return data
            else:
                raise Exception('Generated xtable had a length of 0!')
        except:
            raise Exception('Could not generate xtable. Please check file at: {}'.format(filepath))  
    
def _plink2_process_title(spec_string):
    """
    Extract rawfile name, precursor charge and scan no from pLink sequence
    string such as 20180518_JB_jb05a_u50.11998.11998.3.dta

    Args:
        spec_string: pLink2 spectrum string

    Returns:
        list: [rawfile, scanno, prec_ch]
    """
    # the pattern of the title string is 20171215_JB04_Sec06.10959.10959.2
    # in pLink 2.3 and 20171215_JB04_Sec06.10959.10959.2.0 in pLink 2.1
    mgfTITLE_pattern = re.compile(hf.regexDict['mgfTITLE'])
    if mgfTITLE_pattern.match(spec_string):
        match = mgfTITLE_pattern.match(spec_string)
        rawfile, scanno, prec_ch = match.groups()
        return str(rawfile), int(scanno), int(prec_ch)
    else:
        return np.nan
    
def _plink2_process_sequence(row):
    """
    Extract peptide sequences and cross-link positions from
    pLink sequence string e.g. YVPTAGKLTVVILEAK(7)-LTVVILEAK(2):1
    Can differentiate between mono, loop and cross-link information

    Args:
        row (object): a row of a dataframe with headers 'Peptide' and 'type'

    Returns:
        list: [pepseq1, pepseq2, xpos1, xpos2, xtype]
    """

    #TODO: How to recognize heavy labels?

    xtype = 0 # all cross-links are light

    if row['type'] == 'inter':
        pattern = re.compile(r'(\w+)\((\d+)\)-(\w*)\((\d*)\)')
        match = pattern.match(row['Peptide'])
        pepseq1, xlink1, pepseq2, xlink2 = match.groups()
        return str(pepseq1), int(xlink1), str(pepseq2), int(xlink2), xtype

    elif row['type'] == 'loop':
        pattern = re.compile(r'(\w+)\((\d+)\)\((\d*)\)')
        match = pattern.match(row['Peptide'])
        pepseq1, xlink1, xlink2 = match.groups()
        return str(pepseq1), int(xlink1), np.nan, int(xlink2), xtype

    elif row['type'] == 'mono':
        pattern = re.compile(r'(\w+)\((\d*)\)')
        match = pattern.match(row['Peptide'])
        pepseq1, xlink1 = match.groups()
        return str(pepseq1), int(xlink1), np.nan, np.nan, xtype

    else:
        return np.nan, np.nan, np.nan, np.nan, xtype
    
def _plink2_process_protname(row):
    """
    Extract protein name and absolute cross-link position from
    pLink protein string e.g.
    sp|P63045|VAMP2_RAT(79)-sp|P63045|VAMP2_RAT(59)/
    or (worst-case)
    Stx1A(1-262)(259)-Stx1A(1-262)(259)/
    
    Args:
        row (object): a row of a dataframe with headers 'Proteins'

    Returns:
        list: [prot1, xpos1, pepseq2, xpos2]

    """

    if row['type'] == 'inter':
        pattern = re.compile('(.+?)\((\d+)\)-(.+?)\((\d*)\)/')
        match = pattern.match(row['Proteins'])
        prot1, xpos1, prot2, xpos2 = match.groups()

        xpos1 = int(xpos1)
        xpos2 = int(xpos2)
        prot1 = str(prot1.strip())
        prot2 = str(prot2.strip())

    elif row['type'] == 'loop':
        pattern = re.compile(r'(.+?)\((\d+)\)\((\d*)\)/')
        match = pattern.match(row['Proteins'])
        prot1, xpos1, xpos2 = match.groups()
        xpos1 = int(xpos1)
        xpos2 = int(xpos2)
        prot1 = str(prot1.strip())
        prot2 = prot1

    elif row['type'] == 'mono':
        pattern = re.compile(r'(.+?)\((\d+)\)/')
        match = pattern.match(row['Proteins'])
        prot1, xpos1 = match.groups()
        xpos1 = int(xpos1)
        xpos2, prot2 = [np.nan] * 2
        prot1 = str(prot1.strip())

    else:
        prot1, xpos1, pepseq2, xpos2 = [np.nan] * 4

    return prot1, xpos1, prot2, xpos2
    
def _plink2_assign_type(plinkType):
    if plinkType == 'Cross-Linked':
        return 'inter'
    elif plinkType == 'Loop-Linked':
        return 'loop'
    elif plinkType == 'Mono-Linked':
        return 'mono'
    
def _calculate_abs_pos(row):
    """
    Return the absolute position of the first AA of both peptides.
    If only one peptide present (mono-link) return NaN for the second position
    
    Args:
        row (object): a row of a dataframe with headers xlink(1/2) and xpos(1/2)

    Returns:
        list: [pos1, pos2] 
    
    """

    pos1 = int(row['xpos1']) - int(row['xlink1']) + 1

    if np.isnan(float(row['xpos2'])):
        pos2 = np.nan
    else:
        pos2 = int(row['xpos2']) - int(row['xlink2']) + 1

    return pos1, pos2
    
def _plink2_read_modifications(filepath):
    """
    Open a pLink modification.ini file and extract all modifications with
    their names as dict.

    Args:
        filepath (str): Path to modifications.ini

    Returns:
        dict: mod_dict mapping pLink modification names to masses
    """

    pattern = re.compile(r'^(.*)=\w+ \w+ (-?[0-9]\d*\.\d+)? -?[0-9]\d*\.\d+')
    mod_dict = {}

    with open(filepath, 'r') as f:
        for line in f:
            if pattern.match(line):
                match = pattern.match(line)
                name, mass = match.groups()
                mod_dict[name] = mass

    return mod_dict



[docs]def Read(plinkdirs, col_order=None, compact=False): """ Read pLink2 report dir and return an xtabel data array. Args: plinkdirs: plink2 reports subdir (reports) col_order (list): List of xTable column titles that are used to sort and compress the resulting datatable compact (bool): Whether to compact the xTable to only those columns listed in col_order Returns: pandas.DataFrame: data table """ print('[pLink2 Read] This is pLink2 Reader') # convert to list if the input is only a single path if not isinstance(plinkdirs, list): plinkdirs = [plinkdirs] allData = list() plink_dtypes = {'Title': str, 'Peptide_Type': str, 'Peptide': str, 'Proteins': str, 'Score': float, 'Linker': str, 'Peptide_Order': int, 'Spectrum_Order': int} for file in plinkdirs: ### Collect data, convert to pandas format and merge plinkResultFiles = os.listdir(hf.compatible_path(file)) frames = [] foundPeptidesFile = False foundSpectraFile = False for xTypeStr in ['filtered_cross-linked', 'filtered_loop-linked', 'filtered_mono-linked']: dataFiles = [x for x in plinkResultFiles if xTypeStr in x] for f in dataFiles: if '_peptides.csv' in f: peptidesFile = f foundPeptidesFile = True print('Reading pLink peptide file: ' + peptidesFile) peptide_df = _plink2_peptide2pandas(hf.compatible_path(os.path.join(file, peptidesFile))) if '_spectra.csv' in f: spectraFile = f foundSpectraFile = True print('Reading pLink spectra file: ' + spectraFile) spectra_df = pd.read_csv(hf.compatible_path(os.path.join(file, spectraFile))) if foundPeptidesFile and foundSpectraFile: merge_df = pd.merge(peptide_df[['Title', 'Spectrum_Order', 'Peptide_Order']], spectra_df, on='Title') else: if foundPeptidesFile: raise Exception('[pLink2 Read] Could not find spectra file.') elif foundSpectraFile: raise Exception('[pLink2 Read] Could not find peptide file') else: raise Exception('[pLink2 Read] Couldnt find a pLink file. Did you provide the right path?') frames.append(merge_df) s = pd.concat(frames) allData.append(s) # establish a read-csv like behaviour of dtype argument for astype # astype does not accept if there are more columns supplied than found in # the data xtable = pd.concat(allData).astype(dtype=plink_dtypes) ### Convert data inside pandas df # split title column into three xtable[['rawfile', 'scanno', 'prec_ch']] =\ pd.DataFrame(xtable['Title'].apply(_plink2_process_title).tolist(), index=xtable.index) # assign the type xtable['type'] = xtable['Peptide_Type'].apply(_plink2_assign_type) # Directly assign the re group matches into new columns xtable[['pepseq1', 'xlink1', 'pepseq2', 'xlink2', 'xtype']] =\ pd.DataFrame(xtable.apply(_plink2_process_sequence, axis=1).tolist(), index=xtable.index) xtable[['prot1', 'xpos1', 'prot2', 'xpos2']] =\ pd.DataFrame(xtable.apply(_plink2_process_protname, axis=1).tolist(), index=xtable.index) xtable['score'] = xtable['Score'] xtable['xlinker'] = xtable['Linker'] # generate an ID for every crosslink position within the protein(s) xtable['ID'] =\ pd.Series(np.vectorize(hf.generate_id, otypes=['object'])(xtable['type'], xtable['prot1'], xtable['xpos1'], xtable['prot2'], xtable['xpos2']), index=xtable.index).replace('nan', np.nan) # calculate absolute position of first AA of peptide xtable[['pos1', 'pos2']] =\ pd.DataFrame(xtable.apply(_calculate_abs_pos, axis=1).tolist(), index=xtable.index) # add a label referring to the ordering in the pLink results table xtable['Order'] = xtable[['Peptide_Order', 'Spectrum_Order']].astype(str).apply(lambda x: ','.join(x), axis=1) # set the sequence of loop links to be the same as the corresponding pepseq1 xtable.loc[xtable['type'] == 'loop', 'pepseq2'] = xtable[xtable['type'] == 'loop']['pepseq1'] # manually set decoy to reverse as pLink hat its own internal target-decoy # algorithm xtable['decoy'] = False if len(xtable[xtable['type'] == 'inter']) > 0: # Reassign the type for intra and inter xlink to inter/intra/homomultimeric intraAndInter = (xtable['type'] == 'inter') | (xtable['type'] == 'intra') xtable.loc[intraAndInter, 'type'] =\ np.vectorize(hf.categorize_inter_peptides)(xtable[intraAndInter]['prot1'], xtable[intraAndInter]['pos1'], xtable[intraAndInter]['pepseq1'], xtable[intraAndInter]['prot2'], xtable[intraAndInter]['pos2'], xtable[intraAndInter]['pepseq2']) print('[pLink2 Read] categorized inter peptides') else: print('[pLink2 Read] skipped inter peptide categorization') ## generate the mod_dict linking pLink modification names to masses # in case of calling croco from the source folder structure... file_dir, file_name = os.path.split(__file__) print(file_dir, file_name) if os.path.exists(os.path.join(file_dir, './data/modification.ini')): modifi_dir = os.path.abspath(os.path.join(file_dir, './data/modification.ini')) # ... or calling from within a single bundled exe-file else: try: # PyInstaller creates a temp folder and stores its path in _MEIPASS base_path = sys._MEIPASS modifi_dir = os.path.abspath(\ os.path.join(base_path, './data/modification.ini')) # ... or something went wrong except: raise Exception('Modifications.ini not found') # load pLink modifications.ini from data-folder mod_dict = _plink2_read_modifications(os.path.abspath(modifi_dir)) # extract modification information pattern = re.compile(r'(.*)\((\d+)\)') pepseq1 = xtable['pepseq1'].tolist() Modifications = xtable['Modifications'].tolist() if len(pepseq1) == len(Modifications): print('[pLink2 Read] Len of pepseq1 and Modification match!') else: raise Exception('[pLink2 Read] Len of pepseq1 and Modification dont match!') modmass1 = [] mod1 = [] modpos1 = [] modmass2 = [] mod2 = [] modpos2 = [] # iterate over all lines in the input file for idx, modstr in enumerate(Modifications): this_modmass1 = [] this_mod1 = [] this_modmass2 = [] this_mod2 = [] this_modpos1 = [] this_modpos2 = [] # unmodified peptides if hf.isnan(modstr): this_modmass1 = '' this_mod1 = '' this_modpos1 = '' this_modmass2 = '' this_mod2 = '' this_modpos2 = '' else: # Extract annotations from every item in the modstring for mod in modstr.split(';'): if pattern.match(mod): match = pattern.match(mod) mod, modpos = match.groups() # transform modification names to masses try: mass = mod_dict[mod] except: # use the input string if no subsitution found mass = mod seqlen1 = len(pepseq1[idx]) # pLink assigns additional modification position to the C-term # of the first peptide, the xlinker and the N-term of the # second peptide if int(modpos) > (seqlen1 + 3): this_mod2.append(mod) this_modpos2.append(int(modpos) - (seqlen1 + 3)) this_modmass2.append(mass) # C-term of first peptide elif int(modpos) == (seqlen1 + 1): this_mod2.append(mod) this_modpos2.append(int(modpos) - 1) this_modmass2.append(mass) # cannot assign modifications to xlinker in xTable elif int(modpos) == (seqlen1 + 2): pass # Modification on N-term of second peptide elif int(modpos) == (seqlen1 + 3): this_mod2.append(mod) this_modpos2.append(1) this_modmass2.append(mass) else: this_mod1.append(mod) this_modpos1.append(modpos) this_modmass1.append(mass) # multiple modifications of one peptide are stored as lists modmass1.append(this_modmass1) mod1.append(this_mod1) modpos1.append(this_modpos1) modmass2.append(this_modmass2) mod2.append(this_mod2) modpos2.append(this_modpos2) xtable['modmass1'] = modmass1 xtable['mod1'] = mod1 xtable['modpos1'] = modpos1 xtable['modmass2'] = modmass2 xtable['mod2'] = mod2 xtable['modpos2'] = modpos2 xtable['search_engine'] = 'pLink2' xtable = hf.order_columns(xtable, col_order, compact) ### return xtable df return xtable
if __name__ == '__main__': """ For testing purposes only """ col_order = [ 'rawfile', 'scanno', 'prec_ch', 'pepseq1', 'xlink1', 'pepseq2', 'xlink2', 'xtype', 'modmass1', 'modpos1', 'mod1', 'modmass2', 'modpos2', 'mod2', 'prot1', 'xpos1', 'prot2', 'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy'] xtable = Read(r'C:\Users\User\Documents\02_experiments\10_p38_ag_behrens\20190415_p38_bs2g_insolution01\20190415_p38_bs2g_insolution', col_order=col_order)