Source code for croco.pLink1

# -*- coding: utf-8 -*-

"""
Functions to read pLink1 data.

"""

import numpy as np
import pandas as pd

import os
import sys
import re

if __name__ == '__main__':
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf

def _plink_protein2pandas(filepath):
    """
    Read a pLink protein results file and return a pandas dictionary.

    Args:
        filepath (str): Path to a pLink results file e.g. _inter_combine.protein.xls

    Returns:
        pandas.DataFrame
    """
    with open(filepath, 'r') as fh:

        data = {} # init of data dict for pandas

        # initialise the protein_header line
        header1 = ''
        # initialise the peptide header-line
        header2 = ''

        # protein level header starts with Order tag
        header1_exp = re.compile(r'^Order\s.*')
        # protein level entry starts with the order entry
        entry1_exp = re.compile(r'^\d+\s.*')
        # peptide level header starts with tab
        header2_exp = re.compile(r'^\s+Order.*')
        # peptide entry starts with whitespace foloowed by order entry
        entry2_exp = re.compile(r'^\s+\d+.*')

        for line in fh.readlines():

            # beginning of a protein-block
            if header1_exp.match(line):
                # only read the header-line on its first occurence
                if header1 == '':
                    # Get header names
                    header1 = line.strip().split('\t')
                    # set the column names for the dict
                    for h in header1:
                        data[h] = []

            # body of a protein-block
            elif entry1_exp.match(line):
                # Read and store header1 data to write with every case of header2
                entry1_data = line.strip().split('\t')

            # beginning of a peptide block
            elif header2_exp.match(line):
                if header2 == '':
                    header2 = line.strip().split('\t')
                    header2 = [x if x != 'Order' else 'Order2' for x in header2 ]
                    for h in header2:
                        data[h] = []

            # beginning of a peptide entry
            elif entry2_exp.match(line):
                entry2_data = line.strip().split('\t')
                # add the corresponding level1 entries
                for idx, d in enumerate(entry1_data):
                    data[header1[idx]].append(d)
                for idx, d in enumerate(entry2_data):
                    data[header2[idx]].append(d)
                    
        try:
            data = pd.DataFrame.from_dict(data)
            if len(data) > 0:
                return data
            else:
                raise Exception('Generated xtable had a length of 0!')
        except:
            raise Exception('Could not generat xtable. Please check file at: {}'.format(filepath))
    
def _read_plink_modifications(filepath):
    """
    Open a pLink modification.ini file and extract all modifications with
    their names as dict.
    
    Args:
        filepath (str): Path to modifications.ini
    
    Returns
        dict: mod_dict mapping pLink modification names to masses
    """
    
    pattern = re.compile(r'^(.*)=\w+ \w+ (-?[0-9]\d*\.\d+)? -?[0-9]\d*\.\d+')
    mod_dict = {}
    
    with open(filepath, 'r') as f:
        for line in f:
            if pattern.match(line):
                match = pattern.match(line)
                name, mass = match.groups()
                mod_dict[name] = mass

    return mod_dict
    
def _process_plink_sequence(seq_string):
    """
    Extract peptide sequences and cross-link positions from
    pLink sequence string e.g. YVPTAGKLTVVILEAK(7)-LTVVILEAK(2):1

    Args:
        seq_string (str): pLink sequence string
    Returns:
        list or np.nan: [pepseq1, pepseq2, xpos1, xpos2, xtype]
    """
    pattern = re.compile('(\w+)\((\d+)\)-(\w+)\((\d+)\):(\d+)')
    try:
        match = pattern.match(seq_string)
        pepseq1, xpos1, pepseq2, xpos2, xtype = match.groups()
        return str(pepseq1), int(xpos1), str(pepseq2), int(xpos2), xtype

    except Exception as e:
        print(e)
        return np.nan

def _process_plink_spectrum(spec_string):
    """
    Extract rawfile name, precursor charge and scan no from pLink spectrum
    string such as 2017_08_04_SVs_BS3_16.17079.17079.4.dta

    Args:
        spec_string: pLink spectrum string

    Returns:
        list or np.nan: [rawfile, scanno, prec_ch]
    """
    # the pattern of the title string is 20171215_JB04_Sec06.10959.10959.2
    # in pLink 2.3 and 20171215_JB04_Sec06.10959.10959.2.0 in pLink 2.1
    pextract_pattern = re.compile('(.+?)\.\d+\.(\d+)\.(\d+)\.*\d*')
    if pextract_pattern.match(spec_string):
        match = pextract_pattern.match(spec_string)
        rawfile, scanno, prec_ch = match.groups()
        return str(rawfile), int(scanno), int(prec_ch)
    else:
        return np.nan
    
def _process_plink_proteins(prot_string):
    """
    Extract protein name and absolute cross-link position from
    pLink protein string e.g.
    sp|P63045|VAMP2_RAT(79)-sp|P63045|VAMP2_RAT(59)
    
    Args:
        prot_string: pLink protein string
    
    Returns:
        list or np.nan: [prot1, xpos1, prot2, xpos2]
    """
    pattern = re.compile('(.+?)\((\d+)\)-?([^\(]*)\(?(\d*)\)?')
    match = pattern.match(prot_string)
    prot1, xpos1, prot2, xpos2 = match.groups()
    return str(prot1), int(xpos1), str(prot2), int(xpos2)


[docs]def Read(plinkdirs, col_order=None, compact=False): """ Read pLink report dir and return an xtabel data array. Args: plinkdirs (list): plink report subdir (e.g. sample1) col_order (list) – List of xTable column titles that are used to sort and compress the resulting datatable compact (bool): Compact the xTable to only the columns given in col_order or not Returns: pandas.DataFrame: xTable data table """ print('[pLink1 Read] This is pLink1 Reader') # convert to list if the input is only a single path if not isinstance(plinkdirs, list): plinkdirs = [plinkdirs] allData = list() plink_dtypes = {'Spectrum': str, 'Sequence': str, 'Proteins': str, 'type': str, 'Score': float, 'Order': int, 'Order2': int} for file in plinkdirs: # Initialise file names as None to use implicit booleaness inter_file = None loop_file = None mono_file = None for e in os.listdir(hf.compatible_path(file)): if '_inter_combine.protein.xls' in e: inter_file = e if '_loop_combine.protein.xls' in e: loop_file = e if '_mono_combine.protein.xls' in e: mono_file = e frames = [] # only called if inter_file is not None if inter_file: print('Reading pLink inter-file: ' + inter_file) inter_df = _plink_protein2pandas(hf.compatible_path(os.path.join(file, inter_file))) inter_df['type'] = 'inter' frames.append(inter_df) if loop_file: print('Reading pLink loop-file: ' + loop_file) loop_df = _plink_protein2pandas(hf.compatible_path(os.path.join(file, loop_file))) loop_df['type'] = 'loop' frames.append(loop_df) if mono_file: print('Reading pLink mono-file: ' + mono_file) mono_df = _plink_protein2pandas(hf.compatible_path(os.path.join(file, mono_file))) mono_df['type'] = 'mono' frames.append(mono_df) s = pd.concat(frames) allData.append(s) xtable = pd.concat(allData).astype(dtype=plink_dtypes) ### Convert data inside pandas df # rawfile, scanno, prec_ch xtable[['rawfile', 'scanno', 'prec_ch']] =\ pd.DataFrame(xtable['Spectrum'].apply(_process_plink_spectrum).tolist(), index=xtable.index) # Directly assign the re group matches into new columns xtable[['pepseq1', 'xlink1', 'pepseq2', 'xlink2', 'xtype']] =\ pd.DataFrame(xtable['Sequence'].apply(_process_plink_sequence).tolist(), index=xtable.index) xtable[['prot1', 'xpos1', 'prot2', 'xpos2']] =\ pd.DataFrame(xtable['Proteins'].apply(_process_plink_proteins).tolist(), index=xtable.index) xtable['score'] = xtable['Score'] # generate an ID for every crosslink position within the protein(s) xtable['ID'] =\ pd.Series(np.vectorize(hf.generate_id, otypes=['object'])(xtable['type'], xtable['prot1'], xtable['xpos1'], xtable['prot2'], xtable['xpos2']), index=xtable.index).replace('nan', np.nan) # calculate absolute position of first AA of peptide # ignoring errors avoids raising error in case on NaN -> returns NaN # as pos xtable['pos1'] = xtable['xpos1'].astype(int, errors='ignore') - \ xtable['xlink1'].astype(int, errors='ignore') + 1 xtable['pos2'] = xtable['xpos2'].astype(int, errors='ignore') - \ xtable['xlink2'].astype(int, errors='ignore') + 1 # add a lobel referring to the ordering in the pLink results table xtable['Order'] = xtable[['Order', 'Order2']].astype(str).apply(lambda x: ','.join(x), axis=1) if len(xtable[xtable['type'] == 'inter']) > 0: # Reassign the type for intra and inter xlink to inter/intra/homomultimeric intraAndInter = (xtable['type'] == 'inter') | (xtable['type'] == 'intra') xtable.loc[intraAndInter, 'type'] =\ np.vectorize(hf.categorize_inter_peptides)(xtable[intraAndInter]['prot1'], xtable[intraAndInter]['pos1'], xtable[intraAndInter]['pepseq1'], xtable[intraAndInter]['prot2'], xtable[intraAndInter]['pos2'], xtable[intraAndInter]['pepseq1']) print('[pLink Read] categorized inter peptides') else: print('[pLink Read] skipped inter peptide categorization') # manually set decoy to reverse as pLink hat its own internal target-decoy # algorithm xtable['decoy'] = False # generate the mod_dict linking pLink modification names to masses # in case of calling croco from the source folder structure... file_dir, file_name = os.path.split(__file__) if os.path.exists(os.path.join(file_dir, '../data/modification.ini')): modifi_dir = os.path.abspath(os.path.join(file_dir, '../data/modification.ini')) # ... or calling from a exe-file in a folder-setup with the data folder at top-level elif os.path.exists(os.path.join(file_dir, './data/modification.ini')): modifi_dir = os.path.abspath(os.path.join(file_dir, './data/modification.ini')) # ... or calling from within a single bundled exe-file else: try: # PyInstaller creates a temp folder and stores its path in _MEIPASS base_path = sys._MEIPASS modifi_dir = os.path.abspath(\ os.path.join(base_path, './data/modification.ini')) # ... or something went wrong except: raise Exception('Modifications.ini not found. CWD is ' + file_dir) # load pLink modifications.ini from data-folder mod_dict = _read_plink_modifications(os.path.abspath(modifi_dir)) # extract modification information pattern = re.compile(r'(\d+),.*\((.*)\)') pepseq1 = xtable['pepseq1'].tolist() Modification = xtable['Modification'].tolist() if len(pepseq1) == len(Modification): print('Len of pepseq1 and Modification match!') else: print('Len of pepseq1 and Modification dont match!') mod1 = [] modmass1 = [] modpos1 = [] modmass2 = [] modpos2 = [] mod2 = [] # iterate over all lines in the input file for idx, modstr in enumerate(Modification): this_mod1 = [] this_modmass1 = [] this_modpos1 = [] this_modmass2 = [] this_modpos2 = [] this_mod2 = [] # Extract annotations from every item in the modstring for mod in modstr.split(';'): if pattern.match(mod): match = pattern.match(mod) modpos, mod = match.groups() # transform modification names to masses try: mass = mod_dict[mod] except: # use the input string if no subsitution found mass = mod seqlen1 = len(pepseq1[idx]) # pLink assigns additional modification position to the C-term # of the first peptide, the xlinker and the N-term of the # second peptide if int(modpos) > seqlen1: this_mod2.append(mod) this_modpos2.append(int(modpos) - seqlen1) this_modmass2.append(mass) else: this_mod1.append(mod) this_modpos1.append(modpos) this_modmass1.append(mass) # multiple modifications of one peptide are stored as ;-delimited strings modmass1.append(this_modmass1) modpos1.append(this_modpos1) mod1.append(this_mod1) modmass2.append(this_modmass2) modpos2.append(this_modpos2) mod2.append(this_mod2) xtable['mod1'] = mod1 xtable['modmass1'] = modmass1 xtable['modpos1'] = modpos1 xtable['mod2'] = mod2 xtable['modmass2'] = modmass2 xtable['modpos2'] = modpos2 xtable['search_engine'] = 'pLink1' xtable = hf.order_columns(xtable, col_order, compact) ### return xtable df return xtable
if __name__ == '__main__': """ For testing purposes only """ col_order = [ 'rawfile', 'scanno', 'prec_ch', 'pepseq1', 'xlink1', 'pepseq2', 'xlink2', 'xtype', 'modmass1', 'modpos1', 'mod1', 'modmass2', 'modpos2', 'mod2', 'prot1', 'xpos1', 'prot2', 'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy'] xtable = Read(r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\pLink1_results\2.report\sample1', col_order=col_order)