# -*- coding: utf-8 -*-
"""
Functions to read pLink1 data.
"""
import numpy as np
import pandas as pd
import os
import sys
import re
if __name__ == '__main__':
import HelperFunctions as hf
else:
from . import HelperFunctions as hf
def _plink_protein2pandas(filepath):
"""
Read a pLink protein results file and return a pandas dictionary.
Args:
filepath (str): Path to a pLink results file e.g. _inter_combine.protein.xls
Returns:
pandas.DataFrame
"""
with open(filepath, 'r') as fh:
data = {} # init of data dict for pandas
# initialise the protein_header line
header1 = ''
# initialise the peptide header-line
header2 = ''
# protein level header starts with Order tag
header1_exp = re.compile(r'^Order\s.*')
# protein level entry starts with the order entry
entry1_exp = re.compile(r'^\d+\s.*')
# peptide level header starts with tab
header2_exp = re.compile(r'^\s+Order.*')
# peptide entry starts with whitespace foloowed by order entry
entry2_exp = re.compile(r'^\s+\d+.*')
for line in fh.readlines():
# beginning of a protein-block
if header1_exp.match(line):
# only read the header-line on its first occurence
if header1 == '':
# Get header names
header1 = line.strip().split('\t')
# set the column names for the dict
for h in header1:
data[h] = []
# body of a protein-block
elif entry1_exp.match(line):
# Read and store header1 data to write with every case of header2
entry1_data = line.strip().split('\t')
# beginning of a peptide block
elif header2_exp.match(line):
if header2 == '':
header2 = line.strip().split('\t')
header2 = [x if x != 'Order' else 'Order2' for x in header2 ]
for h in header2:
data[h] = []
# beginning of a peptide entry
elif entry2_exp.match(line):
entry2_data = line.strip().split('\t')
# add the corresponding level1 entries
for idx, d in enumerate(entry1_data):
data[header1[idx]].append(d)
for idx, d in enumerate(entry2_data):
data[header2[idx]].append(d)
try:
data = pd.DataFrame.from_dict(data)
if len(data) > 0:
return data
else:
raise Exception('Generated xtable had a length of 0!')
except:
raise Exception('Could not generat xtable. Please check file at: {}'.format(filepath))
def _read_plink_modifications(filepath):
"""
Open a pLink modification.ini file and extract all modifications with
their names as dict.
Args:
filepath (str): Path to modifications.ini
Returns
dict: mod_dict mapping pLink modification names to masses
"""
pattern = re.compile(r'^(.*)=\w+ \w+ (-?[0-9]\d*\.\d+)? -?[0-9]\d*\.\d+')
mod_dict = {}
with open(filepath, 'r') as f:
for line in f:
if pattern.match(line):
match = pattern.match(line)
name, mass = match.groups()
mod_dict[name] = mass
return mod_dict
def _process_plink_sequence(seq_string):
"""
Extract peptide sequences and cross-link positions from
pLink sequence string e.g. YVPTAGKLTVVILEAK(7)-LTVVILEAK(2):1
Args:
seq_string (str): pLink sequence string
Returns:
list or np.nan: [pepseq1, pepseq2, xpos1, xpos2, xtype]
"""
pattern = re.compile('(\w+)\((\d+)\)-(\w+)\((\d+)\):(\d+)')
try:
match = pattern.match(seq_string)
pepseq1, xpos1, pepseq2, xpos2, xtype = match.groups()
return str(pepseq1), int(xpos1), str(pepseq2), int(xpos2), xtype
except Exception as e:
print(e)
return np.nan
def _process_plink_spectrum(spec_string):
"""
Extract rawfile name, precursor charge and scan no from pLink spectrum
string such as 2017_08_04_SVs_BS3_16.17079.17079.4.dta
Args:
spec_string: pLink spectrum string
Returns:
list or np.nan: [rawfile, scanno, prec_ch]
"""
# the pattern of the title string is 20171215_JB04_Sec06.10959.10959.2
# in pLink 2.3 and 20171215_JB04_Sec06.10959.10959.2.0 in pLink 2.1
pextract_pattern = re.compile('(.+?)\.\d+\.(\d+)\.(\d+)\.*\d*')
if pextract_pattern.match(spec_string):
match = pextract_pattern.match(spec_string)
rawfile, scanno, prec_ch = match.groups()
return str(rawfile), int(scanno), int(prec_ch)
else:
return np.nan
def _process_plink_proteins(prot_string):
"""
Extract protein name and absolute cross-link position from
pLink protein string e.g.
sp|P63045|VAMP2_RAT(79)-sp|P63045|VAMP2_RAT(59)
Args:
prot_string: pLink protein string
Returns:
list or np.nan: [prot1, xpos1, prot2, xpos2]
"""
pattern = re.compile('(.+?)\((\d+)\)-?([^\(]*)\(?(\d*)\)?')
match = pattern.match(prot_string)
prot1, xpos1, prot2, xpos2 = match.groups()
return str(prot1), int(xpos1), str(prot2), int(xpos2)
[docs]def Read(plinkdirs, col_order=None, compact=False):
"""
Read pLink report dir and return an xtabel data array.
Args:
plinkdirs (list): plink report subdir (e.g. sample1)
col_order (list) – List of xTable column titles that are used to sort and compress the resulting datatable
compact (bool): Compact the xTable to only the columns given in col_order or not
Returns:
pandas.DataFrame: xTable data table
"""
print('[pLink1 Read] This is pLink1 Reader')
# convert to list if the input is only a single path
if not isinstance(plinkdirs, list):
plinkdirs = [plinkdirs]
allData = list()
plink_dtypes = {'Spectrum': str,
'Sequence': str,
'Proteins': str,
'type': str,
'Score': float,
'Order': int,
'Order2': int}
for file in plinkdirs:
# Initialise file names as None to use implicit booleaness
inter_file = None
loop_file = None
mono_file = None
for e in os.listdir(hf.compatible_path(file)):
if '_inter_combine.protein.xls' in e:
inter_file = e
if '_loop_combine.protein.xls' in e:
loop_file = e
if '_mono_combine.protein.xls' in e:
mono_file = e
frames = []
# only called if inter_file is not None
if inter_file:
print('Reading pLink inter-file: ' + inter_file)
inter_df = _plink_protein2pandas(hf.compatible_path(os.path.join(file, inter_file)))
inter_df['type'] = 'inter'
frames.append(inter_df)
if loop_file:
print('Reading pLink loop-file: ' + loop_file)
loop_df = _plink_protein2pandas(hf.compatible_path(os.path.join(file, loop_file)))
loop_df['type'] = 'loop'
frames.append(loop_df)
if mono_file:
print('Reading pLink mono-file: ' + mono_file)
mono_df = _plink_protein2pandas(hf.compatible_path(os.path.join(file, mono_file)))
mono_df['type'] = 'mono'
frames.append(mono_df)
s = pd.concat(frames)
allData.append(s)
xtable = pd.concat(allData).astype(dtype=plink_dtypes)
### Convert data inside pandas df
# rawfile, scanno, prec_ch
xtable[['rawfile', 'scanno', 'prec_ch']] =\
pd.DataFrame(xtable['Spectrum'].apply(_process_plink_spectrum).tolist(), index=xtable.index)
# Directly assign the re group matches into new columns
xtable[['pepseq1', 'xlink1', 'pepseq2', 'xlink2', 'xtype']] =\
pd.DataFrame(xtable['Sequence'].apply(_process_plink_sequence).tolist(), index=xtable.index)
xtable[['prot1', 'xpos1', 'prot2', 'xpos2']] =\
pd.DataFrame(xtable['Proteins'].apply(_process_plink_proteins).tolist(), index=xtable.index)
xtable['score'] = xtable['Score']
# generate an ID for every crosslink position within the protein(s)
xtable['ID'] =\
pd.Series(np.vectorize(hf.generate_id,
otypes=['object'])(xtable['type'],
xtable['prot1'],
xtable['xpos1'],
xtable['prot2'],
xtable['xpos2']),
index=xtable.index).replace('nan', np.nan)
# calculate absolute position of first AA of peptide
# ignoring errors avoids raising error in case on NaN -> returns NaN
# as pos
xtable['pos1'] = xtable['xpos1'].astype(int, errors='ignore') - \
xtable['xlink1'].astype(int, errors='ignore') + 1
xtable['pos2'] = xtable['xpos2'].astype(int, errors='ignore') - \
xtable['xlink2'].astype(int, errors='ignore') + 1
# add a lobel referring to the ordering in the pLink results table
xtable['Order'] = xtable[['Order', 'Order2']].astype(str).apply(lambda x: ','.join(x), axis=1)
if len(xtable[xtable['type'] == 'inter']) > 0:
# Reassign the type for intra and inter xlink to inter/intra/homomultimeric
intraAndInter = (xtable['type'] == 'inter') | (xtable['type'] == 'intra')
xtable.loc[intraAndInter, 'type'] =\
np.vectorize(hf.categorize_inter_peptides)(xtable[intraAndInter]['prot1'],
xtable[intraAndInter]['pos1'],
xtable[intraAndInter]['pepseq1'],
xtable[intraAndInter]['prot2'],
xtable[intraAndInter]['pos2'],
xtable[intraAndInter]['pepseq1'])
print('[pLink Read] categorized inter peptides')
else:
print('[pLink Read] skipped inter peptide categorization')
# manually set decoy to reverse as pLink hat its own internal target-decoy
# algorithm
xtable['decoy'] = False
# generate the mod_dict linking pLink modification names to masses
# in case of calling croco from the source folder structure...
file_dir, file_name = os.path.split(__file__)
if os.path.exists(os.path.join(file_dir,
'../data/modification.ini')):
modifi_dir = os.path.abspath(os.path.join(file_dir,
'../data/modification.ini'))
# ... or calling from a exe-file in a folder-setup with the data folder at top-level
elif os.path.exists(os.path.join(file_dir,
'./data/modification.ini')):
modifi_dir = os.path.abspath(os.path.join(file_dir,
'./data/modification.ini'))
# ... or calling from within a single bundled exe-file
else:
try:
# PyInstaller creates a temp folder and stores its path in _MEIPASS
base_path = sys._MEIPASS
modifi_dir = os.path.abspath(\
os.path.join(base_path, './data/modification.ini'))
# ... or something went wrong
except:
raise Exception('Modifications.ini not found. CWD is ' + file_dir)
# load pLink modifications.ini from data-folder
mod_dict = _read_plink_modifications(os.path.abspath(modifi_dir))
# extract modification information
pattern = re.compile(r'(\d+),.*\((.*)\)')
pepseq1 = xtable['pepseq1'].tolist()
Modification = xtable['Modification'].tolist()
if len(pepseq1) == len(Modification):
print('Len of pepseq1 and Modification match!')
else:
print('Len of pepseq1 and Modification dont match!')
mod1 = []
modmass1 = []
modpos1 = []
modmass2 = []
modpos2 = []
mod2 = []
# iterate over all lines in the input file
for idx, modstr in enumerate(Modification):
this_mod1 = []
this_modmass1 = []
this_modpos1 = []
this_modmass2 = []
this_modpos2 = []
this_mod2 = []
# Extract annotations from every item in the modstring
for mod in modstr.split(';'):
if pattern.match(mod):
match = pattern.match(mod)
modpos, mod = match.groups()
# transform modification names to masses
try:
mass = mod_dict[mod]
except:
# use the input string if no subsitution found
mass = mod
seqlen1 = len(pepseq1[idx])
# pLink assigns additional modification position to the C-term
# of the first peptide, the xlinker and the N-term of the
# second peptide
if int(modpos) > seqlen1:
this_mod2.append(mod)
this_modpos2.append(int(modpos) - seqlen1)
this_modmass2.append(mass)
else:
this_mod1.append(mod)
this_modpos1.append(modpos)
this_modmass1.append(mass)
# multiple modifications of one peptide are stored as ;-delimited strings
modmass1.append(this_modmass1)
modpos1.append(this_modpos1)
mod1.append(this_mod1)
modmass2.append(this_modmass2)
modpos2.append(this_modpos2)
mod2.append(this_mod2)
xtable['mod1'] = mod1
xtable['modmass1'] = modmass1
xtable['modpos1'] = modpos1
xtable['mod2'] = mod2
xtable['modmass2'] = modmass2
xtable['modpos2'] = modpos2
xtable['search_engine'] = 'pLink1'
xtable = hf.order_columns(xtable, col_order, compact)
### return xtable df
return xtable
if __name__ == '__main__':
"""
For testing purposes only
"""
col_order = [ 'rawfile', 'scanno', 'prec_ch',
'pepseq1', 'xlink1',
'pepseq2', 'xlink2', 'xtype',
'modmass1', 'modpos1', 'mod1',
'modmass2', 'modpos2', 'mod2',
'prot1', 'xpos1', 'prot2',
'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy']
xtable = Read(r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\pLink1_results\2.report\sample1', col_order=col_order)