# Source code for croco.xQuest

# -*- coding: utf-8 -*-

"""
Functions to read xQuest data.

"""


import numpy as np
import pandas as pd

import re

if __name__ == '__main__':
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf


# Compiled once at import time. Raw string so that the backslash escapes are
# handed to the regex engine unchanged.
_XQUEST_SPECTRUM_PATTERN = re.compile(
    r'(.+)\.(\d+)\.\d+\..+\.\d+\.\d+\.(\d+)')


def _process_xquest_spectrum(spec_string):
    """
    Extract rawfile name, scan no and precursor charge from an xQuest
    spectrum string.

    The three captured groups of the spectrum pattern are interpreted, in
    order, as rawfile name, scan number and precursor charge.

    Args:
        spec_string: xQuest spectrum string
    Returns:
        tuple: always a 3-tuple ``(rawfile, scanno, prec_ch)``

        - str or np.nan: rawfile name
        - int or np.nan: scan number
        - int or np.nan: precursor charge

        All three entries are np.nan if ``spec_string`` is not a string or
        does not match the expected pattern.
    """
    # Always hand back three values so callers can unpack the result into
    # three columns, regardless of whether the string could be parsed.
    no_match = (np.nan, np.nan, np.nan)

    # Missing values (e.g. NaN) are not strings and cannot be matched
    if not isinstance(spec_string, str):
        return no_match

    match = _XQUEST_SPECTRUM_PATTERN.match(spec_string)
    if match is None:
        return no_match

    rawfile, scanno, prec_ch = match.groups()
    return str(rawfile), int(scanno), int(prec_ch)

# Compiled once at import time. Raw strings so that \w, \d and \D reach the
# regex engine unchanged instead of being treated as (invalid) string escapes.
_XQUEST_XLINK_ID_PATTERN = re.compile(r'^(\w+)-(\w+)-a(\d+)-b(\d+)')
_XQUEST_INTRALINK_ID_PATTERN = re.compile(r'^(\w+)-\D{1}(\d+)-\D{1}(\d+)')
_XQUEST_MONOLINK_ID_PATTERN = re.compile(r'^(\w+)-\D{1}(\d+)-\d+')


def _process_xquest_id(Id_string):
    """
    Extract the peptide sequence(s) and the relative positions of the
    cross-link within these sequences from an xQuest Id-string.

    The Id-string is tried against three patterns in this order: xlink
    (two peptides), intralink (one peptide, two positions) and monolink
    (one peptide, one position).

    Args:
        Id_string (str): an xQuest Id-String
    Returns:
        tuple: always a 4-tuple ``(pepseq1, pepseq2, xlink1, xlink2)``

        - str or np.nan: pepseq1
        - str or np.nan: pepseq2 (same as pepseq1 for intralinks, np.nan
          for monolinks)
        - int or np.nan: xlink1
        - int or np.nan: xlink2 (np.nan for monolinks)

        All four entries are np.nan if ``Id_string`` is not a string or
        matches none of the patterns.
    """
    # Missing values (e.g. NaN) are not strings and cannot be matched
    if not isinstance(Id_string, str):
        return np.nan, np.nan, np.nan, np.nan

    match = _XQUEST_XLINK_ID_PATTERN.match(Id_string)
    if match is not None:
        pepseq1, pepseq2, xlink1, xlink2 = match.groups()
        return pepseq1, pepseq2, int(xlink1), int(xlink2)

    match = _XQUEST_INTRALINK_ID_PATTERN.match(Id_string)
    if match is not None:
        # both link sites are located on the same peptide
        pepseq, xlink1, xlink2 = match.groups()
        return pepseq, pepseq, int(xlink1), int(xlink2)

    match = _XQUEST_MONOLINK_ID_PATTERN.match(Id_string)
    if match is not None:
        # only one peptide and one link site are captured
        pepseq, xlink = match.groups()
        return pepseq, np.nan, int(xlink), np.nan

    return np.nan, np.nan, np.nan, np.nan

def _categorize_xquest_type(XQType):
    """
    Translate an xQuest type string into the corresponding link category.

    Args:
        XQType (str): xquest type string (xlink, intralink, monolink)
    Returns:
        str or np.nan: type of cross-link (inter, loop, mono); np.nan for
        any other value
    """
    # (xQuest type, returned category) pairs, checked in this order
    translations = (('xlink', 'inter'),
                    ('intralink', 'loop'),
                    ('monolink', 'mono'))

    for xquest_name, category in translations:
        if XQType == xquest_name:
            return category

    # unknown type
    return np.nan


def Read(xQuest_files, col_order=None, compact=False):
    """
    Read xQuest results file(s) and return the data in xTable format.

    Every file is read as a tab-separated table (``-`` is treated as a
    missing value) and all tables are concatenated. The columns ``z``,
    ``Protein1``, ``Protein2``, ``AbsPos1``, ``AbsPos2`` and ``ld-Score``
    are renamed to their xTable names, and the columns ``Spectrum``, ``Id``
    and ``Type`` are parsed to fill the remaining xTable columns.

    Args:
        xQuest_files (list): path to xQuest results file(s); a single path
            or a tuple of paths is accepted as well
        col_order (list): List of xTable column titles that are used to sort and compress the resulting datatable
        compact (bool): Whether to compact the xTable to only those columns listed in col_order

    Returns:
        pandas.DataFrame: xTable data table

    Raises:
        Exception: if renaming the xQuest headers fails
    """

    # convert to list if the input is only a single path (or a tuple)
    if isinstance(xQuest_files, tuple):
        xQuest_files = list(xQuest_files)
    elif not isinstance(xQuest_files, list):
        xQuest_files = [xQuest_files]

    allData = list()

    # nullable integer dtype so that missing values ('-') can be kept in
    # the integer columns
    xQuest_dtypes = {'z': pd.Int64Dtype(),
                     'Protein1': str,
                     'Protein2': str,
                     'AbsPos1': pd.Int64Dtype(),
                     'AbsPos2': pd.Int64Dtype(),
                     'ld-Score': float}

    for file in xQuest_files:

        ### Collect data and convert to pandas format
        print('Reading xQuest-file: ' + file)

        s = pd.read_csv(hf.compatible_path(file),
                        delimiter='\t',
                        na_values='-',
                        dtype=xQuest_dtypes)
        allData.append(s)

    xtable = pd.concat(allData)

    rename_dict = {'z':'prec_ch',
                   'Protein1':'prot1',
                   'Protein2': 'prot2',
                   'AbsPos1': 'xpos1',
                   'AbsPos2': 'xpos2',
                   'ld-Score': 'score'}

    # Rename selected columns in place; the index labels are converted to
    # strings at the same time
    try:
        xtable.rename(index=str,
                      columns=rename_dict,
                      inplace=True)
    except Exception as e:
        raise Exception('[xQuest Read] Error during xQuest header renaming: %s' % e) from e

    # Extract rawfile, scanno and precursor charge from the mgf header string
    # used as Spectrum by xQuest. Note that this overwrites any 'prec_ch'
    # column obtained from renaming 'z' above.
    xtable[['rawfile', 'scanno', 'prec_ch']] =\
        pd.DataFrame(xtable['Spectrum'].apply(_process_xquest_spectrum).tolist(), index=xtable.index)

    print('[xQuest Read] Processed Spectrum entry')

    # Extract peptide sequences and relative cross-link positions from the
    # xQuest ID-string
    xtable[['pepseq1', 'pepseq2', 'xlink1', 'xlink2']] =\
        pd.DataFrame(xtable['Id'].apply(_process_xquest_id).tolist(), index=xtable.index)

    print('[xQuest Read] Processed xQuest ID' )

    # Modifications are not defined in xQuest
    xtable['mod1'], xtable['mod2'] = "", ""

    # calculate the absolute position of the first amino acid of the resp
    # peptides
    xtable['pos1'] = xtable['xpos1'] - xtable['xlink1'] + 1
    xtable['pos2'] = xtable['xpos2'] - xtable['xlink2'] + 1

    print('[xQuest Read] Calculated positions')

    # Map the xQuest types (xlink, intralink, monolink) to inter, loop, mono
    xtable['type'] = xtable['Type'].apply(_categorize_xquest_type)

    if (xtable['type'] == 'inter').any():
        # Reassign the type for intra and inter xlink to inter/intra/homomultimeric.
        # 'intra' is not produced by _categorize_xquest_type, so at this
        # point the mask effectively selects the 'inter' rows.
        intraAndInter = (xtable['type'] == 'inter') | (xtable['type'] == 'intra')
        selected = xtable[intraAndInter]
        xtable.loc[intraAndInter, 'type'] =\
            np.vectorize(hf.categorize_inter_peptides)(selected['prot1'],
                                                     selected['pos1'],
                                                     selected['pepseq1'],
                                                     selected['prot2'],
                                                     selected['pos2'],
                                                     selected['pepseq2'])
        print('[xQuest Read] categorized inter peptides')
    else:
        print('[xQuest Read] skipped inter peptide categorization')

    # generate an ID for every crosslink position within the protein(s);
    # the string 'nan' is converted back into a real missing value
    xtable['ID'] =\
        pd.Series(np.vectorize(hf.generate_id,
                               otypes=['object'])(xtable['type'],
                                                  xtable['prot1'],
                                                  xtable['xpos1'],
                                                  xtable['prot2'],
                                                  xtable['xpos2']),
                 index=xtable.index).replace('nan', np.nan)

    print('[xQuest Read] Generated ID')

    # xQuest does not incorporate decoy entries in the results table
    # but protein names can contain identifiers as reverse or decoy.
    # Rows without an ID are explicitly flagged as non-decoy (na=False).
    xtable['decoy'] = xtable['ID'].str.contains('reverse', na=False) |\
        xtable['ID'].str.contains('decoy', na=False)

    # the following properties cannot directly be inferred from the
    # xQuest results file and are therefore set to NaN
    for header in ['xtype', 'modmass1', 'modpos1', 'modmass2', 'modpos2']:
        xtable[header] = np.nan

    xtable['search_engine'] = 'xQuest'

    xtable = hf.order_columns(xtable, col_order, compact)

    ### Return df
    return xtable

if __name__ == '__main__':
    # For testing purposes only: read a local xQuest example file when this
    # module is executed as a script.

    col_order = [
        'rawfile', 'scanno', 'prec_ch',
        'pepseq1', 'xlink1',
        'pepseq2', 'xlink2',
        'xtype',
        'modmass1', 'modpos1', 'mod1',
        'modmass2', 'modpos2', 'mod2',
        'prot1', 'xpos1',
        'prot2', 'xpos2',
        'type', 'score', 'ID',
        'pos1', 'pos2',
        'decoy',
    ]

    xtable = Read(
        r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\xQuest\20190227_croco_PK_xquest_results_targetdecoy.xls',
        col_order=col_order)