# Source code for croco.Xi

# -*- coding: utf-8 -*-

"""
Functions to read Xi processed crosslink data.

"""

import numpy as np
import pandas as pd

if __name__ in ['__main__', 'Xi']:
    import HelperFunctions as hf
else:
    from . import HelperFunctions as hf


def _assign_type(row):
    """
    Assign mono, loop, inter and intra link
    based on prot1, prot2, xlink1 and xlink2 entries

    An entry counts as missing when it is ``None``, ``float('nan')``,
    ``pd.NA`` or the literal string ``'nan'``.

    Decision order:
        1. prot2 present and equal to prot1   -> 'intra'
        2. prot2 present                      -> 'inter'
        3. prot2 missing, xlink2 present      -> 'loop'
        4. prot1 and xlink1 present           -> 'mono'
        5. anything else                      -> np.nan

    Args:
        row (Series): a series or list containing prot1, prot2, xlink1, xlink2
    Returns:
        str or np.nan: type of cross-link (inter, intra, loop, mono)
    """
    def _is_missing(value):
        # pd.isna recognises None, float nan and pd.NA. The latter is the
        # missing marker of pandas' nullable integer dtypes and stringifies
        # to '<NA>', so comparing str(value) against 'nan' alone misses it.
        try:
            if pd.isna(value):
                return True
        except (TypeError, ValueError):
            # array-like cell without an unambiguous truth value
            pass
        # keep accepting the textual placeholder 'nan'
        return str(value) == 'nan'

    prot1, prot2, xlink1, xlink2 = row

    if not _is_missing(prot2):
        # two proteins given: same name means intra-, otherwise inter-link
        return 'intra' if str(prot1) == str(prot2) else 'inter'
    if not _is_missing(xlink2):
        # second link site without a second protein
        return 'loop'
    if not _is_missing(prot1) and not _is_missing(xlink1):
        return 'mono'
    return np.nan

def _rawfile_from_source(source_str):
    r"""
    Extracts the filename (without directory and final extension)
    from a string like
    E:\julian\20180612_croco_testfiles\mgf_msconvert\20180518_JB_jb05a_l100.mgf

    Both backslash and forward slash are accepted as path separators.
    Only the last extension is removed, so ``run.v2.mgf`` yields ``run.v2``;
    a name without any extension is returned unchanged.

    Args:
        source_str (str): Path to a rawfile
    Returns:
        str: filename from path, or np.nan if source_str is a missing value
    Raises:
        TypeError: if source_str is neither a string nor a missing value
    """
    if isinstance(source_str, str):
        # normalise separators so Windows paths are parsed on any platform
        basename = source_str.replace('\\', '/').rsplit('/', 1)[-1]
        stem = basename.rpartition('.')[0]
        # rpartition leaves the stem empty when there is no dot (or the name
        # starts with its only dot); fall back to the full basename then
        return stem or basename

    try:
        missing = bool(pd.isna(source_str))
    except (TypeError, ValueError):
        # array-like input without an unambiguous truth value
        missing = False
    if missing:
        return np.nan

    raise TypeError(
        'Expected a path string or a missing value, got {!r}'.format(source_str))

def Read(xi_files, col_order=None, compact=False):
    """
    Collects data from Xi spectrum search and returns an xtable data array.

    Args:
        xi_files: path or list of paths to xi file(s)
        col_order (list): List of xTable column titles that are used to sort
            and compress the resulting datatable
        compact (bool): Whether to compact the xTable to only those columns
            listed in col_order

    Returns:
        pandas.DataFrame: xtable data table

    Raises:
        Exception: if one of the files cannot be read; the underlying error
            is attached as ``__cause__``
    """

    # convert to list if the input is only a single path
    if not isinstance(xi_files, list):
        xi_files = [xi_files]

    # dtypes keyed by the column headers of the Xi CSV file; nullable
    # integer dtypes allow empty cells in integer columns
    xi_dtypes = {'Scan': pd.Int64Dtype(),
                 'PrecoursorCharge': pd.Int64Dtype(),
                 'BasePeptide1': str,
                 'ProteinLink1': pd.Int16Dtype(),
                 'BasePeptide2': str,
                 'ProteinLink2': pd.Int16Dtype(),
                 'Protein1': str,
                 'Protein2': str,
                 'Start1': pd.Int32Dtype(),
                 'Start2': pd.Int32Dtype(),
                 'Link1': pd.Int16Dtype(),
                 'Link2': pd.Int16Dtype(),
                 'match score': float
                 }

    allData = []

    for xi_file in xi_files:
        print('Reading xi-file: {}'.format(xi_file))

        try:
            s = pd.read_csv(hf.compatible_path(xi_file),
                            delimiter=',',
                            dtype=xi_dtypes)
        except Exception as err:
            # keep the original error reachable via exception chaining
            raise Exception(
                '[xTable Read] Failed opening file: {}'.format(xi_file)
            ) from err
        allData.append(s)

    xtable = pd.concat(allData)

    ### Process the data to comply to xTable format
    xtable = xtable.rename(columns={'Scan': 'scanno',
                                    'PrecoursorCharge': 'prec_ch',
                                    'BasePeptide1': 'pepseq1',
                                    'ProteinLink1': 'xpos1',
                                    'BasePeptide2': 'pepseq2',
                                    'ProteinLink2': 'xpos2',
                                    'ModificationMasses1': 'modmass1',
                                    'ModificationMasses2': 'modmass2',
                                    'Modifications1': 'mod1',
                                    'Modifications2': 'mod2',
                                    'Protein1': 'prot1',
                                    'Protein2': 'prot2',
                                    'Start1': 'pos1',
                                    'Start2': 'pos2',
                                    'Link1': 'xlink1',
                                    'Link2': 'xlink2',
                                    'ModificationPositions1': 'modpos1',
                                    'ModificationPositions2': 'modpos2',
                                    'match score': 'score'
                                    })

    xtable['rawfile'] = xtable['Source'].apply(_rawfile_from_source)

    # assign categories of cross-links based on identification of prot1 and prot2
    xtable['type'] = xtable[['prot1', 'prot2', 'xlink1', 'xlink2']].apply(
        _assign_type, axis=1)

    # generate an ID for every crosslink position within the protein(s)
    xtable['ID'] = pd.Series(
        np.vectorize(hf.generate_id, otypes=['object'])(xtable['type'],
                                                        xtable['prot1'],
                                                        xtable['xpos1'],
                                                        xtable['prot2'],
                                                        xtable['xpos2']),
        index=xtable.index).replace('nan', np.nan)

    onlyInter = xtable['type'] == 'inter'

    # the vectorised call below has no otypes set and therefore cannot run on
    # an empty selection, hence the guard
    if onlyInter.any():
        # Reassign the type for inter xlink to inter/intra/homomultimeric
        inter_rows = xtable[onlyInter]
        xtable.loc[onlyInter, 'type'] = np.vectorize(
            hf.categorize_inter_peptides)(inter_rows['prot1'],
                                          inter_rows['pos1'],
                                          inter_rows['pepseq1'],
                                          inter_rows['prot2'],
                                          inter_rows['pos2'],
                                          # second peptide belongs to prot2/pos2
                                          inter_rows['pepseq2'])
        print('[Xi Read] categorized inter peptides')
    else:
        print('[Xi Read] skipped inter peptide categorization')

    xtable['xtype'] = np.nan

    xtable['search_engine'] = 'XiSearch'

    xtable = hf.order_columns(xtable, col_order, compact)

    return xtable
if __name__ == '__main__':
    # Manual smoke run: read one Xi result file from a developer-local path.

    # defines the column headers required for xtable output
    col_order = ['rawfile', 'scanno', 'prec_ch',
                 'pepseq1', 'xlink1',
                 'pepseq2', 'xlink2', 'xtype',
                 'modmass1', 'modpos1', 'mod1',
                 'modmass2', 'modpos2', 'mod2',
                 'prot1', 'xpos1', 'prot2',
                 'xpos2', 'type', 'score', 'ID', 'pos1', 'pos2', 'decoy']

    xi_file = r'C:\Users\User\Documents\03_software\python\CroCo\testdata\PK\Xi\XI_results_XiVersion1.6.739.csv'

    # hand the column layout to Read; it was previously defined but unused
    xtable = Read(xi_file, col_order=col_order)