11#!/usr/bin/env python3
22"""
33author: Asher Preska Steinberg
4- merge multiple fasta on sequence identity
4+ merge proteomegenerator results across multiple samples on AA seq identity
55"""
66import pandas as pd
7- import glob as bob
8- import os
9- import numpy as np
107from tqdm import tqdm
11- import itertools
12- from numba import njit
138import click
149from marsilea .upset import Upset , UpsetData
1510import matplotlib .pyplot as plt
@@ -54,7 +49,9 @@ def plot_upset(countdat, upset_path):
5449
5550@click .command ()
5651@click .option ('-i' , '--input_csv' , required = True , type = click .Path (exists = True ),
57- help = 'three column format csv (path: fasta path, name: sample name, condition: condition)' )
52+ help = 'four column csv (fasta: fasta path, '
53+ 'protein_table: protein.tsv path, '
54+ 'name: sample name, condition: condition)' )
5855@click .option ('-t' , '--info_table' , required = False ,
5956 default = 'info_table.tsv' ,
6057 type = click .Path (), help = "Path to index tsv for merged protein IDs" )
@@ -65,17 +62,9 @@ def plot_upset(countdat, upset_path):
6562@click .option ('--upset_path' , required = False ,
6663 type = click .Path (), default = 'upset_plot.svg' ,
6764 help = "Path to upset plot" )
68- def fasta_merge (input_csv , info_table , merged_fasta , upset , upset_path ):
65+ def merge_pg_results (input_csv , info_table , merged_fasta , upset , upset_path , unique_proteins = True ):
6966 """
70- merge multiple fasta on sequence identity
71- Args:
72- input_csv: input csv with two columns (path: path to fasta, name: sample name, condition: condition (e.g., tumor, normal))
73- info_table: path to index tsv for merged protein IDs (default: info_table.tsv)
74- merged_fasta: path to merged fasta file (default: merged.fasta)
75- upset: plot upset for different conditions (default: False)
76- upset_path: path to upset plot (default: upset_plot.svg)
77- Returns:
78-
67+ merge proteomegenerator results across multiple samples on AA seq identity
7968 """
8069 # read in metadata
8170 metadata = pd .read_csv (input_csv )
@@ -89,6 +78,14 @@ def fasta_merge(input_csv, info_table, merged_fasta, upset, upset_path):
8978 # load in the protein fasta file as well
9079 seqdat = fasta2df (fasta , sample = sample )
9180 seqdat ["condition" ] = condition
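81+ # keep only proteins that have no "Indistinguishable Proteins" entry in this sample's protein table (NOTE(review): the list assigned below reuses the name of the unique_proteins flag - confirm this is intended)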
82+ if unique_proteins :
83+ philosopher_path = row ["protein_table" ]
84+ philosopher_dat = pd .read_csv (philosopher_path , sep = "\t " )
85+ philosopher_dat = philosopher_dat [ philosopher_dat ["Indistinguishable Proteins" ].isna ()]
86+ unique_proteins = list (philosopher_dat ["Protein" ])
87+ seqdat = seqdat [seqdat ["protein" ].isin (unique_proteins )]
88+ # append sample to dataframe
9289 protein_dat = pd .concat ([protein_dat , seqdat ])
9390 # perform groupby
9491 grouped = protein_dat .groupby (by = ["seq" ])
0 commit comments