33author: Asher Preska Steinberg
44merge proteomegenerator results across multiple samples on AA seq identity
55"""
6+ import os
67import pandas as pd
78from tqdm import tqdm
89import click
@@ -79,12 +80,18 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
7980 seqdat = fasta2df (fasta , sample = sample )
8081 seqdat ["condition" ] = condition
8182 # filter for unique proteins
83+ # collect samples for which no protein tsv (protein_table) was provided
84+ no_quant = []
8285 if unique_proteins :
8386 philosopher_path = row ["protein_table" ]
84- philosopher_dat = pd .read_csv (philosopher_path , sep = "\t " )
85- philosopher_dat = philosopher_dat [ philosopher_dat ["Indistinguishable Proteins" ].isna ()]
86- unique_proteins = list (philosopher_dat ["Protein" ])
87- seqdat = seqdat [seqdat ["protein" ].isin (unique_proteins )]
87+ if type (philosopher_path ) is not float and os .path .exists (philosopher_path ):
88+ philosopher_dat = pd .read_csv (philosopher_path , sep = "\t " )
89+ philosopher_dat = philosopher_dat [ philosopher_dat ["Indistinguishable Proteins" ].isna ()]
90+ unique_proteins = list (philosopher_dat ["Protein" ])
91+ seqdat = seqdat [seqdat ["protein" ].isin (unique_proteins )]
92+ else :
93+ f"no protein tsv file provided for sample/condition: { sample } /{ condition } "
94+ no_quant .append (sample )
8895 # append sample to dataframe
8996 protein_dat = pd .concat ([protein_dat , seqdat ])
9097 # perform groupby
@@ -100,7 +107,7 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
100107 protein_ids = joinset (group ["protein" ])
101108 # store data
102109 data .append ({
103- "sequence" : seq ,
110+ "sequence" : seq [ 0 ] ,
104111 "Protein_ids" : protein_ids ,
105112 "unique_protein_id" : f"PG{ i } " , # give protein a unique identifier
106113 "samples" : samples ,
@@ -110,6 +117,8 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
110117 i = i + 1
111118 # write dataframe to tsv
112119 countdat = pd .DataFrame (data )
120+ if unique_proteins :
121+ countdat = countdat [~ countdat ["samples" ].isin (no_quant )]
113122 countdat .to_csv (info_table , sep = "\t " , index = False )
114123 # write to merged fasta file
115124 with open (merged_fasta , "w+" ) as f :
0 commit comments