Skip to content

Commit 2d65263

Browse files
committed
merge_pg_results.py allowing for reference fasta
1 parent 75612dc commit 2d65263

File tree

2 files changed

+19
-5
lines changed

2 files changed

+19
-5
lines changed

src/tcdo_pg_tools/merge_pg_results.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
author: Asher Preska Steinberg
44
merge proteomegenerator results across multiple samples on AA seq identity
55
"""
6+
import os
67
import pandas as pd
78
from tqdm import tqdm
89
import click
@@ -79,12 +80,18 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
7980
seqdat = fasta2df(fasta, sample=sample)
8081
seqdat["condition"] = condition
8182
# filter for unique proteins
83+
# get list of samples where no protein tsv was provided
84+
no_quant = []
8285
if unique_proteins:
8386
philosopher_path = row["protein_table"]
84-
philosopher_dat = pd.read_csv(philosopher_path, sep="\t")
85-
philosopher_dat = philosopher_dat[ philosopher_dat["Indistinguishable Proteins"].isna()]
86-
unique_proteins = list(philosopher_dat["Protein"])
87-
seqdat = seqdat[seqdat["protein"].isin(unique_proteins)]
87+
if type(philosopher_path) is not float and os.path.exists(philosopher_path):
88+
philosopher_dat = pd.read_csv(philosopher_path, sep="\t")
89+
philosopher_dat = philosopher_dat[ philosopher_dat["Indistinguishable Proteins"].isna()]
90+
unique_proteins = list(philosopher_dat["Protein"])
91+
seqdat = seqdat[seqdat["protein"].isin(unique_proteins)]
92+
else:
93+
f"no protein tsv file provided for sample/condition: {sample}/{condition}"
94+
no_quant.append(sample)
8895
# append sample to dataframe
8996
protein_dat = pd.concat([protein_dat, seqdat])
9097
# perform groupby
@@ -100,7 +107,7 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
100107
protein_ids = joinset(group["protein"])
101108
# store data
102109
data.append({
103-
"sequence": seq,
110+
"sequence": seq[0],
104111
"Protein_ids": protein_ids,
105112
"unique_protein_id": f"PG{i}", # give protein a unique identifier
106113
"samples": samples,
@@ -110,6 +117,8 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
110117
i = i+1
111118
# write dataframe to tsv
112119
countdat = pd.DataFrame(data)
120+
if unique_proteins:
121+
countdat = countdat[~countdat["samples"].isin(no_quant)]
113122
countdat.to_csv(info_table, sep="\t", index=False)
114123
# write to merged fasta file
115124
with open(merged_fasta, "w+") as f:

tests/merge_pg_results/test.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
fasta,protein_table,sample,condition
2+
/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/fasta_files/AML_Celllines/Fasta/Celllines/PROTEOMEGENERATOR_47038_proteome_CD34_comb_fasta.fa,/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/IRIS_output/CD34+_1/FragPipe_DIA_fasta_V2/1A/out/protein.tsv,CD34+,control
3+
/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/fasta_files/AML_Celllines/Fasta/Celllines/PROTEOMEGENERATOR_45382_proteome_MV4-11_fasta.fa,/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/IRIS_output/MV4-11/MA/FragPipe_MA_DIA_fasta/out/protein.tsv,MV4-11,AML
4+
/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/fasta_files/AML_Celllines/Fasta/Celllines/PROTEOMEGENERATOR_45642_proteome_Kasumi_1_fasta.fa,/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/IRIS_output/Kasumi-1/KA/FragPipe_KA_DIA_fasta/out/protein.tsv,Kasumi-1,AML
5+
/Volumes/kentsis/proteomics/databases/swissprot_homosapiens/2025-02-06-reviewed-isoforms-UP000005640.fas,,SwissProt,SwissProt

0 commit comments

Comments
 (0)