merge_pg_results.py: allow for a reference fasta with no protein table

apsteinberg · apsteinberg · commit 2d65263ec9c4 · 2025-05-27T15:58:10.000-04:00
diff --git a/src/tcdo_pg_tools/merge_pg_results.py b/src/tcdo_pg_tools/merge_pg_results.py
@@ -3,6 +3,7 @@
 author: Asher Preska Steinberg
 merge proteomegenerator results across multiple samples on AA seq identity
 """
+import os
 import pandas as pd
 from tqdm import tqdm
 import click
@@ -79,12 +80,18 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
         seqdat = fasta2df(fasta, sample=sample)
         seqdat["condition"] = condition
         # filter for unique proteins
+        # collect samples for which no protein tsv (protein_table) was provided
+        no_quant = []
         if unique_proteins:
             philosopher_path = row["protein_table"]
-            philosopher_dat = pd.read_csv(philosopher_path, sep="\t")
-            philosopher_dat =  philosopher_dat[ philosopher_dat["Indistinguishable Proteins"].isna()]
-            unique_proteins = list(philosopher_dat["Protein"])
-            seqdat = seqdat[seqdat["protein"].isin(unique_proteins)]
+            if type(philosopher_path) is not float and os.path.exists(philosopher_path):
+                philosopher_dat = pd.read_csv(philosopher_path, sep="\t")
+                philosopher_dat =  philosopher_dat[ philosopher_dat["Indistinguishable Proteins"].isna()]
+                unique_proteins = list(philosopher_dat["Protein"])
+                seqdat = seqdat[seqdat["protein"].isin(unique_proteins)]
+            else:
+                f"no protein tsv file provided for sample/condition: {sample}/{condition}"
+                no_quant.append(sample)
         # append sample to dataframe
         protein_dat = pd.concat([protein_dat, seqdat])
     # perform groupby
@@ -100,7 +107,7 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
         protein_ids = joinset(group["protein"])
         # store data
         data.append({
-            "sequence": seq,
+            "sequence": seq[0],
             "Protein_ids": protein_ids,
             "unique_protein_id": f"PG{i}", # give protein a unique identifier
             "samples": samples,
@@ -110,6 +117,8 @@ def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, uni
         i = i+1
     # write dataframe to tsv
     countdat = pd.DataFrame(data)
+    if unique_proteins:
+        countdat = countdat[~countdat["samples"].isin(no_quant)]
     countdat.to_csv(info_table, sep="\t", index=False)
     # write to merged fasta file
     with open(merged_fasta, "w+") as f:
diff --git a/tests/merge_pg_results/test.csv b/tests/merge_pg_results/test.csv
@@ -0,0 +1,5 @@
+fasta,protein_table,sample,condition
+/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/fasta_files/AML_Celllines/Fasta/Celllines/PROTEOMEGENERATOR_47038_proteome_CD34_comb_fasta.fa,/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/IRIS_output/CD34+_1/FragPipe_DIA_fasta_V2/1A/out/protein.tsv,CD34+,control
+/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/fasta_files/AML_Celllines/Fasta/Celllines/PROTEOMEGENERATOR_45382_proteome_MV4-11_fasta.fa,/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/IRIS_output/MV4-11/MA/FragPipe_MA_DIA_fasta/out/protein.tsv,MV4-11,AML
+/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/fasta_files/AML_Celllines/Fasta/Celllines/PROTEOMEGENERATOR_45642_proteome_Kasumi_1_fasta.fa,/Volumes/kentsis/proteomics/data/Laura/Multi-protease_fractionation/IRIS_output/Kasumi-1/KA/FragPipe_KA_DIA_fasta/out/protein.tsv,Kasumi-1,AML
+/Volumes/kentsis/proteomics/databases/swissprot_homosapiens/2025-02-06-reviewed-isoforms-UP000005640.fas,,SwissProt,SwissProt