commit for merge_fasta.py and merge_pg_results.py as well as updated docs

apsteinberg · apsteinberg · commit 75612dca3b8d · 2025-05-18T21:35:12.000-04:00
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -19,3 +19,31 @@ To compute AA seq coverage across different combinations of enzymes, use the `co
 
 The fragpipe output directory must contain subdirectories with the result of each enzyme,
 and these directories must be labeled starting with the enzyme name (e.g., `trypsin_diaPASEF`)
+
+Merging multiple proteomegenerator fasta files
+----------------------------------------------
+To merge multiple proteomegenerator fasta files by collapsing proteins
+with the same amino acid sequence, use the `merge-fasta` feature::
+
+    tcdo_pg_tools merge-fasta -i input.csv
+
+The input.csv must have three columns: fasta, sample, condition.
+`fasta` is the path to the protein fasta file, `sample` is the sample name, and `condition` is the condition for a given sample (e.g., tumor, normal).
+
+You can use the `--upset` flag to output an upset plot like so::
+
+    tcdo_pg_tools merge-fasta -i input.csv --upset
+
+The upset plot will be plotted across the `condition` column.
+
+Merging proteomegenerator results across multiple samples
+---------------------------------------------------------
+To merge multiple proteomegenerator results, use `merge-pg-results`. It behaves in the same way as `merge-fasta`,
+except that it uses the `protein.tsv` output of Philosopher to keep only unique proteins (those with no indistinguishable proteins).
+For example, you can run::
+
+    tcdo_pg_tools merge-pg-results -i input.csv --upset
+
+Here, the input.csv must have four columns: fasta, protein_table, sample, condition.
+`fasta` is the path to the protein fasta file, `protein_table` is the `protein.tsv` file that is output by Philosopher (in the Fragpipe output directory),
+`sample` is the sample name, and `condition` is the condition for a given sample (e.g., tumor, normal).
diff --git a/src/tcdo_pg_tools/cli.py b/src/tcdo_pg_tools/cli.py
@@ -1,7 +1,8 @@
 import click
 from tcdo_pg_tools.fusion_merge import fusion_merge
 from tcdo_pg_tools.coverage_calculator import coverage_calculator
-from tcdo_pg_tools.fasta_merge import fasta_merge
+from tcdo_pg_tools.merge_pg_results import merge_pg_results
+from tcdo_pg_tools.merge_fasta import merge_fasta
 
 @click.group()
 def cli():
@@ -10,7 +11,8 @@ def cli():
 
 cli.add_command(fusion_merge)
 cli.add_command(coverage_calculator)
-cli.add_command(fasta_merge)
+cli.add_command(merge_pg_results)
+cli.add_command(merge_fasta)
 
 if __name__ == "__main__":
     cli()
diff --git a/src/tcdo_pg_tools/merge_fasta.py b/src/tcdo_pg_tools/merge_fasta.py
@@ -0,0 +1,23 @@
+import click
+from tcdo_pg_tools.merge_pg_results import merge_pg_results
+
+@click.command()
+@click.option('-i', '--input_csv', required=True, type=click.Path(exists=True),
+              help='three column csv (fasta: fasta path, '
+                   'name: sample name, condition: condition)')
+@click.option('-t', '--info_table', required=False,
+              default='info_table.tsv',
+              type=click.Path(), help="Path to index tsv for merged protein IDs")
+@click.option('-fa','--merged_fasta', required=False,
+              type=click.Path(), default='merged.fasta',
+              help="Path to merged fasta file")
+@click.option('--upset', is_flag=True, default=False, help="plot upset")
+@click.option('--upset_path', required=False,
+              type=click.Path(), default='upset_plot.svg',
+              help="Path to upset plot")
+def merge_fasta(input_csv, info_table, merged_fasta, upset, upset_path):
+    """merge multiple fasta on sequence identity"""
+    # merge_pg_results is a click.Command (it is decorated with @click.command()), so calling
+    # it directly would launch the CLI parser rather than the function body. Use .callback,
+    # the undecorated function; unique_proteins=False skips the protein.tsv filtering step.
+    merge_pg_results.callback(input_csv, info_table, merged_fasta, upset, upset_path, unique_proteins=False)
diff --git a/src/tcdo_pg_tools/merge_pg_results.py b/src/tcdo_pg_tools/merge_pg_results.py
@@ -1,15 +1,10 @@
 #!/usr/bin/env python3
 """
 author: Asher Preska Steinberg
-merge multiple fasta on sequence identity
+merge proteomegenerator results across multiple samples on AA seq identity
 """
 import pandas as pd
-import glob as bob
-import os
-import numpy as np
 from tqdm import tqdm
-import itertools
-from numba import njit
 import click
 from marsilea.upset import Upset, UpsetData
 import matplotlib.pyplot as plt
@@ -54,7 +49,9 @@ def plot_upset(countdat, upset_path):
 
 @click.command()
 @click.option('-i', '--input_csv', required=True, type=click.Path(exists=True),
-              help='three column format csv (path: fasta path, name: sample name, condition: condition)')
+              help='four column csv (fasta: fasta path, '
+                   'protein_table: protein.tsv path, '
+                   'name: sample name, condition: condition)')
 @click.option('-t', '--info_table', required=False,
               default='info_table.tsv',
               type=click.Path(), help="Path to index tsv for merged protein IDs")
@@ -65,17 +62,9 @@ def plot_upset(countdat, upset_path):
 @click.option('--upset_path', required=False,
               type=click.Path(), default='upset_plot.svg',
               help="Path to upset plot")
-def fasta_merge(input_csv, info_table, merged_fasta, upset, upset_path):
+def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, unique_proteins=True):
     """
-    merge multiple fasta on sequence identity
-    Args:
-        input_csv: input csv with two columns (path: path to fasta, name: sample name, condition: condition (e.g., tumor, normal))
-        info_table: path to index tsv for merged protein IDs (default: info_table.tsv)
-        merged_fasta: path to merged fasta file (default: merged.fasta)
-        upset: plot upset for different conditions (default: False)
-        upset_path: path to upset plot (default: upset_plot.svg)
-    Returns:
-
+    merge proteomegenerator results across multiple samples on AA seq identity
     """
     # read in metadata
     metadata = pd.read_csv(input_csv)
@@ -89,6 +78,14 @@ def fasta_merge(input_csv, info_table, merged_fasta, upset, upset_path):
         # load in the protein fasta file as well
         seqdat = fasta2df(fasta, sample=sample)
         seqdat["condition"] = condition
+        # filter for unique proteins
+        if unique_proteins:
+            philosopher_path = row["protein_table"]
+            philosopher_dat = pd.read_csv(philosopher_path, sep="\t")
+            philosopher_dat =  philosopher_dat[ philosopher_dat["Indistinguishable Proteins"].isna()]
+            unique_proteins = list(philosopher_dat["Protein"])
+            seqdat = seqdat[seqdat["protein"].isin(unique_proteins)]
+        # append sample to dataframe
         protein_dat = pd.concat([protein_dat, seqdat])
     # perform groupby
     grouped = protein_dat.groupby(by=["seq"])