Skip to content

Commit 75612dc

Browse files
committed
commit for merge_fasta.py and merge_pg_results.py as well as updated docs
1 parent 074ad20 commit 75612dc

File tree

4 files changed

+69
-19
lines changed

4 files changed

+69
-19
lines changed

docs/usage.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,31 @@ To compute AA seq coverage across different combinations of enzymes, use the `co
1919

2020
The fragpipe output directory must contain subdirectories with the result of each enzyme,
2121
and these directories must be labeled starting with the enzyme name (e.g., `trypsin_diaPASEF`)
22+
23+
Merging multiple proteomegenerator fasta
24+
----------------------------------------
25+
To merge multiple proteomegenerator fasta files, collapsing proteins
26+
with the same amino acid sequence, use the `merge-fasta` feature::
27+
28+
tcdo_pg_tools merge-fasta -i input.csv
29+
30+
The input.csv must have three columns: fasta, sample, condition.
31+
`fasta` is the path to the protein fasta file, `sample` is the sample name, and `condition` is the condition for a given sample (e.g., tumor, normal).
32+
33+
You can use the `--upset` flag to output an upset plot like so::
34+
35+
tcdo_pg_tools merge-fasta -i input.csv --upset
36+
37+
The upset plot will be plotted across the `condition` column.
38+
39+
Merging proteomegenerator results across multiple samples
40+
---------------------------------------------------------
41+
To merge multiple proteomegenerator results, use `merge-pg-results`, which behaves in the same way
42+
as `merge-fasta`, except it filters on the protein.tsv output of philosopher to identify unique proteins.
43+
So you can run::
44+
45+
tcdo_pg_tools merge-pg-results -i input.csv --upset
46+
47+
Here, the input.csv must have four columns: fasta, protein_table, sample, condition.
48+
`fasta` is the path to the protein fasta file, `protein_table` is the `protein.tsv` file that is output by Philosopher (in the Fragpipe output directory),
49+
`sample` is the sample name, and `condition` is the condition for a given sample (e.g., tumor, normal).

src/tcdo_pg_tools/cli.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import click
22
from tcdo_pg_tools.fusion_merge import fusion_merge
33
from tcdo_pg_tools.coverage_calculator import coverage_calculator
4-
from tcdo_pg_tools.fasta_merge import fasta_merge
4+
from tcdo_pg_tools.merge_pg_results import merge_pg_results
5+
from tcdo_pg_tools.merge_fasta import merge_fasta
56

67
@click.group()
78
def cli():
@@ -10,7 +11,8 @@ def cli():
1011

1112
cli.add_command(fusion_merge)
1213
cli.add_command(coverage_calculator)
13-
cli.add_command(fasta_merge)
14+
cli.add_command(merge_pg_results)
15+
cli.add_command(merge_fasta)
1416

1517
if __name__ == "__main__":
1618
cli()

src/tcdo_pg_tools/merge_fasta.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import click
2+
from tcdo_pg_tools.merge_pg_results import merge_pg_results
3+
4+
@click.command()
@click.option('-i', '--input_csv', required=True, type=click.Path(exists=True),
              help='three column csv (fasta: fasta path, '
                   'name: sample name, condition: condition)')
@click.option('-t', '--info_table', required=False,
              default='info_table.tsv',
              type=click.Path(), help="Path to index tsv for merged protein IDs")
@click.option('-fa','--merged_fasta', required=False,
              type=click.Path(), default='merged.fasta',
              help="Path to merged fasta file")
@click.option('--upset', is_flag=True, default=False, help="plot upset")
@click.option('--upset_path', required=False,
              type=click.Path(), default='upset_plot.svg',
              help="Path to upset plot")
def merge_fasta(input_csv, info_table, merged_fasta, upset, upset_path):
    """
    merge multiple fasta on sequence identity
    """
    # NOTE: the docstring above is what click prints for `--help`, so it is
    # deliberately kept to a single summary line.
    #
    # Parameters (all supplied by the click options declared above):
    #   input_csv:    path to the input csv describing the fasta files
    #   info_table:   output path of the index tsv for merged protein IDs
    #   merged_fasta: output path of the merged fasta file
    #   upset:        whether to draw an upset plot
    #   upset_path:   output path of the upset plot
    #
    # `merge_pg_results` is wrapped by `@click.command()`, so the imported
    # name is a click.Command rather than a plain function. Calling the
    # Command directly would dispatch to `Command.main()` and treat the first
    # positional argument as an argv list. `.callback` is the undecorated
    # function, which accepts these parameters (and `unique_proteins`)
    # directly.
    merge_pg_results.callback(
        input_csv=input_csv,
        info_table=info_table,
        merged_fasta=merged_fasta,
        upset=upset,
        upset_path=upset_path,
        # plain fasta merge: skip the protein.tsv based filtering step
        unique_proteins=False,
    )
Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,10 @@
11
#!/usr/bin/env python3
22
"""
33
author: Asher Preska Steinberg
4-
merge multiple fasta on sequence identity
4+
merge proteomegenerator results across multiple samples on AA seq identity
55
"""
66
import pandas as pd
7-
import glob as bob
8-
import os
9-
import numpy as np
107
from tqdm import tqdm
11-
import itertools
12-
from numba import njit
138
import click
149
from marsilea.upset import Upset, UpsetData
1510
import matplotlib.pyplot as plt
@@ -54,7 +49,9 @@ def plot_upset(countdat, upset_path):
5449

5550
@click.command()
5651
@click.option('-i', '--input_csv', required=True, type=click.Path(exists=True),
57-
help='three column format csv (path: fasta path, name: sample name, condition: condition)')
52+
help='four column csv (fasta: fasta path, '
53+
'protein_table: protein.tsv path, '
54+
'name: sample name, condition: condition)')
5855
@click.option('-t', '--info_table', required=False,
5956
default='info_table.tsv',
6057
type=click.Path(), help="Path to index tsv for merged protein IDs")
@@ -65,17 +62,9 @@ def plot_upset(countdat, upset_path):
6562
@click.option('--upset_path', required=False,
6663
type=click.Path(), default='upset_plot.svg',
6764
help="Path to upset plot")
68-
def fasta_merge(input_csv, info_table, merged_fasta, upset, upset_path):
65+
def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, unique_proteins=True):
6966
"""
70-
merge multiple fasta on sequence identity
71-
Args:
72-
input_csv: input csv with two columns (path: path to fasta, name: sample name, condition: condition (e.g., tumor, normal))
73-
info_table: path to index tsv for merged protein IDs (default: info_table.tsv)
74-
merged_fasta: path to merged fasta file (default: merged.fasta)
75-
upset: plot upset for different conditions (default: False)
76-
upset_path: path to upset plot (default: upset_plot.svg)
77-
Returns:
78-
67+
merge proteomegenerator results across multiple samples on AA seq identity
7968
"""
8069
# read in metadata
8170
metadata = pd.read_csv(input_csv)
@@ -89,6 +78,14 @@ def fasta_merge(input_csv, info_table, merged_fasta, upset, upset_path):
8978
# load in the protein fasta file as well
9079
seqdat = fasta2df(fasta, sample=sample)
9180
seqdat["condition"] = condition
81+
# filter for unique proteins
82+
if unique_proteins:
83+
philosopher_path = row["protein_table"]
84+
philosopher_dat = pd.read_csv(philosopher_path, sep="\t")
85+
philosopher_dat = philosopher_dat[ philosopher_dat["Indistinguishable Proteins"].isna()]
86+
unique_proteins = list(philosopher_dat["Protein"])
87+
seqdat = seqdat[seqdat["protein"].isin(unique_proteins)]
88+
# append sample to dataframe
9289
protein_dat = pd.concat([protein_dat, seqdat])
9390
# perform groupby
9491
grouped = protein_dat.groupby(by=["seq"])

0 commit comments

Comments
 (0)