Skip to content

Commit 8350978

Browse files
committed
filter for protein headers
1 parent: 519f82c · commit: 8350978

File tree

1 file changed

+17
-10
lines changed

1 file changed

+17
-10
lines changed

src/tcdo_pg_tools/merge_proteome.py

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
merge proteomegenerator fasta and results across multiple samples on AA seq identity
55
"""
66
import os
7+
import re
78
import pandas as pd
89
from tqdm import tqdm
910
import click
@@ -150,8 +151,8 @@ def merge_proteome(input_csv, info_table, merged_fasta, upset,
150151
# also filter out decoys and contaminants
151152
if filter != "":
152153
prefixes = filter.split(",")
153-
filter_list = "|".join(prefixes)
154-
countdat = countdat[~countdat["protein_ids"].str.contains(filter_list)]
154+
pattern = "|".join(re.escape(p) for p in prefixes)
155+
countdat = countdat[~countdat["protein_ids"].str.contains(pattern)]
155156
# write to merged fasta file
156157
with open(merged_fasta, "w+") as f:
157158
for _, row in countdat.iterrows():
@@ -179,46 +180,52 @@ def merge_proteome(input_csv, info_table, merged_fasta, upset,
179180
'name: sample name, condition: condition)')
180181
@click.option('-t', '--info_table', required=False,
181182
default='info_table.tsv',
183+
show_default=True,
182184
type=click.Path(), help="Path to index tsv for merged protein IDs")
183185
@click.option('-fa','--merged_fasta', required=False,
184186
type=click.Path(), default='merged.fasta',
187+
show_default=True,
185188
help="Path to merged fasta file")
186189
@click.option('--upset', is_flag=True, default=False, help="plot upset")
187190
@click.option('--upset_path', required=False,
188191
type=click.Path(), default='upset_plot.svg',
192+
show_default=True,
189193
help="Path to upset plot")
190194
@click.option('--filter_by_header',
191-
default="contam_,rev_",
195+
default="contam_,rev_,tr|GF",
196+
show_default=True,
192197
help="filter out proteins by header prefix (provide comma separated list)")
193-
def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_prefix):
198+
def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_header):
194199
"""
195200
merge proteomegenerator results across multiple samples on AA seq identity
196201
"""
197202
return merge_proteome(input_csv, info_table, merged_fasta,
198-
upset, upset_path, unique_proteins=True, filter=filter_by_prefix)
203+
upset, upset_path, unique_proteins=True, filter=filter_by_header)
199204

200205
@click.command()
201206
@click.option('-i', '--input_csv', required=True, type=click.Path(exists=True),
202207
help='three column csv (fasta: fasta path, '
203208
'name: sample name, condition: condition)')
204209
@click.option('-t', '--info_table', required=False,
205210
default='info_table.tsv',
211+
show_default=True,
206212
type=click.Path(), help="Path to index tsv for merged protein IDs")
207213
@click.option('-fa','--merged_fasta', required=False,
208214
type=click.Path(), default='merged.fasta',
209-
help="Path to merged fasta file")
215+
show_default=True, help="Path to merged fasta file")
210216
@click.option('--upset', is_flag=True, default=False, help="plot upset")
211217
@click.option('--upset_path', required=False,
212218
type=click.Path(), default='upset_plot.svg',
213-
help="Path to upset plot")
219+
show_default=True, help="Path to upset plot")
214220
@click.option('--filter_by_header',
215-
default="contam_,rev_",
221+
default="contam_,rev_,tr|GF",
222+
show_default=True,
216223
help="filter out proteins by header prefix (provide comma separated list)")
217-
def merge_fasta(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_prefix):
224+
def merge_fasta(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_header):
218225
"""
219226
merge multiple fasta on sequence identity
220227
"""
221-
return merge_proteome(input_csv, info_table, merged_fasta, upset, upset_path, unique_proteins=False, filter=filter_by_prefix)
228+
return merge_proteome(input_csv, info_table, merged_fasta, upset, upset_path, unique_proteins=False, filter=filter_by_header)
222229

223230
if __name__ == "__main__":
224231
merge_pg_results() # or merge_fasta() if you're testing that

0 commit comments

Comments (0)