|
4 | 4 | merge proteomegenerator FASTA files and results across multiple samples on amino acid (AA) sequence identity |
5 | 5 | """ |
6 | 6 | import os |
| 7 | +import re |
7 | 8 | import pandas as pd |
8 | 9 | from tqdm import tqdm |
9 | 10 | import click |
@@ -150,8 +151,8 @@ def merge_proteome(input_csv, info_table, merged_fasta, upset, |
150 | 151 | # also filter out decoys and contaminants |
151 | 152 | if filter != "": |
152 | 153 | prefixes = filter.split(",") |
153 | | - filter_list = "|".join(prefixes) |
154 | | - countdat = countdat[~countdat["protein_ids"].str.contains(filter_list)] |
| 154 | + pattern = "|".join(re.escape(p) for p in prefixes) |
| 155 | + countdat = countdat[~countdat["protein_ids"].str.contains(pattern)] |
155 | 156 | # write to merged fasta file |
156 | 157 | with open(merged_fasta, "w+") as f: |
157 | 158 | for _, row in countdat.iterrows(): |
@@ -179,46 +180,52 @@ def merge_proteome(input_csv, info_table, merged_fasta, upset, |
179 | 180 | 'name: sample name, condition: condition)') |
180 | 181 | @click.option('-t', '--info_table', required=False, |
181 | 182 | default='info_table.tsv', |
| 183 | + show_default=True, |
182 | 184 | type=click.Path(), help="Path to index tsv for merged protein IDs") |
183 | 185 | @click.option('-fa','--merged_fasta', required=False, |
184 | 186 | type=click.Path(), default='merged.fasta', |
| 187 | + show_default=True, |
185 | 188 | help="Path to merged fasta file") |
186 | 189 | @click.option('--upset', is_flag=True, default=False, help="plot upset") |
187 | 190 | @click.option('--upset_path', required=False, |
188 | 191 | type=click.Path(), default='upset_plot.svg', |
| 192 | + show_default=True, |
189 | 193 | help="Path to upset plot") |
190 | 194 | @click.option('--filter_by_header', |
191 | | - default="contam_,rev_", |
| 195 | + default="contam_,rev_,tr|GF", |
| 196 | + show_default=True, |
192 | 197 | help="filter out proteins by header prefix (provide comma separated list)") |
193 | | -def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_prefix): |
| 198 | +def merge_pg_results(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_header): |
194 | 199 | """ |
195 | 200 | merge proteomegenerator results across multiple samples on AA seq identity |
196 | 201 | """ |
197 | 202 | return merge_proteome(input_csv, info_table, merged_fasta, |
198 | | - upset, upset_path, unique_proteins=True, filter=filter_by_prefix) |
| 203 | + upset, upset_path, unique_proteins=True, filter=filter_by_header) |
199 | 204 |
|
200 | 205 | @click.command() |
201 | 206 | @click.option('-i', '--input_csv', required=True, type=click.Path(exists=True), |
202 | 207 | help='three column csv (fasta: fasta path, ' |
203 | 208 | 'name: sample name, condition: condition)') |
204 | 209 | @click.option('-t', '--info_table', required=False, |
205 | 210 | default='info_table.tsv', |
| 211 | + show_default=True, |
206 | 212 | type=click.Path(), help="Path to index tsv for merged protein IDs") |
207 | 213 | @click.option('-fa','--merged_fasta', required=False, |
208 | 214 | type=click.Path(), default='merged.fasta', |
209 | | - help="Path to merged fasta file") |
| 215 | + show_default=True, help="Path to merged fasta file") |
210 | 216 | @click.option('--upset', is_flag=True, default=False, help="plot upset") |
211 | 217 | @click.option('--upset_path', required=False, |
212 | 218 | type=click.Path(), default='upset_plot.svg', |
213 | | - help="Path to upset plot") |
| 219 | + show_default=True, help="Path to upset plot") |
214 | 220 | @click.option('--filter_by_header', |
215 | | - default="contam_,rev_", |
| 221 | + default="contam_,rev_,tr|GF", |
| 222 | + show_default=True, |
216 | 223 | help="filter out proteins by header prefix (provide comma separated list)") |
217 | | -def merge_fasta(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_prefix): |
| 224 | +def merge_fasta(input_csv, info_table, merged_fasta, upset, upset_path, filter_by_header): |
218 | 225 | """ |
219 | 226 | merge multiple fasta on sequence identity |
220 | 227 | """ |
221 | | - return merge_proteome(input_csv, info_table, merged_fasta, upset, upset_path, unique_proteins=False, filter=filter_by_prefix) |
| 228 | + return merge_proteome(input_csv, info_table, merged_fasta, upset, upset_path, unique_proteins=False, filter=filter_by_header) |
222 | 229 |
|
223 | 230 | if __name__ == "__main__": |
224 | 231 | merge_pg_results() # or merge_fasta() if you're testing that |
|
0 commit comments