Skip to content

Commit 074ad20

Browse files
committed
initial commit for fasta_merge.py
1 parent 285f4ab commit 074ad20

File tree

5 files changed

+133
-2
lines changed

5 files changed

+133
-2
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "tcdo_pg_tools"
7-
version = "0.0.3-beta1"
7+
version = "0.0.4-beta"
88
description = "Commandline tools to support proteogenomics analyses at MSK TCDO and beyond"
99
readme = "README.rst"
1010
authors = [

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@ numpy
55
click
66
tqdm
77
numba
8+
marsilea
9+
matplotlib

src/tcdo_pg_tools/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22

33
__author__ = """Asher Preska Steinberg"""
44
__email__ = '[email protected]'
5-
__version__ = '0.0.1'
5+
__version__ = '0.0.4'

src/tcdo_pg_tools/cli.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import click
22
from tcdo_pg_tools.fusion_merge import fusion_merge
33
from tcdo_pg_tools.coverage_calculator import coverage_calculator
4+
from tcdo_pg_tools.fasta_merge import fasta_merge
45

56
@click.group()
67
def cli():
@@ -9,6 +10,7 @@ def cli():
910

1011
cli.add_command(fusion_merge)
1112
cli.add_command(coverage_calculator)
13+
cli.add_command(fasta_merge)
1214

1315
if __name__ == "__main__":
1416
cli()

src/tcdo_pg_tools/fasta_merge.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#!/usr/bin/env python3
2+
"""
3+
author: Asher Preska Steinberg
4+
merge multiple fasta on sequence identity
5+
"""
6+
import pandas as pd
7+
import glob as bob
8+
import os
9+
import numpy as np
10+
from tqdm import tqdm
11+
import itertools
12+
from numba import njit
13+
import click
14+
from marsilea.upset import Upset, UpsetData
15+
import matplotlib.pyplot as plt
16+
17+
def fasta2df(uniprotfastapath, sample="swissprot"):
18+
"""
19+
fasta to pandas dataframe
20+
"""
21+
with open(uniprotfastapath, "r") as file:
22+
uniprotfasta = file.readlines()
23+
### make it a dataframe ...
24+
status = [] ## canonical, non-canonical, fusion ...
25+
IDs = []
26+
seqs = []
27+
i = 0
28+
for line in uniprotfasta:
29+
if line.startswith(">"):
30+
terms = line.split(" ")
31+
ID = terms[0]
32+
_, ID = ID.split(">")
33+
IDs.append(ID.strip())
34+
else:
35+
seqs.append(line.strip())
36+
status.append(sample)
37+
uniprotseqdat = pd.DataFrame(zip(IDs, status, seqs), columns = ["protein", "sample", "seq"])
38+
uniprotseqdat.reset_index(drop=True, inplace=True)
39+
return uniprotseqdat
40+
41+
def joinset(IDs, sort=False):
42+
ID = list(set(IDs))
43+
if sort:
44+
ID.sort()
45+
ID = ','.join(ID)
46+
return ID
47+
48+
def plot_upset(countdat, upset_path):
49+
upset_data = UpsetData.from_memberships(countdat.conditions.str.split(","))
50+
us = Upset(upset_data, min_cardinality=15)
51+
us.render()
52+
plt.savefig(upset_path, dpi=300, bbox_inches="tight")
53+
return
54+
55+
@click.command()
56+
@click.option('-i', '--input_csv', required=True, type=click.Path(exists=True),
57+
help='three column format csv (path: fasta path, name: sample name, condition: condition)')
58+
@click.option('-t', '--info_table', required=False,
59+
default='info_table.tsv',
60+
type=click.Path(), help="Path to index tsv for merged protein IDs")
61+
@click.option('-fa','--merged_fasta', required=False,
62+
type=click.Path(), default='merged.fasta',
63+
help="Path to merged fasta file")
64+
@click.option('--upset', is_flag=True, default=False, help="plot upset")
65+
@click.option('--upset_path', required=False,
66+
type=click.Path(), default='upset_plot.svg',
67+
help="Path to upset plot")
68+
def fasta_merge(input_csv, info_table, merged_fasta, upset, upset_path):
69+
"""
70+
merge multiple fasta on sequence identity
71+
Args:
72+
input_csv: input csv with two columns (path: path to fasta, name: sample name, condition: condition (e.g., tumor, normal))
73+
info_table: path to index tsv for merged protein IDs (default: info_table.tsv)
74+
merged_fasta: path to merged fasta file (default: merged.fasta)
75+
upset: plot upset for different conditions (default: False)
76+
upset_path: path to upset plot (default: upset_plot.svg)
77+
Returns:
78+
79+
"""
80+
# read in metadata
81+
metadata = pd.read_csv(input_csv)
82+
# initialize dataframe for merging fasta
83+
protein_dat = pd.DataFrame()
84+
print("loading fasta files ...")
85+
for _, row in metadata.iterrows():
86+
fasta = row["fasta"]
87+
sample = row["sample"]
88+
condition = row["condition"]
89+
# load in the protein fasta file as well
90+
seqdat = fasta2df(fasta, sample=sample)
91+
seqdat["condition"] = condition
92+
protein_dat = pd.concat([protein_dat, seqdat])
93+
# perform groupby
94+
grouped = protein_dat.groupby(by=["seq"])
95+
# reorganize to some comprehensible output
96+
data = []
97+
i = 1
98+
print("creating final dataframe ...")
99+
for seq, group in tqdm(grouped):
100+
# get samples, conditions, protein_ids, number of occurrences
101+
samples = joinset(group["sample"])
102+
conditions = joinset(group["condition"], sort=True)
103+
protein_ids = joinset(group["protein"])
104+
# store data
105+
data.append({
106+
"sequence": seq,
107+
"Protein_ids": protein_ids,
108+
"unique_protein_id": f"PG{i}", # give protein a unique identifier
109+
"samples": samples,
110+
"conditions": conditions,
111+
"sample_count": len(group)
112+
})
113+
i = i+1
114+
# write dataframe to tsv
115+
countdat = pd.DataFrame(data)
116+
countdat.to_csv(info_table, sep="\t", index=False)
117+
# write to merged fasta file
118+
with open(merged_fasta, "w+") as f:
119+
for _, row in countdat.iterrows():
120+
id = row["unique_protein_id"]
121+
seq = row["sequence"]
122+
f.write(f">{id}\n")
123+
f.write(f"{seq}\n")
124+
# plot upset plot:
125+
if upset:
126+
plot_upset(countdat, upset_path)
127+
return

0 commit comments

Comments
 (0)