|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +author: Asher Preska Steinberg |
| 4 | +merge multiple fasta on sequence identity |
| 5 | +""" |
| 6 | +import pandas as pd |
| 7 | +import glob as bob |
| 8 | +import os |
| 9 | +import numpy as np |
| 10 | +from tqdm import tqdm |
| 11 | +import itertools |
| 12 | +from numba import njit |
| 13 | +import click |
| 14 | +from marsilea.upset import Upset, UpsetData |
| 15 | +import matplotlib.pyplot as plt |
| 16 | + |
| 17 | +def fasta2df(uniprotfastapath, sample="swissprot"): |
| 18 | + """ |
| 19 | + fasta to pandas dataframe |
| 20 | + """ |
| 21 | + with open(uniprotfastapath, "r") as file: |
| 22 | + uniprotfasta = file.readlines() |
| 23 | + ### make it a dataframe ... |
| 24 | + status = [] ## canonical, non-canonical, fusion ... |
| 25 | + IDs = [] |
| 26 | + seqs = [] |
| 27 | + i = 0 |
| 28 | + for line in uniprotfasta: |
| 29 | + if line.startswith(">"): |
| 30 | + terms = line.split(" ") |
| 31 | + ID = terms[0] |
| 32 | + _, ID = ID.split(">") |
| 33 | + IDs.append(ID.strip()) |
| 34 | + else: |
| 35 | + seqs.append(line.strip()) |
| 36 | + status.append(sample) |
| 37 | + uniprotseqdat = pd.DataFrame(zip(IDs, status, seqs), columns = ["protein", "sample", "seq"]) |
| 38 | + uniprotseqdat.reset_index(drop=True, inplace=True) |
| 39 | + return uniprotseqdat |
| 40 | + |
| 41 | +def joinset(IDs, sort=False): |
| 42 | + ID = list(set(IDs)) |
| 43 | + if sort: |
| 44 | + ID.sort() |
| 45 | + ID = ','.join(ID) |
| 46 | + return ID |
| 47 | + |
| 48 | +def plot_upset(countdat, upset_path): |
| 49 | + upset_data = UpsetData.from_memberships(countdat.conditions.str.split(",")) |
| 50 | + us = Upset(upset_data, min_cardinality=15) |
| 51 | + us.render() |
| 52 | + plt.savefig(upset_path, dpi=300, bbox_inches="tight") |
| 53 | + return |
| 54 | + |
| 55 | +@click.command() |
| 56 | +@click.option('-i', '--input_csv', required=True, type=click.Path(exists=True), |
| 57 | + help='three column format csv (path: fasta path, name: sample name, condition: condition)') |
| 58 | +@click.option('-t', '--info_table', required=False, |
| 59 | + default='info_table.tsv', |
| 60 | + type=click.Path(), help="Path to index tsv for merged protein IDs") |
| 61 | +@click.option('-fa','--merged_fasta', required=False, |
| 62 | + type=click.Path(), default='merged.fasta', |
| 63 | + help="Path to merged fasta file") |
| 64 | +@click.option('--upset', is_flag=True, default=False, help="plot upset") |
| 65 | +@click.option('--upset_path', required=False, |
| 66 | + type=click.Path(), default='upset_plot.svg', |
| 67 | + help="Path to upset plot") |
| 68 | +def fasta_merge(input_csv, info_table, merged_fasta, upset, upset_path): |
| 69 | + """ |
| 70 | + merge multiple fasta on sequence identity |
| 71 | + Args: |
| 72 | + input_csv: input csv with two columns (path: path to fasta, name: sample name, condition: condition (e.g., tumor, normal)) |
| 73 | + info_table: path to index tsv for merged protein IDs (default: info_table.tsv) |
| 74 | + merged_fasta: path to merged fasta file (default: merged.fasta) |
| 75 | + upset: plot upset for different conditions (default: False) |
| 76 | + upset_path: path to upset plot (default: upset_plot.svg) |
| 77 | + Returns: |
| 78 | +
|
| 79 | + """ |
| 80 | + # read in metadata |
| 81 | + metadata = pd.read_csv(input_csv) |
| 82 | + # initialize dataframe for merging fasta |
| 83 | + protein_dat = pd.DataFrame() |
| 84 | + print("loading fasta files ...") |
| 85 | + for _, row in metadata.iterrows(): |
| 86 | + fasta = row["fasta"] |
| 87 | + sample = row["sample"] |
| 88 | + condition = row["condition"] |
| 89 | + # load in the protein fasta file as well |
| 90 | + seqdat = fasta2df(fasta, sample=sample) |
| 91 | + seqdat["condition"] = condition |
| 92 | + protein_dat = pd.concat([protein_dat, seqdat]) |
| 93 | + # perform groupby |
| 94 | + grouped = protein_dat.groupby(by=["seq"]) |
| 95 | + # reorganize to some comprehensible output |
| 96 | + data = [] |
| 97 | + i = 1 |
| 98 | + print("creating final dataframe ...") |
| 99 | + for seq, group in tqdm(grouped): |
| 100 | + # get samples, conditions, protein_ids, number of occurrences |
| 101 | + samples = joinset(group["sample"]) |
| 102 | + conditions = joinset(group["condition"], sort=True) |
| 103 | + protein_ids = joinset(group["protein"]) |
| 104 | + # store data |
| 105 | + data.append({ |
| 106 | + "sequence": seq, |
| 107 | + "Protein_ids": protein_ids, |
| 108 | + "unique_protein_id": f"PG{i}", # give protein a unique identifier |
| 109 | + "samples": samples, |
| 110 | + "conditions": conditions, |
| 111 | + "sample_count": len(group) |
| 112 | + }) |
| 113 | + i = i+1 |
| 114 | + # write dataframe to tsv |
| 115 | + countdat = pd.DataFrame(data) |
| 116 | + countdat.to_csv(info_table, sep="\t", index=False) |
| 117 | + # write to merged fasta file |
| 118 | + with open(merged_fasta, "w+") as f: |
| 119 | + for _, row in countdat.iterrows(): |
| 120 | + id = row["unique_protein_id"] |
| 121 | + seq = row["sequence"] |
| 122 | + f.write(f">{id}\n") |
| 123 | + f.write(f"{seq}\n") |
| 124 | + # plot upset plot: |
| 125 | + if upset: |
| 126 | + plot_upset(countdat, upset_path) |
| 127 | + return |
0 commit comments