# TranscriptResolver/transcriptResolver.py at master · junyinglim/TranscriptResolver · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

import csv
import os
import itertools as it
import numpy as np
import re # regular expressions
import argparse #For command line arguments

from consensus_tools import * # custom functions to run transcript resolving
from collections import defaultdict # utility functions to create dictionaries
import pandas as pd # data frame functionality
from fuzzywuzzy import process, fuzz # Functions that are useful for fuzzy string matching (https://github.com/seatgeek/fuzzywuzzy)
from functools import reduce # for the reduce function

import webbrowser
import copy

class transcriptResolver:
    """Collect the run configuration and load the transcription table.

    Every setting is taken from the parsed command-line arguments when it
    was supplied there and is otherwise requested interactively.

    Attributes set by ``__init__``:
        stem: prefix for output file names; always ends with "_".
        wd: working directory that holds the input file.
        col_id: name of the column holding the unique record ID.
        col_target: list of column names to resolve.
        col_method: list of method names, parallel to ``col_target``.
        file: pandas DataFrame restricted to the ID and target columns,
            read as strings, with missing values replaced by "".
    """

    def __init__(self, args): # __init__ always run when an instance of the class is created
        """Build the configuration from ``args`` and read the input file.

        Args:
            args: namespace with the attributes ``stem``, ``wd``, ``file``,
                ``col_id``, ``col_target`` and ``col_method``; a falsy value
                means "ask the user".

        Raises:
            ValueError: if ``col_target`` and ``col_method`` were both given
                on the command line but list a different number of entries.
        """

        ## Define stem name ========================
        if args.stem:
            self.stem = args.stem
        else:
            print("\nPlease input a 'stem' name to act as a prefix to all output (e.g., stemname_calbug.csv)")
            self.stem = input("Stem name: ")

        self.stem = self.stem + "_"

        print("\nStem name for all outputs will be '" + self.stem + "'")

        ## Define working directory ========================
        if args.wd:
            self.wd = args.wd
        else:
            self.wd = input("Working directory: ")

        print("\nUsing working directory '" + self.wd + "' ...")

        ## Define transcription file ========================
        if args.file:
            tempfile = args.file
        else:
            tempfile = input("\nInput your working file name. \nWorking file should be in your stated working directory:")

        filedir = os.path.join(self.wd, tempfile)
        print("\nFile directory will be '" + filedir + "'")

        ## Define id column ========================
        ##todo## need to check if ID column is in the file
        if args.col_id:
            self.col_id = args.col_id
        else:
            self.col_id = input("\nDefine the column name specifying unique IDs (e.g., specimen ID):")

        print("\nColumn name that specifies unique ID is " + self.col_id)

        ## Define target columns ========================
        ##todo## need to check if target columns are in the file
        if args.col_target and args.col_method:
            targets = self._parse_list(args.col_target)
            methods = self._parse_list(args.col_method)
            # Fail early with a clear message; a mismatch would otherwise
            # leave col_target / col_method unset and crash further down.
            if len(targets) != len(methods):
                raise ValueError(
                    "-col_target and -col_method must list the same number "
                    "of entries (got %d target(s) and %d method(s))"
                    % (len(targets), len(methods)))
            self.col_target = targets
            self.col_method = methods
        else:
            self.col_target, self.col_method = self._prompt_columns()

        for target, method in zip(self.col_target, self.col_method):
            print("\nUsing method", method, "for column", target)

        ## Import file ========================
        allcols = copy.copy(self.col_target) # make a copy so we don't alter self.col_target
        allcols.append(self.col_id)

        self.file = pd.read_csv(filedir, dtype=object,
                                encoding = "ISO-8859-1",
                                usecols = allcols) # only use columns that were supplied
        self.file = self.file.fillna("") # Converts all NaNs into empty strings for alignment

    @staticmethod
    def _parse_list(raw):
        """Split a "[a,b,c]" style command-line value into its items."""
        return raw.strip("[|]").split(",")

    @staticmethod
    def _prompt_columns(ask=input):
        """Ask for (column, method) pairs until either answer is empty.

        Args:
            ask: callable taking a prompt string and returning the answer;
                defaults to the built-in ``input``.

        Returns:
            A ``(targets, methods)`` tuple of two parallel lists.
        """
        targets = []
        methods = []
        while True:
            target = ask("\nDefine the column name to be resolved:\n (Enter nothing to continue to the next step)")
            method = ask("\nPlease define the method for which you would like to use on this column: \n(Enter nothing to continue to the next step)")

            if method == "" or target == "":
                break

            targets.append(target)
            methods.append(method) # record the method, not the column name
        return targets, methods


## MAIN ##
def main():
    """Run the resolver from the command line.

    Parses the arguments with the module-level ``parser``, builds a
    ``transcriptResolver`` from them, resolves every target column with its
    requested method, merges the per-column results on the ID column and
    writes them to ``<wd>/<stem>transcript.csv``.

    ``--version`` and ``--manual`` are handled on their own and return
    without starting a resolution run.

    Raises:
        ValueError: if a requested method is not one of "vote_count",
            "consensus" or "metadata".
    """
    args = parser.parse_args()
    if args.version:
        print("v1.0")
        return
    if args.manual:
        webbrowser.open("https://github.com/junyinglim/TranscriptResolver")
        return

    print("\n\n\n")
    print("=" * 50)
    print("WELCOME TO TRANSCRIPT RESOLVER!!")
    print("Let's resolve some replicate transcripts! \n")
    print("Please visit https://github.com/junyinglim/Notes-from-Nature \nfor a short explanation of the transcript resolution methods available \n")
    print("This crude program was written by Jun Ying Lim (junyinglim@gmail.com) \nfor the Essig Museum of Entomology at UC Berkeley")
    print("=" * 50)
    print("\n\n\n")

    ## Startup
    currentArgs = transcriptResolver(args)

    # Reject unknown methods before doing any work, so a typo cannot
    # silently reuse the previous column's result or crash mid-run.
    valid_methods = ("vote_count", "consensus", "metadata")
    unknown = [m for m in currentArgs.col_method if m not in valid_methods]
    if unknown:
        raise ValueError("Sorry, method supplied is not valid: "
                         + ", ".join(repr(m) for m in unknown)
                         + " (expected one of " + ", ".join(valid_methods) + ")")

    # One result data frame per target column
    results = []
    for target, method in zip(currentArgs.col_target, currentArgs.col_method):
        if method == "vote_count":
            df = vote_count(accession = currentArgs.col_id,
                            field = target,
                            data = currentArgs.file)
        elif method == "consensus":
            df = variant_consensus(accession = currentArgs.col_id,
                                   field = target,
                                   align_method = "character",
                                   consensus_method = "dumber",
                                   wdir = currentArgs.wd,
                                   data = currentArgs.file)
        else: # "metadata" -- the only remaining valid method
            df = metadata_handling(accession = currentArgs.col_id,
                                   field = target,
                                   data = currentArgs.file)

        # Add data frame to the results list
        results.append(df)

    if not results:
        # reduce() has nothing to fold when no target column was supplied
        print("\nNo target columns were supplied; nothing to resolve")
        return

    # Merge results
    allResults = reduce(lambda a, d: pd.merge(a, d, on = currentArgs.col_id), results)

    finalDir = os.path.join(currentArgs.wd, currentArgs.stem + "transcript.csv")
    allResults.to_csv(finalDir, index = False)
    print("\nExporting results to", finalDir)


if __name__ == '__main__':
    # Script entry point: build the command-line parser that main() reads
    # as a module-level name, then start the run.
    # NOTE(review): the previous description/epilog referred to phyloGenerator
    # (Will Pearse); presumably the parser scaffold was adapted from there --
    # confirm whether an attribution should be kept elsewhere.
    parser = argparse.ArgumentParser(description="TranscriptResolver - resolve replicate transcriptions.", epilog="Help at https://github.com/junyinglim/TranscriptResolver - written by Jun Ying Lim")
    parser.add_argument("--version", action="store_true", help="Display version information.")
    parser.add_argument("--manual", action="store_true", help="(Attempt to) open browser and show help")
    parser.add_argument("-stem", "-n", help="'Stem' name for all output files.") # for command line
    parser.add_argument("-wd", help = "Working directory")
    parser.add_argument("-file", "-f", help = "File with transcriptions")
    parser.add_argument("-col_id", help = "Name of the column specifying unique IDs (e.g., specimen ID)")
    parser.add_argument("-col_target", help = "Target column. Must be in the format -col_target [target1,target2,target3]")
    parser.add_argument("-col_method", help = "Method. Must be in the format -col_method [method1,method2,method3]")
    main()

##todo## logging the results