-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranscriptResolver.py
More file actions
164 lines (130 loc) · 7.15 KB
/
transcriptResolver.py
File metadata and controls
164 lines (130 loc) · 7.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import csv
import os
import itertools as it
import numpy as np
import re # regular expressions
import argparse #For command line arguments
from consensus_tools import * # custom functions to run transcript resolving
from collections import defaultdict # utility functions to create dictionaries
import pandas as pd # data frame functionality
from fuzzywuzzy import process, fuzz # Functions that are useful for fuzzy string matching (https://github.com/seatgeek/fuzzywuzzy)
from functools import reduce # for the reduce function
import webbrowser
import copy
class transcriptResolver:
    """Gather the run configuration and load the transcription table.

    Every setting is taken from the parsed command-line arguments when it was
    supplied there; otherwise it is requested interactively with ``input()``.

    Attributes set by ``__init__``:
        stem: Prefix for output file names; always ends with an underscore.
        wd: Working directory that contains the input file.
        col_id: Name of the column holding the unique record ID.
        col_target: List of column names to be resolved.
        col_method: List of method names, parallel to ``col_target``.
        file: ``pandas.DataFrame`` restricted to the ID and target columns,
            read with ``dtype=object`` and with missing values replaced by
            empty strings.
    """

    def __init__(self, args): # __init__ always run when an instance of the class is created
        """Resolve all settings from ``args`` (or the console) and read the CSV.

        Args:
            args: Object exposing the attributes ``stem``, ``wd``, ``file``,
                ``col_id``, ``col_target`` and ``col_method`` (for example an
                ``argparse.Namespace``). Falsy values trigger a prompt.
        """
        ## Define stem name ========================
        if args.stem:
            self.stem = args.stem
        else:
            print("\nPlease input a 'stem' name to act as a prefix to all output (e.g., stemname_calbug.csv)")
            self.stem = input("Stem name: ")
        self.stem = self.stem + "_"
        print("\nStem name for all outputs will be '" + self.stem + "'")

        ## Define working directory ========================
        if args.wd:
            self.wd = args.wd
        else:
            self.wd = input("Working directory: ")
        print("\nUsing working directory '" + self.wd + "' ...")

        ## Define transcription file ========================
        if args.file:
            tempfile = args.file
        else:
            tempfile = input("\nInput your working file name. \nWorking file should be in your stated working directory:")
        filedir = os.path.join(self.wd, tempfile)
        print("\nFile directory will be '" + filedir + "'")

        ## Define id column ========================
        ##todo## need to check if ID column is in the file
        if args.col_id:
            self.col_id = args.col_id
        else:
            self.col_id = input("\nDefine the column name specifying unique IDs (e.g., specimen ID):")
        print("\nColumn name that specifies unique ID is " + self.col_id)

        ## Define target columns ========================
        ##todo## need to check if target columns are in the file
        targetlist = []
        methodlist = []
        if args.col_target and args.col_method:
            # Expected form is "[a,b,c]"; drop the surrounding brackets and split.
            targetlist = args.col_target.strip("[|]").split(",")
            methodlist = args.col_method.strip("[|]").split(",")
            if len(targetlist) != len(methodlist):
                # Every target needs exactly one method; rather than leaving the
                # attributes undefined, discard the input and ask interactively.
                print("\nThe number of target columns and methods supplied do not match; please define them interactively.")
                targetlist = []
                methodlist = []
        if not targetlist:
            while True:
                temp_target = input("\nDefine the column name to be resolved:\n (Enter nothing to continue to the next step)")
                temp_method = input("\nPlease define the method for which you would like to use on this column: \n(Enter nothing to continue to the next step)")
                if temp_method == "" or temp_target == "":
                    break
                targetlist.append(temp_target)
                # Record the method itself (not the column name) for this target.
                methodlist.append(temp_method)
        self.col_target = targetlist
        self.col_method = methodlist
        for x, y in zip(self.col_target, self.col_method):
            print("\nUsing method", y, "for column", x)

        ## Import file ========================
        # Build a new list so that self.col_target is not altered.
        allcols = list(self.col_target) + [self.col_id]
        self.file = pd.read_csv(filedir, dtype=object,
                                encoding="ISO-8859-1",
                                usecols=allcols) # only use columns that were supplied
        self.file = self.file.fillna("") # Converts all NaNs into empty strings for alignment
## MAIN ##
def main():
    """Run the command-line workflow.

    Parses the command line with the module-level ``parser``. With
    ``--version`` the version string is printed; with ``--manual`` the project
    page is opened in a browser. Otherwise a banner is printed, the settings
    and data are collected through ``transcriptResolver``, each target column
    is processed with its requested method, the per-column results are merged
    on the ID column and written to ``<wd>/<stem>transcript.csv``.

    Columns whose method is not one of ``vote_count``, ``consensus`` or
    ``metadata`` are reported and skipped. If no column could be processed,
    nothing is written.
    """
    args = parser.parse_args()
    if args.version:
        print("v1.0")
        return
    if args.manual:
        webbrowser.open("https://github.com/junyinglim/TranscriptResolver")
        return

    print("\n\n\n")
    print("=" * 50)
    print("WELCOME TO TRANSCRIPT RESOLVER!!")
    print("Let's resolve some replicate transcripts! \n")
    print("Please visit https://github.com/junyinglim/Notes-from-Nature \nfor a short explanation of the transcript resolution methods available \n")
    print("This crude program was written by Jun Ying Lim (junyinglim@gmail.com) \nfor the Essig Museum of Entomology at UC Berkeley")
    print("=" * 50)
    print("\n\n\n")

    ## Startup
    currentArgs = transcriptResolver(args)

    # One result data frame per successfully processed target column
    results = []
    for target, method in zip(currentArgs.col_target, currentArgs.col_method):
        if method == "vote_count":
            df = vote_count(accession=currentArgs.col_id,
                            field=target,
                            data=currentArgs.file)
        elif method == "consensus":
            df = variant_consensus(accession=currentArgs.col_id,
                                   field=target,
                                   align_method="character",
                                   consensus_method="dumber",
                                   wdir=currentArgs.wd,
                                   data=currentArgs.file)
        elif method == "metadata":
            df = metadata_handling(accession=currentArgs.col_id,
                                   field=target,
                                   data=currentArgs.file)
        else:
            # Skip this column: there is no result for it, and appending here
            # would either fail or re-add the previous column's data frame.
            print("Sorry, method supplied is not valid")
            print("Skipping column", target, "(method supplied:", method + ")")
            continue
        # Add data frame to the results list
        results.append(df)

    if not results:
        # reduce() without an initial value fails on an empty list.
        print("\nNo columns were resolved; nothing to export")
        return

    # Merge results
    allResults = reduce(lambda a, d: pd.merge(a, d, on=currentArgs.col_id), results)
    finalDir = os.path.join(currentArgs.wd, currentArgs.stem + "transcript.csv")
    allResults.to_csv(finalDir, index=False)
    print("\nExporting results to", finalDir)
if __name__ == '__main__':
    # Build the command-line interface. `parser` is deliberately a module-level
    # name because main() reads it to parse the arguments.
    parser = argparse.ArgumentParser(description="TranscriptResolver - resolve replicate transcriptions.", epilog="Help at https://github.com/junyinglim/TranscriptResolver - written by Jun Ying Lim")
    parser.add_argument("--version", action="store_true", help="Display version information.")
    parser.add_argument("--manual", action="store_true", help="(Attempt to) open browser and show help")
    parser.add_argument("-stem", "-n", help="'Stem' name for all output files.") # for command line
    parser.add_argument("-wd", help="Working directory")
    parser.add_argument("-file", "-f", help="File with transcriptions")
    parser.add_argument("-col_id", help="Name of the column specifying unique IDs (e.g., specimen ID)")
    parser.add_argument("-col_target", help="Target column. Must be in the format -col_target [target1,target2,target3]")
    parser.add_argument("-col_method", help="Method. Must be in the format -col_method [method1,method2,method3]")
    main()
##todo## logging the results