-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtree.py
More file actions
329 lines (303 loc) · 19.1 KB
/
tree.py
File metadata and controls
329 lines (303 loc) · 19.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#!/usr/bin/python
########################################################################################
# A script to do many Newick tree related things
#
# Dependencies: core
#
# Gregg Thomas, Summer 2017
########################################################################################
import sys, os, random, argparse
sys.path.append(sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/lib/"))
import core, treeparse as tp, treelib as tree
####################
parser = argparse.ArgumentParser(description="A script to do many Newick tree related things");
parser.add_argument("-i", dest="input", help="A directory containing many Newick tree files or a single file containing Newick tree(s).", default=False);
parser.add_argument("-o", dest="output", help="Desired output location. If input is a file this should be a file, if a directory this will be a directory.", default=False);
parser.add_argument("--sep", dest="tree_sep", help="Given a single file with many Newick trees (one per line), this will write them all to separate files in the output directory.", action="store_true");
parser.add_argument("--join", dest="tree_join", help="Given an input directory with many Newick tree files, this will combine them all into a single file.", action="store_true");
parser.add_argument("--label", dest="label_tree", help="Given an input file or tree string, this will add internal node labels to the tree(s).", action="store_true");
parser.add_argument("--rootcheck", dest="root_check", help="Given an input file or tree string, this will check if the tree(s) are rooted or not.", action="store_true");
parser.add_argument("--root", dest="root_tree", help="Given an input file or tree string, this will root the tree with the specified outgroup(s).", action="store_true");
parser.add_argument("--rootbest", dest="root_tree_best", help="This option does the same thing as 'root' but on a 'best-trees.txt' file from wrappers.py --raxml. This file includes a column on each line with the source alignment file name.", action="store_true");
parser.add_argument("--concordance", dest="fotc", help="Given an input ROOTED species tree and a file containing many single-copy ROOTED gene trees this module will calculate concordance factors for each node in the species tree. Use -genetrees for the input gene tree file and -i for the input species tree file or string.", action="store_true");
parser.add_argument("--tipcount", dest="count_tips", help="Given a file with many trees, simply count the number of unique tip labels in all trees.", action="store_true");
parser.add_argument("--cladecount", dest="count_clade", help="Given a file with many trees and a list of tips defined with -clade, count the number of trees in which those labels form a monophyletic clade.", action="store_true");
parser.add_argument("--relabeltips", dest="relabel", help="Given a file with many trees and a set of labels defined by -labels, this will relabel tip nodes. Use -m to decide placement of new label, and -delim to enter delimiting character.", action="store_true");
parser.add_argument("--rmlabels", dest="rmlabel", help="Given a file with many trees with the internal nodes labeled, this will write a file with the same trees but WITHOUT internal nodes labeled.", action="store_true");
parser.add_argument("--rmlabelsbest", dest="rmlabel_best", help="Given a best-trees.txt file from a --raxml run from wrappers, this will remove the bootstrap labels on the internal nodes.", action="store_true");
parser.add_argument("--scale", dest="scale", help="Scale the branch lengths of the input tree(s) by an operation and value given. For example, enter '/2' to divide all branch lengths by 2, or '*5' to multiply all branch lengths by 5.", default=False);
parser.add_argument("--rf", dest="rf", help="Given an input UNROOTED species tree and a file containing many single-copy UNROOTED gene trees this module will call RAxML to calculate Robinson-Foulds distance for each gene tree to the species tree. Use -genetrees for the input gene tree file and -i for the input species tree file or string.", action="store_true");
parser.add_argument("-prefix", dest="file_prefix", help="For --sep, a string that will be used as the base file name for each output file.", default=False);
parser.add_argument("-outgroup", dest="outgroup", help="For --root, a comma separated list of tip labels common between trees to use as the outgroup for rooting", default=False);
parser.add_argument("-genetrees", dest="genetrees", help="For --concordance, this is the file containing the gene trees.", default=False);
parser.add_argument("--count", dest="count_tops", help="For --concordance. If set, the module will print out the number of times each topology was found.", action="store_true", default=False);
parser.add_argument("-labels", dest="labels", help="For --relabeltip, the old label and the newlabel in the format: \"old1,new1 old2,new2\". Old labels don't need to match exactly with existing labels to allow for matching substrings.", default=False);
parser.add_argument("-m", dest="run_mode", help="Run mode for --rmlabels. 1 (default): Remove only internal node labels; 2: remove only branch lengths; 3: remove internal node labels and branch lengths. For --relabeltips, 1 (default): Replace old label with new label; 2: Add new label to beginning of old label; 3: Add new label to end of old label.", type=int, default=1);
parser.add_argument("-delim", dest="delim", help="For --relabeltips, with run modes 2 and 3 this is the character that will be placed between the old and new label. Underscore (_) is default. Enter 'space' for space character.", default="_");
parser.add_argument("-raxpath", dest="raxpath", help="For --rf, the full path to your RAxML executable. By default, will simply try 'raxml'.", default="raxml");
parser.add_argument("-clade", dest="clade", help="For --cladecount, a comma separated list of tip labels.", default=False);
args = parser.parse_args();
# Input option definitions.
if not args.input or not os.path.exists(args.input):
if args.label_tree or args.root_check or args.root_tree or args.fotc:
file_flag = False;
tree_flag = True;
elif not os.path.exists(args.input):
sys.exit(core.errorOut(1, "-i must be specified and must be a valid file or directory name."));
else:
if os.path.isfile(args.input):
file_flag = True;
tree_flag = False;
filelist = [os.path.abspath(args.input)];
else:
file_flag = False;
tree_flag = False;
filelist_init = os.listdir(args.input);
filelist = [os.path.abspath(os.path.join(args.input, f)) for f in filelist_init];
# This checks if the input (-i) entered is valid. If so, it parses it as either a directory or a single file.
if not args.output and not tree_flag:
print("\n** Warning -- No output location specified. Will be determined automatically.");
pad = 40;
if args.tree_sep:
if not file_flag:
sys.exit(core.errorOut(2, "--sep only works on an input FILE!"));
output, outnum = core.defaultOutDir(args.input, file_flag, "sep", args.output);
print("* Making output directory...");
os.system("mkdir " + output);
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Separating trees in:", pad), args.input);
print(core.spacedOut("Writing output to:", pad), output);
print("-------------------------");
tree.treeSep(filelist[0], args.file_prefix, output);
sys.exit();
# --sep : takes an input FILE with many trees and puts each tree in its own file.
if args.tree_join:
if file_flag:
sys.exit(core.errorOut(3, "--join only works on an input DIRECTORY containing many tree files!"));
output, outnum = core.defaultOutFile(args.input, file_flag, "join", args.output);
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Joining trees in:", pad), args.input);
print(core.spacedOut("Writing output to:", pad), output);
print("-------------------------");
tree.treeJoin(filelist, output);
sys.exit();
# --join : takes an input DIRECTORY with many files and combines all trees from all files into a single file.
if args.label_tree:
if file_flag == False and tree_flag == False:
sys.exit(core.errorOut(4, "--label only works on an input FILE containing many trees or a TREE STRING."));
if tree_flag:
try:
td, t, root = tp.treeParse(args.input);
filelist = [td,t,root];
# Weird hack.
output = "";
except:
sys.exit(core.errorOut(5, "Couldn't read the input string as a Newick tree!"));
# If the input is not a file, check to see if it's a Newick string.
else:
output, outnum = core.defaultOutFile(args.input, file_flag, "label", args.output);
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Labeling all trees in:", pad), args.input);
print(core.spacedOut("Writing labeled trees to:", pad), output);
tree.labelTree(filelist, tree_flag, output);
sys.exit();
# --label : takes an input Newick string or file and puts labels on the internal nodes.
if args.root_check:
print("\n** Warning -- The root check module works on the basis that an unrooted tree has a trifurcation at the 'root.' So if your tree has other non-bifurcating nodes this will not give reliable results!");
print("\n Only use this module with bifurcating trees!");
if file_flag == False and tree_flag == False:
sys.exit(core.errorOut(6, "--rootcheck only works on an input FILE containing many trees or a TREE STRING."));
if tree_flag:
try:
td, t, root = tp.treeParse(args.input);
filelist = [td,t,root];
# Weird hack.
output = "";
except:
sys.exit(core.errorOut(7, "Couldn't read the input string as a Newick tree!"));
# If the input is not a file, check to see if it's a Newick string.
else:
output, outnum = core.defaultOutFile(args.input, file_flag, "rootcheck", args.output);
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Checking all trees in:", pad), args.input);
print(core.spacedOut("Writing output to:", pad), output);
tree.rootCheck(filelist, tree_flag, output);
sys.exit();
# --rootcheck : takes an input Newick string or file and checks if the trees are rooted or not.
if args.root_tree:
print("\n** Warning -- The root option relies on the software Newick Utilities. If you don't have this installed and in your PATH variable, you will see an error!");
if file_flag == False and tree_flag == False:
sys.exit(core.errorOut(8, "--root only works on an input FILE containing many trees or a TREE STRING."));
if not args.outgroup:
sys.exit(core.errorOut(9, "With --root, a set of tip labels must be specified as the desired outgroup (-outgroup)."));
if tree_flag:
try:
td, t, root = tp.treeParse(args.input);
filelist = [td,t,root,args.input];
# Weird hack.
output = "";
except:
sys.exit(core.errorOut(10, "Couldn't read the input string as a Newick tree!"));
# If the input is not a file, check to see if it's a Newick string.
else:
output, outnum = core.defaultOutFile(args.input, file_flag, "reroot", args.output);
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Re-rooting all trees in:", pad), args.input);
print(core.spacedOut("Writing rooted trees to:", pad), output);
tree.rootTrees(filelist, tree_flag, args.outgroup, output);
sys.exit();
# --root : takes an input Newick string or file and roots or re-roots the trees using Newick Utilities and the specified outgroups.
if args.root_tree_best:
print("\n** Warning -- The rootbest option relies on the software Newick Utilities. If you don't have this installed and in your PATH variable, you will see an error!");
if file_flag == False:
sys.exit(core.errorOut(8.1, "--rootbest only works on the best-trees.txt FILE from wrappers.py --raxml containing many rows, each with the source alignment filename and the tree separated by a tab."));
if not args.outgroup:
sys.exit(core.errorOut(9.1, "With --root, a set of tip labels must be specified as the desired outgroup (-outgroup)."));
output, outnum = core.defaultOutFile(args.input, file_flag, "reroot", args.output);
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Re-rooting all trees in:", pad), args.input);
print(core.spacedOut("Writing rooted trees to:", pad), output);
tree.rootTreesBest(filelist, tree_flag, args.outgroup, output);
sys.exit();
# --root : takes an input Newick string or file and roots or re-roots the trees using Newick Utilities and the specified outgroups.
if args.fotc:
if file_flag == False and tree_flag == False:
sys.exit(core.errorOut(11, "--concordance only works on an input FILE containing a rooted tree or a rooted TREE STRING."));
if not os.path.isfile(args.genetrees):
sys.exit(core.errorOut(12, "-genetrees must be a valid file name!"));
else:
args.genetrees = os.path.abspath(args.genetrees);
# Check if the input files are valid.
if tree_flag:
try:
td, t, root = tp.treeParse(args.input);
filelist = [td,t,root,args.input];
# Weird hack.
except:
sys.exit(core.errorOut(13, "Couldn't read the input string as a Newick tree!"));
# If the input is not a file, check to see if it's a Newick string.
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print("Calculating concordance factors for your species tree.");
print(core.spacedOut("Using gene trees in:", pad), args.genetrees);
print("Simply printing output to the screen");
tree.flightOfTheConcordance(filelist, tree_flag, args.genetrees, args.count_tops);
sys.exit();
# --concordance : takes an input species tree (Newick string or file) and single-copy gene trees (file)
# and calculates concordance factors for each internal node of the species tree.
if args.count_tips:
if not file_flag:
sys.exit(core.errorOut(14, "--tipcount takes an input (-i) FILE only."));
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Counting tips in:", pad), args.input);
tree.countTips(filelist[0]);
sys.exit();
# --tipcount : counts all unique tips in an input file with many trees.
if args.count_clade:
if not file_flag:
sys.exit(core.errorOut(15, "--cladecount takes an input (-i) FILE only."));
try:
clade = set(args.clade.split(","));
except:
sys.exit(core.errorOut(16, "--clade must be a comma separate list of tips."));
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Counting clades in:", pad), args.input);
print(core.spacedOut("Clade tips:", pad), args.clade);
tree.countClade(filelist[0], clade);
sys.exit();
# --tipcount : counts all unique tips in an input file with many trees.
if args.relabel:
if not file_flag:
sys.exit(core.errorOut(17, "--relabeltips takes an input (-i) FILE only."));
if not args.labels:
sys.exit(core.errorOut(18, "-labels must be entered with --relabeltips"));
if args.run_mode not in [1,2,3]:
sys.exit(core.errorOut(16, "-m must take values of 1, 2, or 3."));
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Relabeling tips in:", pad), args.input);
output, outnum = core.defaultOutFile(args.input, file_flag, "relabel", args.output);
if args.run_mode == 1:
print("Replacing old labels");
if args.run_mode == 2:
print("Adding new labels to beginning of old labels.");
if args.run_mode == 3:
print("Adding new labels to end of old labels.");
if args.run_mode in [2,3]:
print(core.spacedOut("Using delimiter:", pad), args.delim);
if args.delim == 'space':
args.delim = ' ';
print(core.spacedOut("Writing output to:", pad), output);
tree.relabelTips(filelist[0], args.labels, args.run_mode, args.delim, output);
sys.exit();
# --relabeltips : in a file containing many trees, relabels all tips containing an old label with a new label specified by user
if args.rmlabel or args.rmlabel_best:
if not file_flag:
sys.exit(core.errorOut(19, "--rmlabels takes an input (-i) FILE only."));
if args.run_mode not in [1,2,3]:
sys.exit(core.errorOut(20, "-m must take values of 1, 2, or 3."));
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Removing internal node labels from:", pad), args.input);
output, outnum = core.defaultOutFile(args.input, file_flag, "rmlabel", args.output);
print(core.spacedOut("Writing output to:", pad), output);
if args.rmlabel:
tree.rmLabel(filelist[0], args.run_mode, output);
elif args.rmlabel_best:
tree.rmLabel(filelist[0], args.run_mode, output, True);
sys.exit();
# --rmlabel : in a file containing one or more tree, this removes any internal node labels in the tree(s)
if args.scale != False:
scale_op, scale_factor = args.scale[0], args.scale[1:];
if scale_op not in ["/","*","+","-"]:
sys.exit(core.errorOut(21, "The first charcater of --scale must be /, *, +, or -."));
try:
scale_factor = float(scale_factor);
except:
sys.exit(core.errorOut(22, "The scale factor must be an integer or a float."));
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print(core.spacedOut("Scaling branch lengths on trees in:", pad), args.input);
output, outnum = core.defaultOutFile(args.input, file_flag, "scale", args.output);
print(core.spacedOut("Writing output to:", pad), output);
tree.scaleBL(filelist[0], scale_op, scale_factor, output);
sys.exit();
if args.rf:
#if not os.path.exists(args.raxpath):
# sys.exit(core.errorOut(22, "Path to RAxML (" + args.raxpath + ") not found!"));
if args.raxpath:
print("User specified path to RAxML:", args.raxpath);
else:
print("No path to RAxML specified. Trying 'raxml'");
# Checking the input raxpath.
if file_flag == False and tree_flag == False:
sys.exit(core.errorOut(23, "--rf only works on an input FILE containing an unrooted tree or an unrooted TREE STRING."));
if not os.path.isfile(args.genetrees):
sys.exit(core.errorOut(24, "-genetrees must be a valid file name!"));
else:
args.genetrees = os.path.abspath(args.genetrees);
# Check if the input files are valid.
if tree_flag:
try:
td, t, root = tp.treeParse(args.input);
filelist = [td,t,root,args.input];
# Weird hack.
except:
sys.exit(core.errorOut(25, "Couldn't read the input string as a Newick tree!"));
# If the input is not a file, check to see if it's a Newick string.
print("=======================================================================");
print("\t\t\t" + core.getDateTime());
print("Calculating Robinson-Foulds distance for each gene tree to the species tree.");
print(core.spacedOut("Using gene trees in:", pad), args.genetrees);
output, outnum = core.defaultOutFile(args.genetrees, file_flag, "RF", args.output);
print(core.spacedOut("Writing output to:", pad), output);
tree.robF(filelist, tree_flag, args.genetrees, args.raxpath, output);
sys.exit();