|
1 | 1 | import sys
|
| 2 | +import hashlib |
| 3 | +import yaml |
| 4 | + |
# Path of the YAML configuration file; collect_hashes() hands it to get_hash()
# as the file the hashed parameters are read from.
# The commented-out line below read the path from the value following
# "--configfile" on the command line instead of using a fixed location.
#configfi=str(sys.argv[sys.argv.index("--configfile")+1])
configfi="data/config.yaml"
| 7 | + |
| 8 | +###all kinds of functions related to the hash functionality |
| 9 | + |
def get_hash(previous, string_of_dict_paths, yamlfile):
    """Hash a selection of settings from a YAML file, chained to a previous hash.

    Args:
        previous: String prepended to the selected settings before hashing
            (collect_hashes passes the hash of the preceding stage, or "").
        string_of_dict_paths: Space separated list of key paths; the keys of
            one path are separated by commas (e.g. "a,b a,c,d"). Only paths
            made of two or three keys are picked up, anything else is skipped.
        yamlfile: Path of the YAML file the values are read from.

    Returns:
        The 5-byte SHAKE-256 digest of the combined text as a hex string.

    Raises:
        KeyError: If a requested path does not exist in the YAML file.
    """
    import collections

    print("### GET HASH: " + string_of_dict_paths, yamlfile + " ###")
    with open(yamlfile) as handle:
        settings = yaml.safe_load(handle)

    selected = {}
    for dict_path in string_of_dict_paths.split(" "):
        print(dict_path)
        keys = dict_path.split(",")
        depth = len(keys)
        if depth == 2:
            print("level 2")
            top, leaf = keys
            branch = selected.setdefault(top, {})
            # The first occurrence of a path wins; repeated paths are ignored.
            if leaf not in branch:
                branch[leaf] = str(settings[top][leaf])
        elif depth == 3:
            print("level 3")
            top, middle, leaf = keys
            branch = selected.setdefault(top, {}).setdefault(middle, {})
            if leaf not in branch:
                branch[leaf] = str(settings[top][middle][leaf])

    print("FINAL: " + str(selected))
    # NOTE(review): the text that gets hashed is the repr of this OrderedDict,
    # so the digest depends on that repr's format and on the insertion order
    # of the paths -- confirm it is stable for the Python versions in use.
    ordered = collections.OrderedDict(selected)
    print(str(ordered))
    combined = str(previous + str(ordered))
    print(combined)
    digest = hashlib.shake_256(combined.encode()).hexdigest(5)
    print(digest)
    print("### DONE HASH ###")
    return digest
| 44 | + |
| 45 | + |
| 46 | + |
def _require_parameter_file(parameter_file, stage):
    """Exit with an error status unless the parameter file of *stage* exists."""
    # os is not imported in the visible part of this file, so import it here.
    import os

    if not os.path.isfile(parameter_file):
        print("Please doublecheck if the stage '"+stage+"' was run with the parameters currently specified in "+configfi)
        # A missing parameter file is an error: exit non-zero, like the other
        # helpers in this file do.
        sys.exit(1)


def collect_hashes(mode):
    """Compute the chained parameter hashes of all stages up to *mode*.

    Every stage hash is derived with get_hash() from the hash of the previous
    stage plus the stage's own settings in the config file, so a change in an
    early stage changes the hashes of all later stages.

    Args:
        mode: Name of the last stage to compute: "orthology",
            "filter-orthology", "align", "filter-align", "modeltest",
            "speciestree" or "mltree".

    Returns:
        A dict keyed by stage name. 'orthology' and 'filter-orthology' map to
        a hash string, 'align' maps aligner -> hash, 'filter-align' maps
        trimmer -> aligner -> hash, 'modeltest' maps
        modeltest method -> trimmer -> aligner -> hash and 'tree_inference'
        maps tree method -> modeltest method -> trimmer -> aligner -> hash.
        For any other value of *mode* nothing (None) is returned.

    The process exits with status 1 if the parameter file written by a
    previous stage is not found under results/.
    """
    hashes = {}

    #orthology
    hashes['orthology'] = get_hash("", "orthology,method orthology,busco_options,set orthology,busco_options,version orthology,busco_options,mode orthology,busco_options,augustus_species orthology,busco_options,additional_parameters", configfi)
    if mode == "orthology":
        print("Gathered hashes until 'orthology'")
        print(hashes['orthology'])
        return hashes

    #filter-orthology
    hashes['filter-orthology'] = get_hash(hashes['orthology'], "filtering,dupseq filtering,cutoff filtering,minsp filtering,seq_type filtering,exclude_orthology", configfi)
    if mode == "filter-orthology":
        print("Gathered hashes until 'filter-orthology'")
        print(hashes['filter-orthology'])
        return hashes

    #align
    hashes['align'] = {}
    for a in config["alignment"]["method"]:
        hashes['align'][a] = get_hash(hashes['filter-orthology'], "alignment,options,"+a, configfi)
    if mode == "align":
        print("Gathered hashes until 'align'")
        print(hashes['align'])
        return hashes

    #filter-alignment
    hashes['filter-align'] = {}
    for t in config["trimming"]["method"]:
        hashes['filter-align'][t] = {}
        for a in hashes['align']:
            _require_parameter_file("results/alignments/full/"+a+"."+hashes['align'][a]+"/parameters.yaml", "align")
            hashes['filter-align'][t][a] = get_hash(hashes['align'][a], "trimming,options,"+t+" trimming,min_parsimony_sites", configfi)
    if mode == "filter-align":
        print("Gathered hashes until 'filter-align'")
        print(hashes['filter-align'])
        return hashes

    #modeltest
    hashes['modeltest'] = {}
    for m in config["modeltest"]["method"]:
        hashes['modeltest'][m] = {}
        for t in config["trimming"]["method"]:
            hashes['modeltest'][m][t] = {}
            for a in hashes['align']:
                _require_parameter_file("results/alignments/parameters."+a+"-"+t+"."+hashes['filter-align'][t][a]+".yaml", "filter-align")
                hashes['modeltest'][m][t][a] = get_hash(hashes['filter-align'][t][a], "modeltest,options,"+m+" modeltest,bootstrap", configfi)
    if mode == "modeltest":
        print("Gathered hashes until 'modeltest'")
        print(hashes['modeltest'])
        return hashes

    #speciestree / mltree: both stages are stored under 'tree_inference' and
    #only differ in the config section the tree methods are listed in.
    hashes['tree_inference'] = {}
    if mode in ("speciestree", "mltree"):
        for i in config[mode]["method"]:
            hashes['tree_inference'][i] = {}
            for m in hashes['modeltest']:
                hashes['tree_inference'][i][m] = {}
                for t in hashes['filter-align']:
                    hashes['tree_inference'][i][m][t] = {}
                    for a in hashes['align']:
                        _require_parameter_file("results/modeltest/parameters."+a+"-"+t+"."+hashes['modeltest'][m][t][a]+".yaml", "modeltest")
                        # NOTE(review): for mode "mltree" the settings hashed
                        # here are still read from the "speciestree" section
                        # (unchanged from the original) -- confirm whether
                        # "mltree,..." paths were intended.
                        hashes['tree_inference'][i][m][t][a] = get_hash(hashes['modeltest'][m][t][a], "speciestree,options,"+i+" speciestree,include", configfi)

        print("Gathered hashes until '"+mode+"'")
        # The results live under 'tree_inference' for both modes; the original
        # mltree branch printed hashes['mltree'], which is never set and
        # raised a KeyError.
        print(hashes['tree_inference'])
        return hashes

    # Unrecognised mode: as before, nothing is returned.
    return None
| 148 | + |
| 149 | + |
def trigger(current_yaml, config_yaml):
    """Decide whether a submodule has to be (re)started.

    The submodule is triggered when its parameter file is missing, does not
    contain a mapping, or holds values that differ from the current config.

    Args:
        current_yaml: Path of the parameter file written by an earlier run.
        config_yaml: Value to return when the submodule has to be triggered.

    Returns:
        The string "None" (not the None object) when the parameter file
        matches the config according to find_top(); otherwise *config_yaml*.
    """
    # os is not imported in the visible part of this file, so import it here.
    import os

    if not os.path.isfile(current_yaml):
        print("File "+current_yaml+" is not there")
        return config_yaml

    with open(current_yaml) as f:
        my_dict = yaml.safe_load(f)
    print(str(my_dict))

    # An empty or malformed parameter file cannot confirm anything, so treat
    # it like a missing file instead of failing inside find_top().
    if not isinstance(my_dict, dict):
        print("File "+current_yaml+" does not contain a parameter mapping")
        return config_yaml

    if find_top(my_dict, config):
        print("ALL GOOD")
        return "None"
    return config_yaml
| 165 | + |
| 166 | + |
def get_all_keys(d):
    """Yield every key of the nested dictionary *d*.

    Keys are produced depth first: a key is yielded before the keys of the
    dictionary stored under it. Only values that are dicts are descended
    into.
    """
    for key, value in d.items():
        yield key
        if isinstance(value, dict):
            yield from get_all_keys(value)
| 174 | + |
def search(d, k, path=None):
    """Return the list of keys leading to the first occurrence of key *k*.

    The nested dictionary *d* is searched depth first. A key found directly
    in the current dictionary takes precedence over one nested further down.

    Args:
        d: The (possibly nested) dictionary to search.
        k: The key to look for.
        path: Keys visited so far; used by the recursion. A list passed in
            by the caller is extended in place.

    Returns:
        The path (including *k* itself) as a list, or False if *k* is not
        present or *d* is not a dictionary.
    """
    if path is None:
        path = []

    # A non-dict value is a dead end.
    if not isinstance(d, dict):
        return False

    # Direct hit on this level.
    if k in d:
        path.append(k)
        return path

    # Otherwise try each child in turn, tracking the key that was followed.
    for child_key, child in d.items():
        path.append(child_key)
        if search(child, k, path) is not False:
            return path
        # Nothing below this child: undo the step and try the next one.
        path.pop()
    return False
| 205 | + |
def _iter_leaves(node, prefix=()):
    """Yield (path, value) for every non-dict value of the nested dict *node*."""
    for key, value in node.items():
        path = prefix + (key,)
        if isinstance(value, dict):
            yield from _iter_leaves(value, path)
        else:
            yield path, value


def find_top(t, against):
    """Compare the terminal values of *t* with the same paths in *against*.

    Every leaf of the nested dictionary *t* is located by its actual path, so
    key names that occur in several branches are each checked at their own
    position. Values are compared by their string representation.

    Args:
        t: Nested dictionary whose leaves are checked (e.g. a parameter file).
        against: Nested dictionary to compare with (e.g. the config).

    Returns:
        True if no difference was found, False at the first differing value.

    Raises:
        KeyError: If a path of *t* (two or more keys deep) is missing in
            *against*.
    """
    for path, value in _iter_leaves(t):
        depth = len(path)
        # str() keeps the report working for non-string keys and values.
        from_file = str(value)
        print(str(depth)+" - key: "+str(path[-1])+" - "+str(list(path))+" - "+from_file)

        # NOTE(review): leaves directly on the top level have only ever been
        # reported, never compared; kept that way -- confirm this is intended.
        if depth == 1:
            continue

        expected = against
        for step in path:
            expected = expected[step]
        from_config = str(expected)
        print("config: "+from_config)

        if from_file == from_config:
            print("EQUAL")
        else:
            print("NOT EQUAL")
            return False
    return True
| 238 | + |
| 239 | + |
| 240 | +#### |
def get_input_genes(wildcards):
    """Return the genes whose bootstrap value reaches the requested cutoff.

    Reads the tab separated gene tree filter file of the aligner/trimmer
    combination given by *wildcards*; the first column of a line is taken as
    the gene and the last column as its bootstrap value.

    Args:
        wildcards: Object providing the attributes ``bootstrap`` (cutoff),
            ``aligner`` and ``alitrim``.

    Returns:
        List of the first-column entries of all lines whose last column,
        read as an integer, is greater than or equal to the cutoff.
    """
    bs_cutoff = int(wildcards.bootstrap)
    filter_file = (
        "results/modeltest/genetree_filter_"
        + wildcards.aligner
        + "_"
        + wildcards.alitrim
        + "."
        + modeltest_hashes["iqtree"][wildcards.alitrim][wildcards.aligner]
        + ".txt"
    )
    with open(filter_file) as handle:
        return [
            line.split("\t")[0]
            for line in handle
            if int(line.strip().split("\t")[-1]) >= bs_cutoff
        ]
2 | 251 |
|
3 | 252 | def get_aligners():
|
4 | 253 | aligners = config["alignment"]["method"]
|
@@ -39,7 +288,7 @@ def get_trimmers():
|
39 | 288 | sys.exit(1)
|
40 | 289 |
|
41 | 290 | def get_treemethods():
|
42 |
| - trees = config["tree"]["method"] |
| 291 | + trees = config["mltree"]["method"] |
43 | 292 | if isinstance(trees, str):
|
44 | 293 | if ", " in trees:
|
45 | 294 | return trees.split(", ")
|
|
0 commit comments