@@ -16,7 +16,8 @@ filter_orthology_hash = hashes['filter-orthology']["global"]
16
16
aligner_hashes = hashes ['align' ]["per" ]
17
17
previous_hash = hashes ['align' ]["global" ]
18
18
current_hash = hashes ['filter-align' ]["global" ]
19
- trimmer_hashes = hashes ['filter-align' ]["per" ]
19
+ trimmed_hashes = hashes ['filter-align' ]["per-trimming" ]
20
+ filtered_hashes = hashes ['filter-align' ]["per" ]
20
21
21
22
BUSCOS , = glob_wildcards ("results/orthology/busco/busco_sequences_deduplicated." + filter_orthology_hash + "/{busco}_all.fas" )
22
23
@@ -27,7 +28,9 @@ def previous_params_per(wildcards):
27
28
return "results/alignments/full/" + wildcards .aligner + "." + aligner_hashes [wildcards .aligner ]+ "/parameters.align." + wildcards .aligner + "." + aligner_hashes [wildcards .aligner ]+ ".yaml"
28
29
29
30
def compare_filter_align (wildcards ):
30
- return [trigger ("results/alignments/trimmed/{aligner}-{alitrim}.{hash}/parameters.filter-align.{aligner}-{alitrim}.{hash}.yaml" .format (aligner = wildcards .aligner , alitrim = wildcards .alitrim , hash = wildcards .hash ), configfi )]
31
+ return [trigger ("results/alignments/filtered/{aligner}-{alitrim}.{hash}/parameters.filter-align.{aligner}-{alitrim}.{hash}.yaml" .format (aligner = wildcards .aligner , alitrim = wildcards .alitrim , hash = wildcards .hash ), configfi )]
32
+ def compare_trimmed_align (wildcards ):
33
+ return [trigger ("results/alignments/trimmed/{aligner}-{alitrim}.{hash}/parameters.filter-align.{aligner}-{alitrim}.{hash}.yaml" .format (aligner = wildcards .aligner , alitrim = wildcards .alitrim , hash = wildcards .trimhash ), configfi )]
31
34
32
35
def per_aligner (wildcards ):
33
36
# print(wildcards)
@@ -37,7 +40,7 @@ def per_trimmer(wildcards):
37
40
lis = []
38
41
# print(wildcards)
39
42
for b in BUSCOS :
40
- lis .append ("results/alignments/trimmed/" + wildcards .aligner + "-" + wildcards .alitrim + "." + trimmer_hashes [wildcards .alitrim ][wildcards .aligner ]+ "/" + str (b )+ "_aligned_trimmed.fas" )
43
+ lis .append ("results/alignments/trimmed/" + wildcards .aligner + "-" + wildcards .alitrim + "." + trimmed_hashes [wildcards .alitrim ][wildcards .aligner ]+ "/" + str (b )+ "_aligned_trimmed.fas" )
41
44
return lis
42
45
43
46
def determine_concurrency_limit ():
@@ -58,21 +61,36 @@ batches = determine_concurrency_limit()
58
61
aligners = get_aligners ()
59
62
trimmers = get_trimmers ()
60
63
64
+ rule read_params_per_trimmer :
65
+ input :
66
+ trigger = compare_trimmed_align ,
67
+ previous = previous_params_per
68
+ output :
69
+ "results/alignments/trimmed/{aligner}-{alitrim}.{trimmhash}/parameters.filter-align.{aligner}-{alitrim}.{trimhash}.yaml"
70
+ params :
71
+ configfile = configfi
72
+ shell :
73
+ """
74
+ bin/read_write_yaml.py {input.trigger} {output} trimming,options,{wildcards.alitrim}
75
+ cat {input.previous} >> {output}
76
+ """
77
+
78
+
61
79
rule read_params_per :
62
80
input :
63
81
trigger = compare_filter_align ,
64
82
previous = previous_params_per
65
83
output :
66
- "results/alignments/trimmed /{aligner}-{alitrim}.{hash}/parameters.filter-align.{aligner}-{alitrim}.{hash}.yaml"
84
+ "results/alignments/filtered /{aligner}-{alitrim}.{hash}/parameters.filter-align.{aligner}-{alitrim}.{hash}.yaml"
67
85
shell :
68
86
"""
69
- bin/read_write_yaml.py {input.trigger} {output} trimming,options,{wildcards.alitrim} trimming,min_parsimony_sites
87
+ bin/read_write_yaml.py {input.trigger} {output} trimming,options,{wildcards.alitrim} trimming,min_parsimony_sites trimming,max_rcv_score
70
88
cat {input.previous} >> {output}
71
89
"""
72
90
73
91
rule read_params_global :
74
92
input :
75
- trigger = compare ("results/alignments/trimmed /parameters.filter-align." + current_hash + ".yaml" , configfi ),
93
+ trigger = compare ("results/alignments/filtered /parameters.filter-align." + current_hash + ".yaml" , configfi ),
76
94
previous = previous_params_global
77
95
output :
78
96
"results/alignments/trimmed/parameters.filter-align." + current_hash + ".yaml"
@@ -85,11 +103,11 @@ def return_aligner_checkpoint(wildcards):
85
103
return "results/checkpoints/" + wildcards .aligner + "_aggregate_align." + aligner_hashes [wildcards .aligner ]+ ".done"
86
104
87
105
def return_bmge_params (wildcards ):
88
- return "results/alignments/trimmed/" + wildcards .aligner + "-bmge." + trimmer_hashes ["bmge" ][wildcards .aligner ]+ "/parameters.filter-align." + wildcards .aligner + "-bmge." + trimmer_hashes ["bmge" ][wildcards .aligner ]+ ".yaml"
106
+ return "results/alignments/trimmed/" + wildcards .aligner + "-bmge." + trimmed_hashes ["bmge" ][wildcards .aligner ]+ "/parameters.filter-align." + wildcards .aligner + "-bmge." + trimmed_hashes ["bmge" ][wildcards .aligner ]+ ".yaml"
89
107
def return_trimal_params (wildcards ):
90
- return "results/alignments/trimmed/" + wildcards .aligner + "-trimal." + trimmer_hashes ["trimal" ][wildcards .aligner ]+ "/parameters.filter-align." + wildcards .aligner + "-trimal." + trimmer_hashes ["trimal" ][wildcards .aligner ]+ ".yaml"
108
+ return "results/alignments/trimmed/" + wildcards .aligner + "-trimal." + trimmed_hashes ["trimal" ][wildcards .aligner ]+ "/parameters.filter-align." + wildcards .aligner + "-trimal." + trimmed_hashes ["trimal" ][wildcards .aligner ]+ ".yaml"
91
109
def return_aliscore_params (wildcards ):
92
- return "results/alignments/trimmed/" + wildcards .aligner + "-aliscore." + trimmer_hashes ["aliscore" ][wildcards .aligner ]+ "/parameters.filter-align." + wildcards .aligner + "-aliscore." + trimmer_hashes ["aliscore" ][wildcards .aligner ]+ ".yaml"
110
+ return "results/alignments/trimmed/" + wildcards .aligner + "-aliscore." + trimmed_hashes ["aliscore" ][wildcards .aligner ]+ "/parameters.filter-align." + wildcards .aligner + "-aliscore." + trimmed_hashes ["aliscore" ][wildcards .aligner ]+ ".yaml"
93
111
94
112
rule bmge :
95
113
input :
@@ -220,19 +238,21 @@ rule get_trimmed_statistics:
220
238
def pull_trimmer (wildcards ):
221
239
lis = []
222
240
for i in range (1 , config ["concurrency" ] + 1 ):
223
- lis .append ("results/statistics/trim-" + wildcards .aligner + "-" + wildcards .alitrim + "." + trimmer_hashes [wildcards .alitrim ][wildcards .aligner ]+ "/stats_trimmed_" + wildcards .alitrim + "_" + wildcards .aligner + "-" + str (i )+ "-" + str (config ["concurrency" ])+ ".txt" )
241
+ lis .append ("results/statistics/trim-" + wildcards .aligner + "-" + wildcards .alitrim + "." + trimmed_hashes [wildcards .alitrim ][wildcards .aligner ]+ "/stats_trimmed_" + wildcards .alitrim + "_" + wildcards .aligner + "-" + str (i )+ "-" + str (config ["concurrency" ])+ ".txt" )
224
242
return lis
225
243
226
244
def get_aligner_hash (wildcards ):
227
245
return aligner_hashes [wildcards .aligner ]
228
246
247
+ def get_trimmer_hash (wildcards ):
248
+ return trimmed_hashes [wildcards .alitrim ][wildcards .aligner ]
249
+
229
250
rule filter_alignments :
230
251
input :
231
- # expand("results/statistics/trim-{{aligner}}-{{alitrim}}/stats_trimmed_{{alitrim}}_{{aligner}}-{batch}-"+str(config["concurrency"])+".txt", aligner=config["alignment"]["method"], alitrim=config["trimming"]["method"], batch=batches)
232
- pull_trimmer
252
+ pull_trimmer ,
253
+ "results/alignments/filtered/{aligner}-{alitrim}.{hash}/parameters.filter-align.{aligner}-{alitrim}.{hash}.yaml"
233
254
output :
234
255
filter_info = "results/statistics/filter-{aligner}-{alitrim}.{hash}/alignment_filter_information_{alitrim}_{aligner}.txt" ,
235
- trim_info = "results/statistics/trim-{aligner}-{alitrim}.{hash}/statistics_trimmed_{alitrim}_{aligner}.txt"
236
256
benchmark :
237
257
"results/statistics/benchmarks/align/filter_alignments_{alitrim}_{aligner}.{hash}.txt"
238
258
singularity :
@@ -244,47 +264,43 @@ rule filter_alignments:
244
264
min_pars_sites = config ["trimming" ]["min_parsimony_sites" ],
245
265
max_rcv_score = config ["trimming" ]["max_rcv_score" ],
246
266
aligner_hash = get_aligner_hash ,
267
+ trimmer_hash = get_trimmer_hash ,
247
268
target_dir = "results/alignments/filtered/{aligner}-{alitrim}.{hash}"
248
269
shell :
249
270
"""
250
- if [[ -d {params.target_dir} ]]; then
251
- rm -rf {params.target_dir}
252
- fi
253
- mkdir -p {params.target_dir}
254
271
255
272
# concatenate the statistics files from the individual batches (for some reason snakemake complained if I did it all in one step, so this looks a bit ugly now, but it runs)
256
- cat results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash }/stats_trimmed_{wildcards.alitrim}_{wildcards.aligner}-* > results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt
257
- head -n 1 results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt > results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash }/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt
273
+ cat results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash }/stats_trimmed_{wildcards.alitrim}_{wildcards.aligner}-* > results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt
274
+ head -n 1 results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt > results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash }/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt
258
275
259
276
260
277
echo "\\ n ######## BEFORE #######"
261
- grep -v "^alignment" results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt >> {output.trim_info}
278
+ grep -v "^alignment" results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt >> results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash}/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt
262
279
echo "\\ n ######## AFTER #######"
263
- rm results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt
264
- for file in results/alignments/trimmed/{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash }/*.fas;
280
+ rm results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash }/statistics_trimmed_temp-{wildcards.alitrim}_{wildcards.aligner}.txt
281
+ for file in results/alignments/trimmed/{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash }/*.fas;
265
282
do
266
283
if [[ -s {params.wd}/$file ]]; then
267
284
if [[ "$(cat {params.wd}/$file | grep ">" -c)" -lt {params.minsp} ]]; then
268
285
echo -e "$file\t TOO_FEW_SEQUENCES" 2>&1 | tee -a {output.filter_info}
269
286
echo "$(date) - File $file contains less than {params.minsp} sequences after trimming with {params.trimming_method}. This file will not be used for tree reconstruction." >> {params.wd}/results/statistics/runlog.txt
270
287
continue
271
288
fi
272
- if [[ $(grep "$(basename $file)" {output.trim_info} | cut -f 6) -ge {params.min_pars_sites} && $(grep "$(basename $file)" {output.trim_info} | cut -f 9 | awk '{{ if ($1 <= {params.max_rcv_score}) {{print "OK"}} }}') == "OK" ]]; then
289
+ if [[ $(grep "$(basename $file)" results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash}/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt | cut -f 6) -ge {params.min_pars_sites} && $(grep "$(basename $file)" results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash}/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt | cut -f 9 | awk '{{ if ($1 <= {params.max_rcv_score}) {{print "OK"}} }}') == "OK" ]]; then
273
290
echo -e "$file\t PASS" 2>&1 | tee -a {output.filter_info}
274
- ln -s {params.wd}/$file {params.wd}/{params.target_dir}/
291
+ ln -s -f {params.wd}/$file {params.wd}/{params.target_dir}/
275
292
continue
276
- elif [[ $(grep "$(basename $file)" {output.trim_info} | cut -f 6) -lt {params.min_pars_sites} ]]; then # case if alignment has too few pars inf sites
293
+ elif [[ $(grep "$(basename $file)" results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash}/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt | cut -f 6) -lt {params.min_pars_sites} ]]; then # case if alignment has too few pars inf sites
277
294
echo -e "$file\t NOT_INFORMATIVE" 2>&1 | tee -a {output.filter_info}
278
295
continue
279
- elif [[ $(grep "$(basename $file)" {output.trim_info} | cut -f 9 | awk '{{ if ($1 <= {params.max_rcv_score}) {{print "OK"}} }}') != "OK" ]]; then # case if rcv is too high
296
+ elif [[ $(grep "$(basename $file)" results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{params.trimmer_hash}/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt | cut -f 9 | awk '{{ if ($1 <= {params.max_rcv_score}) {{print "OK"}} }}') != "OK" ]]; then # case if rcv is too high
280
297
echo -e "$file\t RCV_TOO_HIGH" 2>&1 | tee -a {output.filter_info}
281
298
continue
282
299
else # should not happen!
283
300
echo -e "$file\t INVALID" 2>&1 | tee -a {output.filter_info}
284
301
continue
285
302
fi
286
303
287
- # python bin/filter_alignments.py --alignments {params.wd}/$file --outdir "{params.wd}/{params.target_dir}" --statistics-file results/statistics/trim-{wildcards.aligner}-{wildcards.alitrim}.{wildcards.hash}/statistics_trimmed_{wildcards.alitrim}_{wildcards.aligner}.txt --min-parsimony {params.min_pars_sites} --minsp {params.minsp} >> {output.filter_info}
288
304
else #do nothing if file is empty (happens rarely when ALICUT fails)
289
305
continue
290
306
fi
@@ -320,18 +336,18 @@ rule get_filter_statistics:
320
336
321
337
def pull_filtered_stats (wildcards ):
322
338
lis = []
323
- for t in trimmer_hashes .keys ():
324
- for a in trimmer_hashes [t ].keys ():
339
+ for t in filtered_hashes .keys ():
340
+ for a in filtered_hashes [t ].keys ():
325
341
for i in range (1 , config ["concurrency" ] + 1 ):
326
- lis .append ("results/statistics/filter-" + a + "-" + t + "." + trimmer_hashes [t ][a ]+ "/statistics_filtered_" + t + "_" + a + "-" + str (i )+ "-" + str (config ["concurrency" ])+ ".txt" )
342
+ lis .append ("results/statistics/filter-" + a + "-" + t + "." + filtered_hashes [t ][a ]+ "/statistics_filtered_" + t + "_" + a + "-" + str (i )+ "-" + str (config ["concurrency" ])+ ".txt" )
327
343
return lis
328
344
329
345
def pull_algn_info (wildcards ):
330
346
lis = []
331
- for t in trimmer_hashes .keys ():
332
- for a in trimmer_hashes [t ].keys ():
347
+ for t in filtered_hashes .keys ():
348
+ for a in filtered_hashes [t ].keys ():
333
349
for i in range (1 , config ["concurrency" ] + 1 ):
334
- lis .append ("results/statistics/filter-" + a + "-" + t + "." + trimmer_hashes [t ][a ]+ "/alignment_filter_information_" + t + "_" + a + ".txt" )
350
+ lis .append ("results/statistics/filter-" + a + "-" + t + "." + filtered_hashes [t ][a ]+ "/alignment_filter_information_" + t + "_" + a + ".txt" )
335
351
return lis
336
352
337
353
rule filter_align :
0 commit comments