-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmuseoscript.sh
239 lines (196 loc) · 7.55 KB
/
museoscript.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/bin/bash
# Loïs Rancilhac (loisrancilhac@gmail.com)
# September 2019
# dependencies :
# trimmomatic (http://www.usadellab.org/cms/?page=trimmomatic)
# seqtk (https://github.com/lh3/seqtk)
# vsearch (https://github.com/torognes/vsearch)
# parallel
###### PARSE ARGUMENTS #######
err="\n script to map illumina reads to a set of reference sequences, aimed at identifying museum specimens \n
\n
dependencies:\n
trimmomatic (http://www.usadellab.org/cms/?page=trimmomatic)\n
seqtk (https://github.com/lh3/seqtk)\n
vsearch (https://github.com/torognes/vsearch)\n
\n
arguments:\n
-h | --help : print this message and exit\n
-r | --ref : path to reference sequences (in fasta format) \n
-d | --raw : path to raw illumina reads (in .fastq.gz format) \n
-w | --work : path to working directory\n
-P | --phred : phred score below which bases should be masked with a N\n
-t | --threshold : similarity threshold to be used by vsearch to map align the reads\n
-T | --threads : number of threads to use. Default = 1\n
-C | --clean : Y/N, whether to trim adapters and quality check reads before aligning to the reference. Default = Y;
"
if [[ $# == 0 ]]
then
echo -e $err
exit 1
else
while [[ $# -gt 0 ]]
do
key="$1"
case "$key" in
-h | --help)
echo -e $err
shift
shift
;;
-r | --ref)
REF=$2
shift
shift
;;
-d | --raw)
RAW=$2
shift
shift
;;
-w | --work)
WORK=$2
shift
shift
;;
-t | --threshold)
THRESH=$2
shift
shift
;;
-P | --phred)
PHRED=$2
shift
shift
;;
-T | --threads)
THREAD=$2
shift
shift
;;
-C | --clean)
CLEAN=$2
shift
shift
;;
esac
done
fi
###### MAIN SCRIPT #######
# check for fastq files in the raw data directory and return an error if none found
FASTQ=$(ls $RAW/*.fastq.gz)
if [[ -z $FASTQ ]]
then
echo "no fastq files could be found in" $RAW
exit 1
else
NSAMPLES=$(echo $FASTQ | wc -w)
LSAMPLES=$(echo $FASTQ | sed 's/\.fastq\.gz//g' | sed 's/ /\n/g')
echo "$NSAMPLES samples to be processed:
$LSAMPLES"
fi
if [[ -z $THREAD ]]
then
THREAD=1
fi
if [[ -z $CLEAN ]]
then
CLEAN=Y
fi
#### STEP 1 ⁻ Data cleaning ####
if [[ $CLEAN = Y ]]
then
echo $(date "+%D %H:%M:%S") "- Step 1 : Quality filtering and trimming"
cd $RAW
mkdir trimmed_reads
# store the trimmomatic and seqtk commands to be executed
for i in *.fastq.gz
do
name=$(echo $i | sed 's/\.fastq\.gz//')
# trimm the reads; check for quality using a sliding window
echo "java -jar ~/softwares/Trimmomatic-0.38/trimmomatic-0.38.jar SE -phred33 $i trimmed_reads/$name-trimmed.fastq.gz ILLUMINACLIP:/home/lois/softwares/Trimmomatic-0.38/adapters/TruSeq3-SE.fa:2:30:10 LEADING:10 TRAILING:10 SLIDINGWINDOW:2:25 MINLEN:36 &> trimmed_reads/trimmomatic-$name.log" >> run_trimmomatic
# translate from fastq to fasta; mask the positions with quality below 30 with N
echo "seqtk seq -A -q30 -n N trimmed_reads/$name-trimmed.fastq.gz > trimmed_reads/$name-trimmed.fasta" >> run_seqtk
done
parallel --will-cite -j $THREAD < run_trimmomatic
parallel --will-cite -j $THREAD < run_seqtk
rm run_trimmomatic run_seqtk
echo $(date "+%D %H:%M:%S") "- Quality filtering and trimming done!"
fi
echo -e "\n"
echo -e $(date "+%D %H:%M:%S") "- Step 2 : aligning the reads to the references sequences with a similarity of $THRESH \n"
#### STEP 2 Reads mapping ####
for sample in $(ls $RAW/trimmed_reads/*.fasta | sed 's/\-trimmed\.fasta//')
do
SAMPLE=$(echo $sample | rev | cut -d'/' -f 1 | rev)
echo -e "\n"
centrer_term "#################### Parsing $SAMPLE ####################"
start=$(date "+%s")
echo -e "\n"
cd $WORK
mkdir $SAMPLE
cd $SAMPLE
# this loops align the reads to each reference sequence iteratively and stores the results in a *.vsearch file, as well as the headers of the reads that aligned in a *.read file
for ref in $(ls $REF)
do
#store reference name without extension
seq=$(echo $ref | sed "s/\.[a-zA-Z]*$//")
#execute vsearch
vsearch --usearch_global $RAW/trimmed_reads/$SAMPLE-trimmed.fasta --db $REF/$ref --id $THRESH --alnout $seq-$SAMPLE.vsearch &> $seq-$SAMPLE-vsearch.log
echo $(date "+%D %H:%M:%S") reads aligned to $ref
echo "grep 'Query >' $seq-$SAMPLE.vsearch > $seq-in-$SAMPLE.reads" >> run_grep_$SAMPLE
echo "sed -i 's/Query //' $seq-in-$SAMPLE.reads" >> run_sed_$SAMPLE
done
parallel --will-cite -j $THREAD < run_grep_*
parallel --will-cite -j $THREAD < run_sed_*
rm run_grep_* run_sed_*
# this block merges all the *.reads files and sorts them depending on the number of occurences of a read (thre categories : unique, 2 or 3+ occurences)
cat *.reads | cut -d'|' -f1 | sort | uniq -c | grep "^[[:space:]]*1 " | sed 's/^[[:space:]]*1 //' > reads.uniq
cat *.reads | cut -d'|' -f1 | sort | uniq -c | grep "^[[:space:]]*2 " | sed 's/^[[:space:]]*2 //' > reads.2shared
cat *.reads | cut -d'|' -f1 | sort | uniq -c | grep -E "^[[:space:]]*[3-9]+" | sed 's/^[[:space:]]*[3-9] //' > reads.3shared
echo "###### READS CATEGORIES DONE"
# split the reads files for parralelization
if [ -s reads.uniq ] ; then split -d --additional-suffix=reads.uniq -l $(expr `wc -l reads.uniq | cut -d' ' -f1` / $THREAD) reads.uniq ; else rm reads.uniq ; fi
if [ -s reads.2shared ] ; then split -d --additional-suffix=reads.2shared -l $(expr `wc -l reads.2shared | cut -d' ' -f1` / $THREAD) reads.2shared ; else rm reads.2shared ; fi
if [ -s reads.3shared ] ; then split -d --additional-suffix=reads.3shared -l $(expr `wc -l reads.3shared | cut -d' ' -f1` / $THREAD) reads.3shared ; else rm reads.3shared ; fi
# this loop checks the number of occurences of each read in the *.read files and write grep command files to retrieve them
for reads in *.reads
do
name=$(echo $reads | cut -d'.' -f1,2)
# Create the run files to be executed by parralel, containing the grep commands
if [ -s reads.uniq ] ; then for runiq in x*reads.uniq ; do echo "while read uniq; do grep \$uniq $reads >> $name.uniq ; done < $runiq" >> run_grep_uniq ; done ; fi
if [ -s reads.2shared ] ; then for r2shared in x*reads.2shared ; do echo "while read shared2; do grep \$shared2 $reads >> $name.2shared ; done < $r2shared" >> run_grep_2shared ; done ; fi
if [ -s reads.3shared ] ; then for r3shared in x*reads.3shared ; do echo "while read shared3; do grep \$shared3 $reads >> $name.3shared ; done < $r3shared" >> run_grep_3shared ; done ; fi
done
# execute the grep files in parralel
if [ -s run_grep_uniq ] ; then parallel --will-cite -j $THREAD < run_grep_uniq ; rm run_grep_uniq x*reads.uniq ; echo $(date "+%D %H:%M:%S") "done classifying unique matches" ; fi
if [ -s run_grep_2shared ] ; then parallel --will-cite -j $THREAD < run_grep_2shared ; rm run_grep_2shared x*reads.2shared ; echo $(date "+%D %H:%M:%S") "done classifying double matches" ; fi
if [ -s run_grep_3shared ] ; then parallel --will-cite -j $THREAD < run_grep_3shared ; rm run_grep_3shared x*reads.3shared ; echo $(date "+%D %H:%M:%S") "done classifying multiple matches" ; fi
# remove empty files
for i in *; do if [ ! -s $i ]; then echo $i "was deleted : file empty"; rm $i; fi; done
# generate summary file in csv format
echo "Threshold = $THRESH" > summary_"$SAMPLE".csv
echo "ref" "$SAMPLE" "Nuniq" "Nshared_2" "Nshared_3" >> summary_"$SAMPLE".csv
# count the number of reads of each category and writes it to the summary file
for i in $(ls *.reads | cut -d'.' -f1,2)
do
ref=$(echo $i | sed 's/_in.*//')
if [ -e "$i".uniq ]
then
Nuniq=$(wc -l "$i".uniq | cut -d' ' -f1)
else Nuniq=0
fi
if [ -e "$i".2shared ]
then
Nshared2=$(wc -l "$i".2shared | cut -d' ' -f1)
else Nshared2=0
fi
if [ -e "$i".3shared ]
then
Nshared3=$(wc -l "$i".3shared | cut -d' ' -f1)
else Nshared3=0
fi
echo $ref $SAMPLE $Nuniq $Nshared2 $Nshared3 >> summary_"$SAMPLE".csv
done
cat summary_"$SAMPLE".csv | sed '1,2d' >> $WORK/summary.csv
done