Skip to content

Commit 908d6b5

Browse files
committed
Improved edge case handling in MBE; bam2pe cleanup; minor fixes
1 parent 2342d27 commit 908d6b5

6 files changed

+411
-172
lines changed

pipeline/GoodPanGenomeGraph.snakefile

+24-20
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ copts = config["clusterOpts"]
3030

3131
rule all:
3232
input:
33-
chrsize = expand(outdir + "{genome}.{hap}.chrSize", genome=genomes, hap=haps),
33+
#chrsize = expand(outdir + "{genome}.{hap}.chrSize", genome=genomes, hap=haps),
3434
faAln = expand(outdir + "{genome}.{hap}.aln.foo", genome=genomes, hap=haps),
3535
lift = expand(outdir + "{genome}/lift.foo", genome=genomes),
3636
TRfa = expand(outdir + "{genome}.{hap}.tr.fasta", genome=genomes, hap=haps),
@@ -65,9 +65,6 @@ ln -sf {input.fa} .
6565
for hap in 0 1; do
6666
fa={params.indir}/{wildcards.genome}.$hap.fa
6767
ln -sf "$fa".fai .
68-
#samtools faidx $fa &
69-
{params.sd}/script/chrsize.sh $fa > {wildcards.genome}.$hap.chrSize
70-
wait
7168
done
7269
"""
7370

@@ -209,13 +206,14 @@ rule JointTRAnnotation:
209206
mapping = outdir + "OrthoMap.v2.tsv",
210207
TRfa = expand(outdir + "{genome}.{hap}.tr.fasta", genome=genomes, hap=haps),
211208
resources:
212-
cores = 6,
213-
mem = lambda wildcards, attempt: 40 + 20*(attempt-1)
209+
cores = 12,
210+
mem = lambda wildcards, attempt: 110
214211
priority: 96
215212
params:
216213
copts = copts,
217214
sd = srcdir,
218215
od = outdir,
216+
indir = indir,
219217
refTR = config["refTR"],
220218
ksize = ksize,
221219
FS = FS,
@@ -230,15 +228,16 @@ set -eu
230228
ulimit -c 20000
231229
cd {params.od}
232230
233-
echo "Generating panbed"
231+
printf "Generating panbed"
234232
cut -f 1-3 {params.refTR} >pan.tr.mbe.v0.bed
235233
for g in {params.genomes}; do
234+
printf "."
236235
bedtools map -c 1 -o count -a pan.tr.mbe.v0.bed -b <(cut -f 4-6 $g/tmp1.0.bed) >pan.tr.mbe.v0.bed.tmp &&
237236
mv pan.tr.mbe.v0.bed.tmp pan.tr.mbe.v0.bed
238237
done
239-
#{params.sd}/script/preMBE.py {params.pairs} pan.tr.mbe.v0.bed {params.TRwindow}
240-
{params.sd}/script/multiBoundaryExpansion.py {params.ksize} {params.FS} {params.TRwindow} {params.pairs} pan.tr.mbe.v0.bed {params.th1} {params.th2}
241-
#{params.sd}/script/writeMBEbed.py {params.th1} {params.th2}
238+
echo ""
239+
mkdir -p MBE
240+
{params.sd}/script/multiBoundaryExpansion.parallel.py {params.ksize} {params.FS} {params.TRwindow} {params.pairs} pan.tr.mbe.v0.bed {params.th1} {params.th2} {resources.cores} {params.indir}
242241
hi=0
243242
for g in {params.genomes}; do
244243
for h in 0 1; do
@@ -255,10 +254,10 @@ for g in {params.genomes}; do
255254
done
256255
done >mbe.m0.loci
257256
rm tmp.bed
258-
{params.sd}/script/mergeMBEbed.py {params.pairs} pan.tr.mbe.v0.bed
257+
{params.sd}/script/mergeMBEbed.py {params.pairs} {params.th2}
259258
260259
### write fasta
261-
echo "Fetching TR+flank"
260+
echo "Fetching TR+flank" $(date)
262261
hi=0
263262
for g in {params.genomes}; do
264263
for h in 0 1; do
@@ -269,7 +268,7 @@ for g in {params.genomes}; do
269268
$3=$3+{params.FS}
270269
print $0
271270
}}' |
272-
{params.sd}/script/SelectRegions.py /dev/stdin "$g"."$h".fa /dev/stdout |
271+
{params.sd}/script/SelectRegions.py /dev/stdin {params.indir}/"$g"."$h".fa /dev/stdout |
273272
awk '{{if ($1 ~ />/) {{print}} else {{print toupper($0)}} }}' >"$g"."$h".tr.fasta
274273
((++hi))
275274
done
@@ -284,10 +283,10 @@ rule GenRawGenomeGraph:
284283
mapping = outdir + "OrthoMap.v2.tsv",
285284
output:
286285
rawPBkmers = expand(outdir + "{{genome}}.rawPB.{kmerType}.kmers", kmerType=kmerTypes),
287-
rawILkmers = outdir + "{genome}.rawIL.tr.kmers"
286+
rawILkmers = [outdir + "{genome}.rawIL.tr.kmers"] if prune else []
288287
resources:
289-
cores = 24,
290-
mem = lambda wildcards, attempt: 20 #90 + 20*(attempt-1)
288+
cores = 24 if prune else 1,
289+
mem = lambda wildcards, attempt: 25 + 20*(attempt-1)
291290
priority: 95
292291
params:
293292
copts = copts,
@@ -299,17 +298,21 @@ rule GenRawGenomeGraph:
299298
rth = rth,
300299
rstring = rstring,
301300
thcth = thcth,
302-
hi = lambda wildcards: 2*genomes.index(wildcards.genome)
301+
hi = lambda wildcards: 2*genomes.index(wildcards.genome),
302+
prune = int(prune)
303303
shell:"""
304304
set -eu
305305
ulimit -c 20000
306306
cd {params.od}
307+
module load gcc
307308
308309
{params.sd}/bin/vntr2kmers_thread -g -m <(cut -f $(({params.hi}+1)),$(({params.hi}+2)) {input.mapping}) -k {params.ksize} -fs {params.FS} -ntr {params.FS} -o {wildcards.genome}.rawPB -fa 2 {input.TRfa}
309310
310-
samtools fasta -@2 -n {input.ILbam} |
311-
{params.sd}/bin/bam2pe -fai /dev/stdin |
312-
{params.sd}/bin/danbing-tk -g {params.thcth} -k {params.ksize} -qs {params.od}/{wildcards.genome}.rawPB -fai /dev/stdin -o {wildcards.genome}.rawIL -p {resources.cores} -cth {params.cth} -rth {params.rth}
311+
if [ {params.prune} == "1" ]; then
312+
samtools fasta -@2 -n {input.ILbam} |
313+
{params.sd}/bin/bam2pe -fai /dev/stdin |
314+
{params.sd}/bin/danbing-tk -g {params.thcth} -k {params.ksize} -qs {params.od}/{wildcards.genome}.rawPB -fai /dev/stdin -o {wildcards.genome}.rawIL -p {resources.cores} -cth {params.cth} -rth {params.rth}
315+
fi
313316
"""
314317

315318

@@ -385,6 +388,7 @@ rule GenPanGenomeGraph:
385388
shell:"""
386389
cd {params.od}
387390
ulimit -c 20000
391+
module load gcc
388392
389393
{params.sd}/bin/genPanKmers -o pan -m - -k {params.kmerpref}
390394
"""

script/liftbed.clean.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def cleanbed():
7474

7575

7676
if __name__ == "__main__":
77-
lb = np.loadtxt(sys.argv[1], dtype=object, ndmin=2)
77+
lb = np.loadtxt(sys.argv[1], dtype=object, ndmin=2, comments=None)
7878
cleanbed()
7979

8080

script/mergeMBEbed.py

+31-24
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@ def parseMergeSet():
99
v2si = {}
1010
si = 0
1111
with open("mbe.m0.loci") as f:
12+
hap = ""
1213
for line in f:
1314
if line[0] == ">":
15+
hap = line.rstrip()[1:]
1416
continue
1517
seq = sorted([int(v) for v in line.rstrip().split(",")])
1618
skip = seq[0] in bs # check if good v reported by this hap is bad in another hap
@@ -23,7 +25,7 @@ def parseMergeSet():
2325
ms[si_] = None
2426
v2si.pop(v)
2527
bs.add(v)
26-
print(f"bad {seq}")
28+
print(f"Bad seq {seq} in {hap}")
2729
break
2830
else:
2931
if skip:
@@ -50,50 +52,53 @@ def parseMergeSet():
5052
ms[si_].add(v)
5153
v2si[v] = si_
5254
else:
53-
assert False, f"{seq} indicates merging across sets defined in {ms}"
55+
print(f"[Warning] {hap} {seq} indicates merging across sets, will be ignored", flush=True) # TODO enable merging >2 loci
5456
ms = np.array(ms, dtype=object)
5557
ms = ms[ms!=None].tolist()
5658
for i1s_ in ms:
5759
assert len(i1s_ & bs) == 0
5860
return ms, bs
5961

6062
def getdist(bed):
61-
"""Get the distnace between two bed entries. Return 0 if overlapping"""
62-
if int(bed[0,0]) < int(bed[1,0]): # no inversion
63-
return max(0, int(bed[1,0]) - int(bed[0,1]))
63+
out = []
64+
if int(bed[0,2]) == 1: # no inversion
65+
for i in range(bed.shape[0]-1):
66+
out.append(int(bed[i+1,0]) - int(bed[i,1]))
6467
else:
65-
return max(0, int(bed[0,0]) - int(bed[1,1]))
68+
for i in range(bed.shape[0]-1):
69+
out.append(int(bed[i,0]) - int(bed[i+1,1]))
70+
return out
6671

67-
def writeBed_MergeMBE():
72+
def writeBed_MergeMBE(MAXSVLEN=10000):
6873
ms, bs = parseMergeSet()
6974

7075
# QC on merging set
71-
panbed = np.loadtxt(f"pan.tr.mbe.v1.bed", dtype=object, ndmin=2)
72-
_, ng = panmap.shape
76+
panbed = np.loadtxt(f"pan.tr.mbe.v1.bed", dtype=object, ndmin=2, comments=None)
7377
i1togood = {}
7478
qcb = [] # QC bad
7579
for i1s_ in ms:
76-
if len(i1s_) > 2:
77-
qcb.append(i1s_)
78-
print(f"merging more than two regions: {i1s_}")
79-
continue
8080
i1s = sorted(list(i1s_))
81-
dist = np.full(2*ng, np.nan)
81+
nm = len(i1s)-1
82+
dist = np.full([nm, 2*ng], np.nan)
8283
for hi in range(2*ng):
8384
if np.all(panbed[i1s,3+hi*4] != "None"):
8485
if np.any(panbed[i1s,3+hi*4] != panbed[i1s[0],3+hi*4]):
85-
print(f"remove haplotype due to merging across contigs: {hi}\t{i1s}\n {panbed[i1s,3+hi*4]}")
86+
print(f"[Haplotype removed] merging across contigs: {hi}\t{i1s}\n {panbed[i1s,3+hi*4]}")
8687
else:
8788
if np.any(panbed[i1s,6+hi*4] != panbed[i1s[0],6+hi*4]):
88-
print("mixed orientation")
89-
dist[hi] = getdist(panbed[i1s,4+hi*4:6+hi*4])
90-
good = np.isfinite(dist) # good mask
91-
th = 3*np.std(dist[good]) + 100
92-
bad = np.abs(dist[good] - np.median(dist[good])) > th # bad outliers
93-
good[good] = ~bad
94-
if np.sum(good)/(2*ng) < 0.8: # remove locus
89+
print(f"[Note] {i1s} mixed orientation")
90+
dist[:,hi] = getdist(panbed[i1s,4+hi*4:7+hi*4])
91+
good = np.all(np.isfinite(dist), axis=0)
92+
#for i in range(nm):
93+
# th = 3*np.nanstd(dist[i]) + 100
94+
# outm = np.abs(dist[i] - np.nanmedian(dist[i])) <= th # outlier haps # XXX more robust thresholding
95+
# good &= outm
96+
if np.nanmax(dist) > MAXSVLEN:
97+
qcb.append(i1s_)
98+
print(f"[Loci removed] huge SV, {i1s}")
99+
elif np.sum(good)/(2*ng) < THRESH:
95100
qcb.append(i1s_)
96-
print(f"{i1s} removed by QC")
101+
print(f"[Loci removed] QC failed {i1s}")
97102
else:
98103
i1togood[i1s[0]] = good # record hap to remove
99104
for i1s_ in qcb:
@@ -150,5 +155,7 @@ def writeBed_MergeMBE():
150155

151156
if __name__ == "__main__":
152157
gs = np.loadtxt(sys.argv[1], usecols=0, dtype=object, ndmin=1)
153-
panmap = np.loadtxt(sys.argv[2], dtype=object, ndmin=2)[:,3:].astype(int)
158+
ng = gs.size
159+
#panmap = np.loadtxt(sys.argv[2], dtype=object, ndmin=2)[:,3:].astype(int)
160+
THRESH = float(sys.argv[2])
154161
writeBed_MergeMBE()

0 commit comments

Comments
 (0)