-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimulate_read_distribution.py
More file actions
119 lines (101 loc) · 2.95 KB
/
simulate_read_distribution.py
File metadata and controls
119 lines (101 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import numpy as np
from numpy.random import exponential as expo
from random import randint,sample
from Bio import SeqIO
from argparse import ArgumentParser
from bisect import bisect_left, bisect_right
parser = ArgumentParser()
parser.add_argument("contigfile", help="Contig File")
parser.add_argument("--lenfile", help="Length of Reads File")
parser.add_argument("--ul_lenfile", help="Length of Ultralong-Reads File")
args = parser.parse_args()
contigs = {}
for read in SeqIO.parse(args.contigfile, "fasta"):
contigs[read.id] = len(read.seq)
lengths = []
if args.lenfile:
with open(args.lenfile) as f:
for line in f:
lengths.append(int(line.rstrip()))
ul_lengths = []
if args.ul_lenfile:
with open(args.ul_lenfile) as f:
for line in f:
ul_lengths.append(int(line.rstrip()))
lchr6 =170805979
smhc = 28510120
emhc = 33480577
lmhc = emhc - smhc
pos = 0
nr = 0
crucial_points = []
while pos < lmhc:
ctgs = sample(contigs.keys(),1)
crucial_points.append(pos + contigs[ctgs[0]])
pos += contigs[ctgs[0]]
#print(pos)
nr += 1
cp_fixed = {}
# LSK108
# Number of reads in chr6 and or in contigs
nreads = 37844 + 1085
#nreads = nreads*5
nul_reads = 10511 - 9 + 4735 - 2208
nreads = int(nreads*2)
#nul_reads = nul_reads*2
def find_ge(a, x):
'Find leftmost index with item greater than or equal to x'
i = bisect_left(a, x)
if i != len(a):
return i
else:
return len(a)-1
raise ValueError
#nreads = int(nreads/2)
good = 0
bad = 0
unbridged = 0
runs = 1000
for run in range(0,runs):
for cp in crucial_points:
cp_fixed[cp] = 0
#mhc = np.zeros(lmhc)
# the length can be modelled as an exponential distribution
if args.lenfile:
draws = sample(lengths,nreads)
else:
draws = expo(8000,[nreads,1])+1000
if args.ul_lenfile:
ul_draws = sample(ul_lengths,nul_reads)
else:
ul_draws = []
print(" " + str(run+1)+"\r" , end='', flush = True)
all_draws = draws + ul_draws
for draw in all_draws:
beg = randint(1,int(lchr6-draw))
end = beg+draw
bidx = 0
eidx = 0
if (beg > smhc and beg < emhc) or (end > smhc and end < emhc):
if beg > smhc:
bidx = int(beg-smhc -1)
else:
bidx = 0
if end < emhc:
eidx = int(bidx + draw)
else:
eidx = int(lmhc)
if eidx - bidx < 100:
continue
cpil = bisect_right(crucial_points, bidx+50)
cpir = bisect_right(crucial_points, eidx-50)
if cpil < cpir:
for cp in crucial_points[cpil:cpir+1]:
cp_fixed[cp] += 1
#print(cp_fixed)
if 0 not in cp_fixed.values():
good += 1
else:
bad += 1
unbridged += list(cp_fixed.values()).count(0)
print("Good: " + str(good) + "\tBad: " + str(bad) +"\tUnbridged: " + str(unbridged/runs))