Skip to content

Commit 9a1aa16

Browse files
authored
Improve ondisk benchmarks (#132)
1 parent 0cd3b19 commit 9a1aa16

File tree

2 files changed

+22
-17
lines changed

2 files changed

+22
-17
lines changed

benchmark/benchmark_ondisk.jl

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,13 @@ function psample_file_pop(data, rngs, n)
5151
push!(samples, s)
5252
push!(weights, Wtot)
5353
if length(samples) == 10
54-
samples = [combine(rngs[j], samples, weights),]
54+
samples = [combine(rngs, samples, weights),]
5555
weights = [sum(weights),]
5656
end
5757
end
5858
end
5959
end
60-
return combine(rngs[1], samples, weights)
60+
return combine(rngs, samples, weights)
6161
end
6262

6363
function sample_file_rs(data, rng, n, alg)
@@ -97,7 +97,7 @@ function psample_file_st(data, rngs, n, alg)
9797
samples[i] = collect(StreamSampler{dtype}(rngs[i], @view(data[c]), wf, n, W, alg))
9898
weights[i] = W
9999
end
100-
return combine(rngs[1], samples, weights)
100+
return combine(rngs, samples, weights)
101101
end
102102

103103
filename = "random_data.arrow"
@@ -117,10 +117,15 @@ precompile(sample_file_st, typeof.((data, rng, n, AlgORDWSWR())))
117117
precompile(psample_file_st, typeof.((data, rngs, n, AlgORDWSWR())))
118118

119119
times = []
120-
for n in (totaltpl ÷ 100000, totaltpl ÷ 10000, totaltpl ÷ 1000)
121-
t1 = @elapsed sample_file_pop(data, rng, n);
122-
t2 = @elapsed psample_file_pop(data, rngs, n);
123-
120+
for n in (totaltpl ÷ 100000, totaltpl ÷ 10000, totaltpl ÷ 1000, totaltpl ÷ 100)
121+
122+
if n != totaltpl ÷ 100
123+
t1 = @elapsed sample_file_pop(data, rng, n);
124+
t2 = @elapsed psample_file_pop(data, rngs, n);
125+
else
126+
t1 = nothing
127+
t2 = nothing
128+
end
124129
t3 = @elapsed sample_file_st(data, rng, n, AlgORDWSWR());
125130
t4 = @elapsed psample_file_st(data, rngs, n, AlgORDWSWR());
126131

@@ -133,9 +138,9 @@ times = hcat(times...)
133138

134139
using CairoMakie
135140

136-
x = 1:3
137-
xtick_positions = [1,2,3]
138-
xtick_labels = ["0.001%","0.01%","0.1%"]
141+
x = 1:4
142+
xtick_positions = [1,2,3,4]
143+
xtick_labels = ["0.001%","0.01%","0.1%","1%"]
139144

140145
algonames = ["chunks", "chunks (4 threads)", "stream", "stream (4 threads)",
141146
"reservoir", "reservoir (4 threads)",]
@@ -147,21 +152,21 @@ ax = Axis(fig[1, 1]; xlabel = "sample size", ylabel = "time (s)",
147152
xticks = (xtick_positions, xtick_labels),
148153
xgridstyle = :dot, ygridstyle = :dot,
149154
xticklabelsize = 10, yticklabelsize = 10,
150-
xlabelsize = 12, ylabelsize = 12,
155+
xlabelsize = 12, ylabelsize = 12
151156
)
152157

153158
for i in 1:size(times, 1)
154-
scatterlines!(ax, x, times[i, :];
159+
scatterlines!(ax, x, [x == nothing ? Inf : x for x in times[i, :]];
155160
label = algonames[i],
156161
linestyle = (:dash, :dense),
157162
marker = markers[i],
158-
markersize = 8,
159-
linewidth = 2)
163+
markersize = 13,
164+
linewidth = 2,)
160165
end
161166

162-
167+
ylims!(low=0, high = 250)
163168
fig[2, 1] = Legend(fig, ax, framevisible = false, orientation = :horizontal,
164169
halign = :center, nbanks=2, fontsize=10)
165170

166171
fig
167-
save("comparison_ondisk_algs.pdf", fig)
172+
save("comparison_ondisk_algs.svg", fig)

docs/src/benchmark.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ We also tried to evaluate the performance of the procedures on persistent data.
2121
performance of weighted sampling with replacement from 100 GB of data in the Arrow format stored on
2222
disk:
2323

24-
![comparison_ondisk_algs](https://github.com/user-attachments/assets/622c5d03-07f2-428c-9bb5-6d6fcc629bec)
24+
![comparison_ondisk_algs](https://github.com/user-attachments/assets/a6bc09a0-12c9-4a7b-9cc7-0e25edf35eba)
2525

2626
The "chunks" method uses `StatsBase.sample` along with the merging methods of this package to sample
2727
subsequent chunks of the stored data and then recombine the samples. The other methods employ the

0 commit comments

Comments
 (0)