diff --git a/Cargo.lock b/Cargo.lock index 0b55a11..5ce4c8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloca" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4" +dependencies = [ + "cc", +] + [[package]] name = "anes" version = "0.1.6" @@ -71,6 +80,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + [[package]] name = "bumpalo" version = "3.19.0" @@ -83,6 +98,16 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "cc" +version = "1.2.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +dependencies = [ + "find-msvc-tools", + "shlex", +] + [[package]] name = "cfb" version = "0.12.1" @@ -92,6 +117,7 @@ dependencies = [ "fnv", "rand", "rand_pcg", + "tempfile", "time", "uuid", "web-time", @@ -178,10 +204,11 @@ checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "criterion" -version = "0.7.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" +checksum = "4d883447757bb0ee46f233e9dc22eb84d93a9508c9b868687b274fc431d886bf" dependencies = [ + "alloca", "anes", "cast", "ciborium", @@ -190,6 +217,7 @@ dependencies = [ "itertools", "num-traits", "oorandom", + "page_size", "plotters", "rayon", "regex", @@ -201,9 
+229,9 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.6.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" +checksum = "ed943f81ea2faa8dcecbbfa50164acf95d555afec96a27871663b300e387b2e4" dependencies = [ "cast", "itertools", @@ -246,6 +274,28 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "find-msvc-tools" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" + [[package]] name = "fnv" version = "1.0.7" @@ -263,6 +313,18 @@ dependencies = [ "wasi", ] +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + [[package]] name = "half" version = "2.7.1" @@ -307,9 +369,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.144" +version = "0.2.180" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" + +[[package]] +name = "linux-raw-sys" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "memchr" @@ -338,6 +406,16 @@ version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" +[[package]] +name = "page_size" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "plotters" version = "0.3.7" @@ -390,6 +468,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "rand" version = "0.8.5" @@ -417,7 +501,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.9", ] [[package]] @@ -478,6 +562,19 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +[[package]] +name = "rustix" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -542,6 +639,12 @@ dependencies = [ "serde_core", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "strsim" version = 
"0.11.1" @@ -559,6 +662,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" +dependencies = [ + "fastrand", + "getrandom 0.3.4", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "time" version = "0.3.21" @@ -619,6 +735,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "wasm-bindgen" version = "0.2.106" @@ -684,6 +809,22 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -693,6 +834,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-sys" version = "0.52.0" @@ -759,6 +906,12 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + [[package]] name = "zerocopy" version = "0.8.31" diff --git a/Cargo.toml b/Cargo.toml index 35e6dce..d0430ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,8 @@ clap = { version = "4.4", features = ["derive"] } rand = "0.8" rand_pcg = "0.3" time = "0.3" -criterion = { version = "0.7", features = ["html_reports"] } +criterion = { version = "0.8", features = ["html_reports"] } +tempfile = "3" [[bench]] name = "benchmark" diff --git a/benches/benchmark.rs b/benches/benchmark.rs index 5ab1b57..0fb4740 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -1,46 +1,137 @@ -use cfb::CompoundFile; +use cfb::{CompoundFile, CreateStreamOptions}; use criterion::{ criterion_group, criterion_main, BenchmarkId, Criterion, Throughput, }; +use std::fs::OpenOptions; use std::hint::black_box; use std::io::Cursor; use std::io::Write; +use tempfile::NamedTempFile; -fn write_many_streams(n: usize, size: usize) -> Vec { +fn write_many_streams( + n: usize, + size: usize, + stream_buffer_size: Option, +) -> Vec { let mut buff = Vec::new(); { let mut test_comp = CompoundFile::create(Cursor::new(&mut buff)).unwrap(); let data = vec![0; size]; for i in 0..n { - let mut stream = - test_comp.create_stream(format!("test{i}")).unwrap(); + let name = format!("test{i}"); + let mut stream = match stream_buffer_size { + Some(buf_size) => { + let options = + CreateStreamOptions::new().buffer_size(buf_size); + test_comp + .create_stream_with_options(name, options) + .unwrap() + } + None => test_comp.create_stream(name).unwrap(), + }; stream.write_all(&data).unwrap(); } } buff } +fn write_many_streams_disk( + n: usize, + size: usize, + stream_buffer_size: Option, +) { + let tmpfile = NamedTempFile::new().unwrap(); + { + 
let mut test_comp = CompoundFile::create( + OpenOptions::new() + .read(true) + .write(true) + .open(tmpfile.path()) + .unwrap(), + ) + .unwrap(); + let data = vec![0; size]; + for i in 0..n { + let name = format!("test{i}"); + let mut stream = match stream_buffer_size { + Some(buf_size) => { + let options = + CreateStreamOptions::new().buffer_size(buf_size); + test_comp + .create_stream_with_options(name, options) + .unwrap() + } + None => test_comp.create_stream(name).unwrap(), + }; + stream.write_all(&data).unwrap(); + } + } + // File is deleted when tmpfile is dropped +} + fn criterion_benchmark(c: &mut Criterion) { - // many small streams + // Buffer sizes to compare (in bytes). `None` means the default internal + // stream buffer size. + let buffer_sizes: &[(Option, &str)] = &[ + (None, "default"), + (Some(8 * 1024), "buffer=8192"), + (Some(64 * 1024), "buffer=65536"), + (Some(256 * 1024), "buffer=262144"), + (Some(1024 * 1024), "buffer=1048576"), + ]; + + // many small streams with throughput reporting + let mut small = c.benchmark_group("write many smaller streams"); let size = 64usize; let n = 1000; - c.bench_function("write many smaller streams", |b| { - b.iter(|| { - let out = write_many_streams(black_box(n), black_box(size)); - black_box(out); - }) - }); + let total_bytes = (n * size) as u64; + small.sample_size(10); + small.throughput(Throughput::Bytes(total_bytes)); - // a few medium streams + for (buf, label) in buffer_sizes { + small.bench_with_input( + BenchmarkId::new("total", *label), + &size, + |b, &s| { + b.iter(|| { + let out = write_many_streams( + black_box(n), + black_box(s), + buf.map(black_box), + ); + black_box(out); + }) + }, + ); + } + small.finish(); + + // several medium streams with throughput reporting + let mut medium = c.benchmark_group("write several medium streams"); let size = 1024 * 1024usize; let n = 50; - c.bench_function("write several medium streams", |b| { - b.iter(|| { - let out = write_many_streams(black_box(n), 
black_box(size)); - black_box(out); - }) - }); + let total_bytes = (n * size) as u64; + medium.sample_size(10); + medium.throughput(Throughput::Bytes(total_bytes)); + + for (buf, label) in buffer_sizes { + medium.bench_with_input( + BenchmarkId::new("total", *label), + &size, + |b, &s| { + b.iter(|| { + let out = write_many_streams( + black_box(n), + black_box(s), + buf.map(black_box), + ); + black_box(out); + }) + }, + ); + } + medium.finish(); // single large stream with throughput reporting let mut group = c.benchmark_group("write large stream"); @@ -48,17 +139,48 @@ fn criterion_benchmark(c: &mut Criterion) { let n = 1; group.sample_size(10); group.throughput(Throughput::Bytes(size as u64)); - group.bench_with_input( - BenchmarkId::from_parameter("size"), - &size, - |b, &s| { - b.iter(|| { - let out = write_many_streams(black_box(n), black_box(s)); - black_box(out); - }) - }, - ); + for (buf, label) in buffer_sizes { + group.bench_with_input( + BenchmarkId::new("total", *label), + &size, + |b, &s| { + b.iter(|| { + let out = write_many_streams( + black_box(n), + black_box(s), + buf.map(black_box), + ); + black_box(out); + }) + }, + ); + } group.finish(); + + // many small streams with throughput reporting (disk) + let mut small_disk = + c.benchmark_group("write many smaller streams (disk)"); + let size = 64usize; + let n = 1000; + let total_bytes = (n * size) as u64; + small_disk.sample_size(10); + small_disk.throughput(Throughput::Bytes(total_bytes)); + for (buf, label) in buffer_sizes { + small_disk.bench_with_input( + BenchmarkId::new("total", *label), + &size, + |b, &s| { + b.iter(|| { + write_many_streams_disk( + black_box(n), + black_box(s), + buf.map(black_box), + ); + }) + }, + ); + } + small_disk.finish(); } criterion_group!(benches, criterion_benchmark); diff --git a/src/internal/stream.rs b/src/internal/stream.rs index 2210291..f18abbb 100644 --- a/src/internal/stream.rs +++ b/src/internal/stream.rs @@ -4,16 +4,12 @@ use std::sync::{Arc, RwLock, 
Weak}; //===========================================================================// -const BUFFER_SIZE: usize = 8192; - -//===========================================================================// - /// A stream entry in a compound file, much like a filesystem file. pub struct Stream { minialloc: Weak>>, stream_id: u32, total_len: u64, - buffer: Box<[u8; BUFFER_SIZE]>, + buffer: Vec, buf_pos: usize, buf_cap: usize, buf_offset_from_start: u64, @@ -24,14 +20,16 @@ impl Stream { pub(crate) fn new( minialloc: &Arc>>, stream_id: u32, + buffer_size: usize, ) -> Stream { let total_len = minialloc.read().unwrap().dir_entry(stream_id).stream_len; + let buffer_size = buffer_size.max(1); Stream { minialloc: Arc::downgrade(minialloc), stream_id, total_len, - buffer: Box::new([0; BUFFER_SIZE]), + buffer: vec![0; buffer_size], buf_pos: 0, buf_cap: 0, buf_offset_from_start: 0, diff --git a/src/lib.rs b/src/lib.rs index 2c216a1..0d5ec8a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,8 @@ pub use crate::internal::{Entries, Entry, Stream, Version}; #[macro_use] mod internal; +const DEFAULT_STREAM_BUFFER_SIZE: usize = 8192; + //===========================================================================// /// Opens an existing compound file at the given path in read-only mode. @@ -102,6 +104,78 @@ fn create_with_path(path: &Path) -> io::Result> { //===========================================================================// +/// Options for creating a stream within a compound file. +pub struct CreateStreamOptions { + pub(crate) buffer_size: usize, + pub(crate) overwrite: bool, +} + +impl CreateStreamOptions { + /// Creates a new `CreateStreamOptions` with default settings. + pub fn new() -> Self { + CreateStreamOptions::default() + } + + /// Sets the buffer size to use when reading/writing the stream. + /// + /// The buffer size determines how much data is read/written at a time when + /// accessing the stream. 
A larger buffer size can improve performance for + /// large streams, while a smaller buffer size can reduce memory usage. + pub fn buffer_size(mut self, size: usize) -> Self { + self.buffer_size = size; + self + } + + /// Sets whether to overwrite an existing stream at the given path when + /// creating the stream. + /// + /// If `overwrite` is set to `true`, and a stream already exists at the + /// given path, the existing stream will be truncated to zero length. + /// If `overwrite` is set to `false`, and a stream already exists at the + /// given path, an error will be returned. + pub fn overwrite(mut self, overwrite: bool) -> Self { + self.overwrite = overwrite; + self + } +} + +impl Default for CreateStreamOptions { + fn default() -> Self { + CreateStreamOptions { + buffer_size: DEFAULT_STREAM_BUFFER_SIZE, + overwrite: false, + } + } +} + +/// Options for opening a stream within a compound file. +pub struct OpenStreamOptions { + pub(crate) buffer_size: usize, +} + +impl OpenStreamOptions { + /// Creates a new `OpenStreamOptions` with default settings. + pub fn new() -> Self { + OpenStreamOptions::default() + } + + /// Sets the buffer size to use when reading/writing the stream. + /// + /// The buffer size determines how much data is read/written at a time when + /// accessing the stream. A larger buffer size can improve performance for + /// large streams, while a smaller buffer size can reduce memory usage. + pub fn buffer_size(mut self, size: usize) -> Self { + self.buffer_size = size; + self + } +} + +impl Default for OpenStreamOptions { + fn default() -> Self { + OpenStreamOptions { buffer_size: DEFAULT_STREAM_BUFFER_SIZE } + } +} + /// A compound file, backed by an underlying reader/writer (such as a /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) or /// [`Cursor`](https://doc.rust-lang.org/std/io/struct.Cursor.html)).
@@ -307,10 +381,28 @@ impl CompoundFile { &mut self, path: P, ) -> io::Result> { - self.open_stream_with_path(path.as_ref()) + self.open_stream_with_path(path.as_ref(), DEFAULT_STREAM_BUFFER_SIZE) + } + + /// Opens an existing stream in the compound file for reading and/or + /// writing (depending on what the underlying file supports), with a specified buffer size. + /// + /// The buffer size determines how much data is read/written at a time when + /// accessing the stream. A larger buffer size can improve performance for + /// large streams, while a smaller buffer size can reduce memory usage. + pub fn open_stream_with_options>( + &mut self, + path: P, + options: OpenStreamOptions, + ) -> io::Result> { + self.open_stream_with_path(path.as_ref(), options.buffer_size) } - fn open_stream_with_path(&mut self, path: &Path) -> io::Result> { + fn open_stream_with_path( + &mut self, + path: &Path, + buffer_size: usize, + ) -> io::Result> { let names = internal::path::name_chain_from_path(path)?; let path = internal::path::path_from_name_chain(&names); let stream_id = match self.stream_id_for_name_chain(&names) { @@ -320,7 +412,7 @@ impl CompoundFile { if self.minialloc().dir_entry(stream_id).obj_type != ObjType::Stream { invalid_input!("Not a stream: {:?}", path); } - Ok(Stream::new(&self.minialloc, stream_id)) + Ok(Stream::new(&self.minialloc, stream_id, buffer_size)) } } @@ -858,7 +950,11 @@ impl CompoundFile { &mut self, path: P, ) -> io::Result> { - self.create_stream_with_path(path.as_ref(), true) + self.create_stream_with_path_and_buffer_size( + path.as_ref(), + true, + DEFAULT_STREAM_BUFFER_SIZE, + ) } /// Creates and returns a new, empty stream object at the provided path. 
@@ -868,13 +964,35 @@ impl CompoundFile { &mut self, path: P, ) -> io::Result> { - self.create_stream_with_path(path.as_ref(), false) + self.create_stream_with_path_and_buffer_size( + path.as_ref(), + false, + DEFAULT_STREAM_BUFFER_SIZE, + ) + } + + /// Creates and returns a new, empty stream object at the provided path, + /// configured by the given `CreateStreamOptions`. + /// + /// `options.overwrite` decides whether an existing stream is truncated or + /// rejected with an error; `options.buffer_size` sizes its buffer. + pub fn create_stream_with_options>( + &mut self, + path: P, + options: CreateStreamOptions, + ) -> io::Result> { + self.create_stream_with_path_and_buffer_size( + path.as_ref(), + options.overwrite, + options.buffer_size, + ) } - fn create_stream_with_path( + fn create_stream_with_path_and_buffer_size( &mut self, path: &Path, overwrite: bool, + buffer_size: usize, ) -> io::Result> { let mut names = internal::path::name_chain_from_path(path)?; if let Some(stream_id) = self.stream_id_for_name_chain(&names) { @@ -893,7 +1011,8 @@ impl CompoundFile { internal::path::path_from_name_chain(&names) ); } else { - let mut stream = Stream::new(&self.minialloc, stream_id); + let mut stream = + Stream::new(&self.minialloc, stream_id, buffer_size); stream.set_len(0)?; return Ok(stream); } @@ -911,7 +1030,7 @@ impl CompoundFile { name, ObjType::Stream, )?; - Ok(Stream::new(&self.minialloc, new_stream_id)) + Ok(Stream::new(&self.minialloc, new_stream_id, buffer_size)) } /// Removes the stream object at the provided path.