diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 0e77850..3f1a5c9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -29,6 +29,14 @@ jobs: - name: Build project run: cargo build --verbose + # Check the formatting + - name: Formatting check + run: cargo fmt --check + + # Check for code mistakes an possible improvements + - name: Clippy check + run: cargo clippy -- -D warnings + # Run the tests # This command executes all tests in the project - name: Run tests diff --git a/Cargo.lock b/Cargo.lock index bd848a3..8a2b214 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -472,7 +472,7 @@ checksum = "3109e49b1e4909e9db6515a30c633684d68cdeaa252f215214cb4fa1a5bfee2c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", "synstructure", ] @@ -484,7 +484,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -623,7 +623,7 @@ checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -643,9 +643,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "axum" @@ -806,7 +806,7 @@ dependencies = [ "hyperlocal", "log", "pin-project-lite", - "rustls 0.23.27", + "rustls 0.23.28", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -900,9 +900,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.26" +version = "1.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "956a5e21988b87f372569b66183b78babf23ebc2e744b733e4350a752c4dafac" +checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" dependencies = [ "jobserver", "libc", @@ -959,7 +959,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim 0.11.1", + "strsim", ] [[package]] @@ -971,7 +971,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -1176,38 +1176,14 @@ dependencies = [ "memchr", ] -[[package]] -name = "darling" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" -dependencies = [ - "darling_core 0.14.4", - "darling_macro 0.14.4", -] - [[package]] name = "darling" version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ - "darling_core 0.20.11", - "darling_macro 0.20.11", -] - -[[package]] -name = "darling_core" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim 0.10.0", - "syn 1.0.109", + "darling_core", + "darling_macro", ] [[package]] @@ -1220,19 +1196,8 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim 0.11.1", - "syn 2.0.102", -] - -[[package]] -name = "darling_macro" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" -dependencies = [ - "darling_core 0.14.4", - "quote", - "syn 1.0.109", + "strsim", + "syn", ] [[package]] @@ -1241,9 +1206,9 @@ version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ - "darling_core 0.20.11", + "darling_core", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -1287,7 +1252,7 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -1302,33 +1267,33 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.12.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" -version = "0.12.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ - "darling 0.14.4", + "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] name = "derive_builder_macro" -version = "0.12.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 1.0.109", + "syn", ] [[package]] @@ -1353,23 +1318,23 @@ dependencies = [ [[package]] name = "dirs" -version = "5.0.1" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" dependencies = [ "dirs-sys", ] [[package]] name = "dirs-sys" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.48.0", + "windows-sys 0.60.2", ] [[package]] @@ -1380,7 +1345,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -1400,6 +1365,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "dyn-clone" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005" + [[package]] name = "either" version = "1.15.0" @@ -1682,7 +1653,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -1832,19 +1803,21 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hf-hub" -version = "0.3.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" +checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", + "http 1.3.1", "indicatif", + "libc", "log", - "native-tls", - "rand 0.8.5", + "rand 0.9.1", "serde", "serde_json", - "thiserror 1.0.69", + "thiserror 2.0.12", "ureq", + "windows-sys 0.60.2", ] [[package]] @@ -2015,7 +1988,7 @@ dependencies = [ "http 1.3.1", "hyper 1.6.0", "hyper-util", - "rustls 0.23.27", + "rustls 0.23.28", "rustls-pki-types", "tokio", "tokio-rustls 0.26.2", @@ -2515,7 +2488,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -2683,9 +2656,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] @@ -2819,9 +2792,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.172" +version = "0.2.174" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" [[package]] name = "libm" @@ -2837,7 +2810,7 @@ checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ "bitflags 2.9.1", "libc", - "redox_syscall 0.5.12", + "redox_syscall 0.5.13", ] [[package]] @@ -2888,11 +2861,11 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "lz4_flex" -version = "0.11.3" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +checksum = "2c592ad9fbc1b7838633b3ae55ce69b17d01150c72fcef229fbb819d39ee51ee" dependencies = [ - "twox-hash", + "twox-hash 2.1.1", ] [[package]] @@ -2992,7 +2965,7 @@ checksum = "c402a4092d5e204f32c9e155431046831fa712637043c58cb73bc6bc6c9663b5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -3019,7 +2992,7 @@ checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -3220,7 +3193,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -3308,7 +3281,7 @@ checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.12", + "redox_syscall 0.5.13", "smallvec", "windows-targets 0.52.6", ] @@ -3342,7 +3315,7 @@ dependencies = [ "simdutf8", "snap", "thrift", - "twox-hash", + "twox-hash 1.6.3", "zstd", ] @@ -3368,7 +3341,7 @@ dependencies = [ "regex", "regex-syntax 0.8.5", "structmeta", - "syn 2.0.102", + "syn", ] [[package]] @@ -3545,22 +3518,21 @@ dependencies = [ [[package]] name = "procfs" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +checksum = "cc5b72d8145275d844d4b5f6d4e1eef00c8cd889edb6035c21675d1bb1f45c9f" dependencies = [ "bitflags 2.9.1", "hex", - "lazy_static", "procfs-core", "rustix 0.38.44", ] [[package]] name = "procfs-core" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +checksum = "239df02d8349b06fc07398a3a1697b06418223b1c7725085e801e7c0fc6a12ec" dependencies = [ "bitflags 2.9.1", "hex", @@ -3568,9 +3540,9 @@ dependencies = [ [[package]] name = "prometheus" -version = "0.13.4" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +checksum = "3ca5326d8d0b950a9acd87e6a3f94745394f62e4dae1b1ee22b2bc0c394af43a" dependencies = [ "cfg-if", "fnv", @@ -3580,14 +3552,28 @@ dependencies = [ "parking_lot", "procfs", "protobuf", - "thiserror 1.0.69", + "thiserror 2.0.12", ] [[package]] name = "protobuf" -version = "2.28.0" +version = "3.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" +checksum = "d65a1d4ddae7d8b5de68153b48f6aa3bba8cb002b243dbdbc55a5afbc98f99f4" +dependencies = [ + "once_cell", + "protobuf-support", + "thiserror 1.0.69", +] + +[[package]] +name = "protobuf-support" +version = "3.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e36c2f31e0a47f9280fb347ef5e461ffcd2c52dd520d8e216b52f93b0b0d7d6" +dependencies = [ + "thiserror 1.0.69", +] [[package]] name = "quote" @@ -3600,9 +3586,9 @@ dependencies = [ [[package]] name = "r-efi" -version = "5.2.0" +version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] name = "rand" @@ -3731,22 +3717,42 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.12" +version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" +checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" dependencies = [ "bitflags 2.9.1", ] [[package]] name = "redox_users" -version = "0.4.6" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" dependencies = [ "getrandom 0.2.16", "libredox", - "thiserror 1.0.69", + "thiserror 2.0.12", +] + +[[package]] +name = "ref-cast" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -3949,9 +3955,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.27" +version = "0.23.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321" +checksum = "7160e3e10bf4535308537f3c4e1641468cd0e485175d6163087c0393c7d46643" dependencies = [ "log", "once_cell", @@ -3969,7 +3975,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70cc376c6ba1823ae229bacf8ad93c136d93524eab0e4e5e0e4f96b9c4e5b212" dependencies = [ "log", - "rustls 0.23.27", + "rustls 0.23.28", "rustls-native-certs 0.7.3", "rustls-pki-types", "rustls-webpki 0.103.3", @@ -4087,6 +4093,18 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -4179,7 +4197,7 @@ checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4212,7 +4230,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4229,15 +4247,16 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa" +checksum = "bf65a400f8f66fb7b0552869ad70157166676db75ed8181f8104ea91cf9d0b42" dependencies = [ "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", "indexmap 2.9.0", + "schemars", "serde", "serde_derive", "serde_json", @@ -4247,14 +4266,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" +checksum = "81679d9ed988d5e9a5e6531dc3f2c28efbd639cbd1dfb628df08edea6004da77" dependencies = [ - "darling 0.20.11", + "darling", "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4337,12 +4356,9 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "slab" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] +checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" [[package]] name = "smallvec" @@ -4376,6 +4392,17 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "socks" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + [[package]] name = "spin" version = "0.9.8" @@ -4419,12 +4446,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - [[package]] name = "strsim" version = "0.11.1" @@ -4440,7 +4461,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.102", + "syn", ] [[package]] @@ -4451,7 +4472,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4462,20 +4483,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.102" +version = "2.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6397daf94fa90f058bd0fd88429dd9e5738999cca8d701813c80723add80462" +checksum = "e4307e30089d6fd6aff212f2da3a1f9e32f3223b1f010fb09b7c95f90f3ca1e8" dependencies = [ "proc-macro2", "quote", @@ -4502,7 +4512,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4607,7 +4617,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4618,17 +4628,16 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] name = "thread_local" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" dependencies = [ "cfg-if", - "once_cell", ] [[package]] @@ -4704,18 +4713,17 @@ dependencies = [ [[package]] name = "tokenizers" -version = "0.15.2" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d" +checksum = "3169b3195f925496c895caee7978a335d49218488ef22375267fba5a46a40bd7" dependencies = [ "aho-corasick", - "clap", "derive_builder", "esaxx-rs", "getrandom 0.2.16", "hf-hub", "indicatif", - "itertools 0.12.1", + "itertools 0.13.0", "lazy_static", "log", "macro_rules_attribute", @@ -4730,7 +4738,7 @@ dependencies = [ "serde", "serde_json", "spm_precompiled", - "thiserror 1.0.69", + "thiserror 2.0.12", "unicode-normalization-alignments", "unicode-segmentation", "unicode_categories", @@ -4773,7 +4781,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4806,7 +4814,7 @@ version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ - "rustls 0.23.27", + "rustls 0.23.28", "tokio", ] @@ -4903,13 +4911,13 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1ffbcf9c6f6b99d386e7444eb608ba646ae452a36b39737deb9663b610f662" +checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -4967,6 +4975,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "twox-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56" + [[package]] name = "typenum" version = "1.18.0" @@ -5027,12 +5041,12 @@ dependencies = [ "base64 0.22.1", "flate2", "log", - "native-tls", "once_cell", - "rustls 0.23.27", + "rustls 0.23.28", "rustls-pki-types", "serde", "serde_json", + "socks", "url", "webpki-roots 0.26.11", ] @@ -5137,7 +5151,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.102", + "syn", "wasm-bindgen-shared", ] @@ -5172,7 +5186,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5293,7 +5307,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -5304,14 +5318,14 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] name = "windows-link" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3bfe459f85da17560875b8bf1423d6f113b7a87a5d942e7da0ac71be7c61f8b" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" [[package]] name = "windows-result" @@ -5358,6 +5372,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -5382,13 +5405,29 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -5401,6 +5440,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -5413,6 +5458,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -5425,12 +5476,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -5443,6 +5506,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -5455,6 +5524,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -5467,6 +5542,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -5479,6 +5560,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winreg" version = "0.50.0" @@ -5589,7 +5676,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", "synstructure", ] @@ -5601,28 +5688,28 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.25" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.25" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -5642,7 +5729,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", "synstructure", ] @@ -5704,7 +5791,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] @@ -5715,7 +5802,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.102", + "syn", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6a610d5..bd431e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ clap = { version = "4.5.37", features = ["derive"] } lapin = { version = "2.5.3", features = ["native-tls"] } indicatif = { version = "0.17", features = ["tokio"] } axum = "0.7" -prometheus = { version = "0.13", features = ["process"] } +prometheus = { version = "0.14", features = ["process"] } lazy_static = "1.4" # Or use once_cell which is already present tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -39,7 +39,7 @@ chrono = "0.4.41" serde_yaml = "0.9.34" whatlang = "0.16.4" anyhow = "1.0.98" -tokenizers = { version = "0.15.2", features = ["http"] } # Specify version and add http feature +tokenizers = { version = "0.21", features = ["http"] } # Specify version and add http feature # {{ Define the library (implicitly done by having src/lib.rs) }} # [lib] diff --git a/config/pipeline_config.yaml b/config/pipeline_config.yaml index afe3e69..8fe9cc1 100644 --- a/config/pipeline_config.yaml +++ b/config/pipeline_config.yaml @@ -78,7 +78,7 @@ pipeline: short_line_length: 30 char_duplicates_ratio: 0.01 new_line_ratio: 0.3 - language: "da" + # language: "da" - type: TokenCounter tokenizer_name: "gpt2" \ No newline at end of file diff --git a/src/bin/producer.rs b/src/bin/producer.rs index 6e115ca..4fc0f45 100644 --- a/src/bin/producer.rs +++ b/src/bin/producer.rs @@ -16,8 +16,9 @@ use TextBlaster::data_model::{ProcessingOutcome, TextDocument}; // Import both T use TextBlaster::error::{PipelineError, Result}; // Use the library's Result type use TextBlaster::pipeline::readers::ParquetReader; use TextBlaster::pipeline::writers::parquet_writer::ParquetWriter; // For retry delay -use TextBlaster::utils::utils::{connect_rabbitmq, setup_prometheus_metrics}; // Updated for shared functions - // axum and TcpListener imports removed as they are now in utils::utils +use TextBlaster::utils::common::connect_rabbitmq; // Updated for shared functions +use TextBlaster::utils::prometheus_metrics::setup_prometheus_metrics; + use chrono::Utc; // For consumer tag in aggregate_results use TextBlaster::utils::prometheus_metrics::*; // Import shared metrics diff --git a/src/bin/worker.rs b/src/bin/worker.rs index 4303a5e..b4e7bfe 100644 --- a/src/bin/worker.rs +++ b/src/bin/worker.rs @@ -32,14 +32,14 @@ use TextBlaster::pipeline::filters::{ LanguageDetectionFilter, }; use TextBlaster::pipeline::token::TokenCounter; +use TextBlaster::utils::common::connect_rabbitmq; // Updated for shared functions use TextBlaster::utils::prometheus_metrics::*; -use TextBlaster::utils::utils::{connect_rabbitmq, setup_prometheus_metrics}; // Using shared setup_prometheus_metrics // Import shared metrics use std::path::PathBuf; use std::sync::Arc; // To share the executor across potential concurrent tasks // {{ Add serde_json for result serialization }} use tracing::{debug, error, info, info_span, instrument, warn}; // Added tracing -use tracing_appender; +use tracing_appender::{non_blocking, rolling}; use tracing_subscriber::{fmt, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer}; // Added tracing_subscriber // Added for file logging // Define command-line arguments @@ -152,7 +152,7 @@ fn build_pipeline_from_config(config: &PipelineConfig) -> Result Result<()> { let env_filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); // Default to info if RUST_LOG is not set // Setup file logging - let file_appender = tracing_appender::rolling::daily("./log", "worker.log"); + let file_appender = rolling::daily("./log", "worker.log"); - let (non_blocking_file_writer, _guard) = tracing_appender::non_blocking(file_appender); + let (non_blocking_file_writer, _guard) = non_blocking(file_appender); // Configure the console layer let console_layer = fmt::layer() diff --git a/src/config.rs b/src/config.rs index 5ea5448..f3fc89f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -127,7 +127,7 @@ pub struct FineWebQualityFilterParams { pub short_line_length: usize, // serde will handle u64 -> usize if value fits pub char_duplicates_ratio: f64, pub new_line_ratio: f64, - pub language: String, + // pub language: String, } // Parameters for the TokenCounter diff --git a/src/error.rs b/src/error.rs index 2004fd8..07ee3ff 100644 --- a/src/error.rs +++ b/src/error.rs @@ -32,7 +32,7 @@ pub enum PipelineError { // {{MODIFIED: Changed 'doc_id: String' to 'document: TextDocument' and updated message}} #[error("Document '{document_id}' filtered out: {reason}", document_id = document.id)] DocumentFiltered { - document: TextDocument, + document: Box, reason: String, }, diff --git a/src/lib.rs b/src/lib.rs index 2b27d88..4a54396 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,6 @@ +#![allow(non_snake_case)] +#![allow(clippy::too_many_arguments)] + // Declare the modules that form the library's public API (or internal structure) // Using `pub mod` makes them accessible from the binaries using `use rust_data::module_name;` pub mod config; diff --git a/src/pipeline/filters/c4_filters.rs b/src/pipeline/filters/c4_filters.rs index a1955f0..e1ee629 100644 --- a/src/pipeline/filters/c4_filters.rs +++ b/src/pipeline/filters/c4_filters.rs @@ -176,7 +176,7 @@ impl ProcessingStep for C4QualityFilter { .metadata .insert("c4_filter_reasons".to_string(), reasons_string.clone()); return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: reasons_string, }); } @@ -262,7 +262,7 @@ impl ProcessingStep for C4QualityFilter { .metadata .insert("c4_filter_reasons".to_string(), reasons_string.clone()); Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: reasons_string, }) } else { @@ -469,7 +469,10 @@ impl ProcessingStep for C4BadWordsFilter { document .metadata .insert("c4_badwords_filter_reason".to_string(), reason.clone()); - return Err(PipelineError::DocumentFiltered { document, reason }); + return Err(PipelineError::DocumentFiltered { + document: Box::new(document), + reason, + }); } }; @@ -511,7 +514,10 @@ impl ProcessingStep for C4BadWordsFilter { document .metadata .insert("c4_badwords_filter_reason".to_string(), reason.clone()); - Err(PipelineError::DocumentFiltered { document, reason }) + Err(PipelineError::DocumentFiltered { + document: Box::new(document), + reason, + }) } } else { // No bad words found diff --git a/src/pipeline/filters/fineweb_quality.rs b/src/pipeline/filters/fineweb_quality.rs index acf3958..65edaf6 100644 --- a/src/pipeline/filters/fineweb_quality.rs +++ b/src/pipeline/filters/fineweb_quality.rs @@ -36,7 +36,7 @@ pub struct FineWebQualityFilter { short_line_length: usize, char_duplicates_ratio: f64, new_line_ratio: f64, - language: String, // Stored, but might not be actively used by utils::text::split_into_words + // language: String, // Stored, but might not be actively used by utils::text::split_into_words } impl FineWebQualityFilter { @@ -47,10 +47,10 @@ impl FineWebQualityFilter { short_line_length: usize, char_duplicates_ratio: f64, new_line_ratio: f64, - language: String, + // language: String, stop_chars: Option>, ) -> Self { - let sc = stop_chars.unwrap_or_else(|| default_stop_chars().iter().map(|&s| s).collect()); + let sc = stop_chars.unwrap_or_else(|| default_stop_chars().iter().copied().collect()); FineWebQualityFilter { line_punct_thr, line_punct_exclude_zero, @@ -59,7 +59,7 @@ impl FineWebQualityFilter { short_line_length, char_duplicates_ratio, new_line_ratio, - language, + // language, } } } @@ -79,7 +79,7 @@ impl ProcessingStep for FineWebQualityFilter { if lines.is_empty() { return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: "empty".to_string(), }); } @@ -102,7 +102,7 @@ impl ProcessingStep for FineWebQualityFilter { && !(line_punct_actual_ratio == 0.0 && self.line_punct_exclude_zero) { return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: format!( "line_punct_ratio: {:.4} < threshold {:.4} (exclude_zero: {})", line_punct_actual_ratio, self.line_punct_thr, self.line_punct_exclude_zero @@ -118,7 +118,7 @@ impl ProcessingStep for FineWebQualityFilter { let short_line_actual_ratio = short_lines_count as f64 / lines.len() as f64; if short_line_actual_ratio > self.short_line_thr { return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: format!( "short_line_ratio: {:.4} > threshold {:.4}", short_line_actual_ratio, self.short_line_thr @@ -136,7 +136,7 @@ impl ProcessingStep for FineWebQualityFilter { }; if char_dup_actual_ratio > self.char_duplicates_ratio { return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: format!( "char_dup_ratio: {:.4} > threshold {:.4}", char_dup_actual_ratio, self.char_duplicates_ratio @@ -151,7 +151,7 @@ impl ProcessingStep for FineWebQualityFilter { if words.is_empty() { if new_line_count > 0 { return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: "list_ratio_no_words (newlines present but no words)".to_string(), }); } @@ -159,7 +159,7 @@ impl ProcessingStep for FineWebQualityFilter { let list_actual_ratio = new_line_count as f64 / words.len() as f64; if list_actual_ratio > self.new_line_ratio { return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: format!( "list_ratio: {:.4} > threshold {:.4}", list_actual_ratio, self.new_line_ratio @@ -183,7 +183,7 @@ mod tests { const DEFAULT_SHORT_LINE_THR: f64 = 0.67; const DEFAULT_SHORT_LINE_LENGTH: usize = 30; const DEFAULT_NEW_LINE_RATIO: f64 = 0.3; - const DEFAULT_LANGUAGE: &str = "english"; // Currently not used by split_into_words if it's new_auto() + // const DEFAULT_LANGUAGE: &str = "english"; // Currently not used by split_into_words if it's new_auto() // Helper to create a default filter instance for tests fn default_filter() -> FineWebQualityFilter { @@ -195,7 +195,7 @@ mod tests { short_line_length: DEFAULT_SHORT_LINE_LENGTH, char_duplicates_ratio: 0.95, // Temporarily very high to isolate other test failures new_line_ratio: DEFAULT_NEW_LINE_RATIO, - language: DEFAULT_LANGUAGE.to_string(), + // language: DEFAULT_LANGUAGE.to_string(), } } diff --git a/src/pipeline/filters/gopher_quality.rs b/src/pipeline/filters/gopher_quality.rs index b7d095d..ad5c27d 100644 --- a/src/pipeline/filters/gopher_quality.rs +++ b/src/pipeline/filters/gopher_quality.rs @@ -306,7 +306,7 @@ impl ProcessingStep for GopherQualityFilter { reasons_string.clone(), ); Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: reasons_string, }) } else { diff --git a/src/pipeline/filters/gopher_rep.rs b/src/pipeline/filters/gopher_rep.rs index 2f42940..a8a6589 100644 --- a/src/pipeline/filters/gopher_rep.rs +++ b/src/pipeline/filters/gopher_rep.rs @@ -238,7 +238,7 @@ impl ProcessingStep for GopherRepetitionFilter { ); return Err(PipelineError::DocumentFiltered { - document, + document: Box::new(document), reason: reasons_string, }); } diff --git a/src/pipeline/filters/language_filter.rs b/src/pipeline/filters/language_filter.rs index 620a0c2..f69d366 100644 --- a/src/pipeline/filters/language_filter.rs +++ b/src/pipeline/filters/language_filter.rs @@ -46,13 +46,19 @@ impl ProcessingStep for LanguageDetectionFilter { "Document is not any of the following languages: {}", self.allowed_languages.join("; ") ); - Err(PipelineError::DocumentFiltered { document, reason }) + Err(PipelineError::DocumentFiltered { + document: Box::new(document), + reason, + }) } else if confidence < self.min_confidence { let reason = format!( "Language detection confidence is not satified: {} < {}", confidence, self.min_confidence ); - Err(PipelineError::DocumentFiltered { document, reason }) + Err(PipelineError::DocumentFiltered { + document: Box::new(document), + reason, + }) } else { Ok(document) } diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index e8f5fe2..b364380 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -2,8 +2,8 @@ pub mod filters; pub mod readers; -pub mod writers; -pub mod token; // Added new module +pub mod token; +pub mod writers; // Added new module // Potentially other pipeline related items, traits, etc. // e.g., pub trait PipelineStep {} diff --git a/src/pipeline/writers/parquet_writer.rs b/src/pipeline/writers/parquet_writer.rs index 5e4ab42..0b8487d 100644 --- a/src/pipeline/writers/parquet_writer.rs +++ b/src/pipeline/writers/parquet_writer.rs @@ -10,7 +10,7 @@ use std::sync::Arc; // For serializing metadata // Assuming your error module and Result type are defined like this // Adjust the import path if necessary use crate::data_model::TextDocument; -use crate::error::Result; +use crate::error::{PipelineError, Result}; // Define the schema for TextDocument fn create_schema() -> SchemaRef { @@ -59,7 +59,9 @@ impl ParquetWriter { if self.writer.is_none() { // Handle error: writer was already closed or failed to initialize // You might want a more specific error type here - return Err(std::io::Error::new(std::io::ErrorKind::Other, "Writer is closed").into()); + return Err(PipelineError::Unexpected( + "The parquet writer was already closed".to_string(), + )); } // 1. Convert TextDocuments to Arrow Arrays diff --git a/src/utils/common.rs b/src/utils/common.rs new file mode 100644 index 0000000..dd910d4 --- /dev/null +++ b/src/utils/common.rs @@ -0,0 +1,35 @@ +// src/utils/utils.rs + +use lapin::{Connection, ConnectionProperties, Result as LapinResult}; +use std::time::Duration; +use tokio::time::sleep; +use tracing::{error, info}; + +// Helper function to connect to RabbitMQ with retry (already here) +pub async fn connect_rabbitmq(addr: &str) -> LapinResult { + let options = ConnectionProperties::default() + .with_executor(tokio_executor_trait::Tokio::current()) + .with_reactor(tokio_reactor_trait::Tokio); + + let mut attempts = 0; + loop { + match Connection::connect(addr, options.clone()).await { + Ok(conn) => { + info!("Successfully connected to RabbitMQ at {}", addr); + return Ok(conn); + } + Err(e) => { + attempts += 1; + error!( + attempt = attempts, + error = %e, + "Failed to connect to RabbitMQ. Retrying in 5 seconds..." + ); + if attempts >= 5 { + return Err(e); + } + sleep(Duration::from_secs(5)).await; + } + } + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 89723d9..c227a9e 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,8 +1,8 @@ // Utils +pub mod common; pub mod prometheus_metrics; -pub mod text; -pub mod utils; // Added to declare the utils.rs module // Added to declare the prometheus_metrics.rs module +pub mod text; // Added to declare the common.rs module pub use text::{ find_all_duplicate, find_duplicates, find_top_duplicate, get_n_grams, split_into_sentences, diff --git a/src/utils/prometheus_metrics.rs b/src/utils/prometheus_metrics.rs index ee0c2b9..f5a4bb9 100644 --- a/src/utils/prometheus_metrics.rs +++ b/src/utils/prometheus_metrics.rs @@ -1,7 +1,16 @@ // src/utils/prometheus_metrics.rs +use tracing::{error, info}; + +use crate::error::Result; +use axum::{http::StatusCode, routing::get, serve, Router}; +use tokio::net::TcpListener; // Assuming Result is crate::error::Result for PipelineError + use once_cell::sync::Lazy; -use prometheus::{register_counter, register_gauge, register_histogram, Counter, Gauge, Histogram}; +use prometheus::{ + gather, register_counter, register_gauge, register_histogram, Counter, Encoder, Gauge, + Histogram, TextEncoder, +}; // Metrics from Producer pub static TASKS_PUBLISHED_TOTAL: Lazy = Lazy::new(|| { @@ -132,3 +141,61 @@ pub static ACTIVE_PROCESSING_TASKS: Lazy = Lazy::new(|| { ) .expect("Failed to register worker_active_processing_tasks gauge") }); + +// Axum handler for /metrics +// This function needs to be async as it's used as an Axum handler. +// It's kept non-public as it's an internal detail of setup_prometheus_metrics. +async fn metrics_handler() -> (StatusCode, String) { + let encoder = TextEncoder::new(); + let mut buffer = vec![]; + if let Err(e) = encoder.encode(&gather(), &mut buffer) { + error!("Could not encode prometheus metrics: {}", e); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Could not encode prometheus metrics: {}", e), + ); + } + match String::from_utf8(buffer) { + Ok(s) => (StatusCode::OK, s), + Err(e) => { + error!("Prometheus metrics UTF-8 error: {}", e); + ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Prometheus metrics UTF-8 error: {}", e), + ) + } + } +} + +// Function to setup Prometheus metrics endpoint +// This function is now public to be used by binaries. +pub async fn setup_prometheus_metrics(metrics_port: Option) -> Result<()> { + if let Some(port) = metrics_port { + let app = Router::new().route("/metrics", get(metrics_handler)); + let listener_addr = format!("0.0.0.0:{}", port); + info!( + "Metrics endpoint will be available at http://{}/metrics", + listener_addr + ); + + tokio::spawn(async move { + match TcpListener::bind(&listener_addr).await { + Ok(listener) => { + if let Err(e) = serve(listener, app).await { + error!("Metrics server error: {}", e); + } + } + Err(e) => { + error!("Failed to bind metrics server to {}: {}", listener_addr, e); + // This error is within a spawned task, so we can't directly bubble it up + // with `?` from setup_prometheus_metrics. Logging is appropriate. + // Consider if this should return an error that prevents server startup. + } + } + }); + Ok(()) + } else { + info!("Prometheus metrics endpoint not configured (no port specified)."); + Ok(()) + } +} diff --git a/src/utils/text.rs b/src/utils/text.rs index 6307e23..802ded4 100644 --- a/src/utils/text.rs +++ b/src/utils/text.rs @@ -110,9 +110,10 @@ pub fn split_into_words(text: &str) -> Vec<&str> { // segment_str() returns an iterator over break points (usize byte offsets). // A segment is the text between two consecutive break points. - let mut breaks_iter = segmenter.segment_str(text).peekable(); + let breaks_iter = segmenter.segment_str(text).peekable(); - while let Some(current_break) = breaks_iter.next() { + for current_break in breaks_iter { + // while let Some(current_break) = breaks_iter.next() { if current_break > prev_break { let segment_candidate = &text[prev_break..current_break]; diff --git a/src/utils/utils.rs b/src/utils/utils.rs deleted file mode 100644 index ef2e74d..0000000 --- a/src/utils/utils.rs +++ /dev/null @@ -1,98 +0,0 @@ -// src/utils/utils.rs - -use lapin::{Connection, ConnectionProperties, Result as LapinResult}; -use std::time::Duration; -use tokio::time::sleep; -use tracing::{error, info}; - -use crate::error::Result; -use axum::{http::StatusCode, routing::get, serve, Router}; -use prometheus::{gather, Encoder, TextEncoder}; -use tokio::net::TcpListener; // Assuming Result is crate::error::Result for PipelineError - -// Helper function to connect to RabbitMQ with retry (already here) -pub async fn connect_rabbitmq(addr: &str) -> LapinResult { - let options = ConnectionProperties::default() - .with_executor(tokio_executor_trait::Tokio::current()) - .with_reactor(tokio_reactor_trait::Tokio); - - let mut attempts = 0; - loop { - match Connection::connect(addr, options.clone()).await { - Ok(conn) => { - info!("Successfully connected to RabbitMQ at {}", addr); - return Ok(conn); - } - Err(e) => { - attempts += 1; - error!( - attempt = attempts, - error = %e, - "Failed to connect to RabbitMQ. Retrying in 5 seconds..." - ); - if attempts >= 5 { - return Err(e); - } - sleep(Duration::from_secs(5)).await; - } - } - } -} - -// Axum handler for /metrics -// This function needs to be async as it's used as an Axum handler. -// It's kept non-public as it's an internal detail of setup_prometheus_metrics. -async fn metrics_handler() -> (StatusCode, String) { - let encoder = TextEncoder::new(); - let mut buffer = vec![]; - if let Err(e) = encoder.encode(&gather(), &mut buffer) { - error!("Could not encode prometheus metrics: {}", e); - return ( - StatusCode::INTERNAL_SERVER_ERROR, - format!("Could not encode prometheus metrics: {}", e), - ); - } - match String::from_utf8(buffer) { - Ok(s) => (StatusCode::OK, s), - Err(e) => { - error!("Prometheus metrics UTF-8 error: {}", e); - ( - StatusCode::INTERNAL_SERVER_ERROR, - format!("Prometheus metrics UTF-8 error: {}", e), - ) - } - } -} - -// Function to setup Prometheus metrics endpoint -// This function is now public to be used by binaries. -pub async fn setup_prometheus_metrics(metrics_port: Option) -> Result<()> { - if let Some(port) = metrics_port { - let app = Router::new().route("/metrics", get(metrics_handler)); - let listener_addr = format!("0.0.0.0:{}", port); - info!( - "Metrics endpoint will be available at http://{}/metrics", - listener_addr - ); - - tokio::spawn(async move { - match TcpListener::bind(&listener_addr).await { - Ok(listener) => { - if let Err(e) = serve(listener, app).await { - error!("Metrics server error: {}", e); - } - } - Err(e) => { - error!("Failed to bind metrics server to {}: {}", listener_addr, e); - // This error is within a spawned task, so we can't directly bubble it up - // with `?` from setup_prometheus_metrics. Logging is appropriate. - // Consider if this should return an error that prevents server startup. - } - } - }); - Ok(()) - } else { - info!("Prometheus metrics endpoint not configured (no port specified)."); - Ok(()) - } -}