diff --git a/.github/workflows/release-python.yaml b/.github/workflows/release-python.yaml index a8d11a7..159de53 100644 --- a/.github/workflows/release-python.yaml +++ b/.github/workflows/release-python.yaml @@ -16,14 +16,14 @@ jobs: manylinux: auto command: build # container default is manylinux - args: --release -o dist --manifest-path crates/stringmetrics_py/Cargo.toml + args: --release -o dist --manifest-path stringmetrics-py/Cargo.toml - name: Build musl wheels uses: messense/maturin-action@v1 with: target: x86_64-unknown-linux-musl manylinux: musllinux_1_1 command: build - args: --release -o dist -i 3.7 3.8 3.9 3.10 --manifest-path crates/stringmetrics_py/Cargo.toml + args: --release -o dist -i 3.7 3.8 3.9 3.10 --manifest-path stringmetrics-py/Cargo.toml - name: Upload wheels uses: actions/upload-artifact@v2 with: @@ -37,7 +37,7 @@ jobs: - uses: messense/maturin-action@v1 with: command: build - args: --release --no-sdist -o dist --manifest-path crates/stringmetrics_py/Cargo.toml + args: --release --no-sdist -o dist --manifest-path stringmetrics-py/Cargo.toml - name: Upload wheels uses: actions/upload-artifact@v2 with: @@ -51,7 +51,7 @@ jobs: - uses: messense/maturin-action@v1 with: command: build - args: --release --no-sdist -o dist --universal2 --manifest-path crates/stringmetrics_py/Cargo.toml + args: --release --no-sdist -o dist --universal2 --manifest-path stringmetrics-py/Cargo.toml - name: Upload wheels uses: actions/upload-artifact@v2 with: diff --git a/stringmetrics/src/algorithms.rs b/stringmetrics/src/algorithms.rs index 8854a57..2ce3602 100644 --- a/stringmetrics/src/algorithms.rs +++ b/stringmetrics/src/algorithms.rs @@ -17,16 +17,21 @@ //! assert_eq!(levenshtein(a, b), 6); //! ``` +mod damerau_impl; mod hamming_impl; -// mod damerau; mod jaccard_impl; mod lev_impl; +mod osa_impl; +pub use self::damerau_impl::DamerauWeights; pub use self::hamming_impl::{hamming, hamming_iter}; -// pub use self::damerau::damerau_levenshtein; pub use self::jaccard_impl::{jaccard, jaccard_set}; pub use self::lev_impl::{ levenshtein, levenshtein_limit, levenshtein_limit_iter, levenshtein_weight, levenshtein_weight_iter, try_levenshtein, try_levenshtein_iter, try_levenshtein_weight, try_levenshtein_weight_iter, LevWeights, }; +pub use self::osa_impl::{ + osa_distance, osa_limit, osa_limit_iter, osa_weight, osa_weight_iter, try_osa, try_osa_iter, + try_osa_weight, try_osa_weight_iter, +}; diff --git a/stringmetrics/src/algorithms/damerau_impl.rs b/stringmetrics/src/algorithms/damerau_impl.rs index 392d0b3..d40782c 100644 --- a/stringmetrics/src/algorithms/damerau_impl.rs +++ b/stringmetrics/src/algorithms/damerau_impl.rs @@ -1,4 +1,43 @@ -// Using the "optimal string alignment distance" from wikipedia -pub fn damerau_levenshtein(_a: &str, _b: &str) -> u32 { - 0 +use crate::algorithms::lev_impl::WeightsSwap; +use std::mem; + +/// A struct that holds +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct DamerauWeights { + pub insertion: u32, + pub deletion: u32, + pub substitution: u32, + pub transposition: u32, +} + +impl DamerauWeights { + /// Create a new `DamerauWeights` object + #[inline] + pub const fn new(w_ins: u32, w_del: u32, w_sub: u32, w_tspn: u32) -> Self { + Self { + insertion: w_ins, + deletion: w_del, + substitution: w_sub, + transposition: w_tspn, + } + } +} + +impl WeightsSwap for DamerauWeights { + // Swap insertion and deletion terms + #[inline] + fn swap(&mut self) { + mem::swap(&mut self.insertion, &mut self.deletion); + } +} + +impl Default for DamerauWeights { + fn default() -> Self { + Self { + insertion: 1, + deletion: 1, + substitution: 1, + transposition: 1, + } + } } diff --git a/stringmetrics/src/algorithms/lev_impl.rs b/stringmetrics/src/algorithms/lev_impl.rs index c799c25..5792abe 100644 --- a/stringmetrics/src/algorithms/lev_impl.rs +++ b/stringmetrics/src/algorithms/lev_impl.rs @@ -18,7 +18,7 @@ pub use structures::*; /// better to use [`levenshtein_limit`] to avoid unnecessary computation. /// /// Behind the scenes, this wraps [`levenshtein_limit_iter`]. For details on -/// operation, see the [algorithms](crate::algorithms) page. +/// operation, see the [module-level documentation](crate). /// /// # Example /// @@ -34,7 +34,7 @@ pub use structures::*; /// if you need that functionality, please use [`levenshtein_weight`]. #[inline] pub fn levenshtein(a: &str, b: &str) -> u32 { - levenshtein_limit_iter(a.bytes(), b.bytes(), u32::MAX) + try_levenshtein_iter(a.bytes(), b.bytes(), u32::MAX).unwrap_or(u32::MAX) } /// Levenshtein distance computation with a limit @@ -56,7 +56,7 @@ pub fn levenshtein(a: &str, b: &str) -> u32 { /// ``` #[inline] pub fn levenshtein_limit(a: &str, b: &str, limit: u32) -> u32 { - levenshtein_limit_iter(a.bytes(), b.bytes(), limit) + try_levenshtein_iter(a.bytes(), b.bytes(), limit).unwrap_or(limit) } /// The same alrogithm as [`levenshtein_limit`] but return an `Option` to diff --git a/stringmetrics/src/algorithms/lev_impl/implementation.rs b/stringmetrics/src/algorithms/lev_impl/implementation.rs index 9936a72..faf6ea8 100644 --- a/stringmetrics/src/algorithms/lev_impl/implementation.rs +++ b/stringmetrics/src/algorithms/lev_impl/implementation.rs @@ -3,7 +3,7 @@ use super::{LevState, LevWeights}; use std::cmp::min; -/// The same algorithm as [`levenshtein_limit_iter`] but return an `Option` to +/// The same algorithm as [`levenshtein_limit_iter`](crate::levenshtein_limit_iter) but return an `Option` to /// indicate if the limit is exceeded /// /// Returns `Some(u32)` if a distance is found, `None` if a limit is hit @@ -23,13 +23,14 @@ where D: DoubleEndedIterator + Clone, T: PartialEq, { - // Identical implementation to levenshtein_weight_iter, just avoiding + // Identical implementation to levenshtein_weight_iter, just saving some ops + // from the weight calculations let state = LevState::new(a.into_iter(), b.into_iter()); let LevState { a_iter, b_iter, - a_diff_len: a_len, - b_diff_len: b_len, + a_len, + b_len, } = state; // Only check b_len because if a_len is 0, the loop won't happen @@ -132,8 +133,8 @@ where let LevState { a_iter, b_iter, - a_diff_len: a_len, - b_diff_len: b_len, + a_len, + b_len, } = state; let LevWeights { insertion: w_ins, diff --git a/stringmetrics/src/algorithms/lev_impl/structures.rs b/stringmetrics/src/algorithms/lev_impl/structures.rs index bcfa79d..be947fe 100644 --- a/stringmetrics/src/algorithms/lev_impl/structures.rs +++ b/stringmetrics/src/algorithms/lev_impl/structures.rs @@ -2,6 +2,11 @@ use crate::iter::find_eq_end_items; use std::iter::Skip; use std::mem; +/// +pub trait WeightsSwap { + fn swap(&mut self); +} + /// A struct that holds the costs of insertion, deletion, and substitution. Used /// for levenshthein algorithms that require weight specifications. #[derive(Debug, PartialEq, Eq, Clone)] @@ -12,6 +17,7 @@ pub struct LevWeights { } impl LevWeights { + /// Create a new `LevWeights` object #[inline] pub const fn new(w_ins: u32, w_del: u32, w_sub: u32) -> Self { Self { @@ -20,10 +26,11 @@ impl LevWeights { substitution: w_sub, } } - +} +impl WeightsSwap for LevWeights { // Swap insertion and deletion terms #[inline] - pub fn swap(&mut self) { + fn swap(&mut self) { mem::swap(&mut self.insertion, &mut self.deletion); } } @@ -35,12 +42,14 @@ impl Default for LevWeights { } } +/// Representation of a string for lev parsing after stipping start & end #[derive(Debug)] pub struct LevState { pub a_iter: Skip, pub b_iter: Skip, - pub a_diff_len: u32, - pub b_diff_len: u32, + /// Lengths after trimming + pub a_len: u32, + pub b_len: u32, } impl + Clone, T: PartialEq> LevState { @@ -50,8 +59,8 @@ impl + Clone, T: PartialEq> LevState { Self { a_iter: a_iter.skip(skip), b_iter: b_iter.skip(skip), - a_diff_len: iter_info.a_diff_len(), - b_diff_len: iter_info.b_diff_len(), + a_len: iter_info.a_diff_len(), + b_len: iter_info.b_diff_len(), } } @@ -67,7 +76,7 @@ impl + Clone, T: PartialEq> LevState { /// Create a new structure and swap weights if needed #[inline] - pub fn new_weights(a_iter: D, b_iter: D, weights: &mut LevWeights) -> Self { + pub fn new_weights(a_iter: D, b_iter: D, weights: &mut W) -> Self { let mut ret = Self::new_inner(a_iter, b_iter); if ret.should_swap() { ret.swap_inner(); @@ -79,12 +88,12 @@ impl + Clone, T: PartialEq> LevState { /// We want the longer string in B so it's in the inner loop #[inline] pub const fn should_swap(&self) -> bool { - self.a_diff_len > self.b_diff_len + self.a_len > self.b_len } #[inline] pub fn swap_inner(&mut self) { mem::swap(&mut self.a_iter, &mut self.b_iter); - mem::swap(&mut self.a_diff_len, &mut self.b_diff_len); + mem::swap(&mut self.a_len, &mut self.b_len); } } diff --git a/stringmetrics/src/algorithms/lev_impl/tests.rs b/stringmetrics/src/algorithms/lev_impl/tests.rs index e996c2a..8675983 100644 --- a/stringmetrics/src/algorithms/lev_impl/tests.rs +++ b/stringmetrics/src/algorithms/lev_impl/tests.rs @@ -17,8 +17,8 @@ fn test_levstate_new() { let a = "aaxxxxxc"; let b = "aaabbbccc"; let state = LevState::new(a.bytes(), b.bytes()); - assert_eq!(state.a_diff_len, 5); - assert_eq!(state.b_diff_len, 6); + assert_eq!(state.a_len, 5); + assert_eq!(state.b_len, 6); } #[test] @@ -35,6 +35,7 @@ fn test_levenshtein_empty() { #[test] fn test_levenshtein_basic() { + assert_eq!(levenshtein("ab", "ba"), 2); assert_eq!(levenshtein("abcd", "ab"), 2); assert_eq!(levenshtein("ab", "abcd"), 2); assert_eq!(levenshtein("abcd", "ad"), 2); diff --git a/stringmetrics/src/algorithms/osa_impl.rs b/stringmetrics/src/algorithms/osa_impl.rs new file mode 100644 index 0000000..38ff5a4 --- /dev/null +++ b/stringmetrics/src/algorithms/osa_impl.rs @@ -0,0 +1,53 @@ +mod implementation; +pub use implementation::*; + +use crate::DamerauWeights; + +#[inline] +pub fn osa_distance(a: &str, b: &str) -> u32 { + // try_osa_iter(a.bytes(), b.bytes(), u32::MAX).unwrap_or(u32::MAX) + try_osa_weight_iter(a.bytes(), b.bytes(), u32::MAX, &DamerauWeights::default()) + .unwrap_or(u32::MAX) +} + +#[inline] +pub fn osa_limit(a: &str, b: &str, limit: u32) -> u32 { + try_osa_iter(a.bytes(), b.bytes(), limit).unwrap_or(limit) +} + +#[inline] +pub fn osa_limit_iter(a: I, b: I, limit: u32) -> u32 +where + I: IntoIterator, + D: DoubleEndedIterator + Clone, + T: PartialEq + Clone, +{ + try_osa_iter(a, b, limit).unwrap_or(limit) +} + +#[inline] +pub fn try_osa(a: &str, b: &str, limit: u32) -> Option { + try_osa_iter(a.bytes(), b.bytes(), limit) +} + +#[inline] +pub fn osa_weight(a: &str, b: &str, limit: u32, weights: &DamerauWeights) -> u32 { + try_osa_weight_iter(a.bytes(), b.bytes(), limit, weights).unwrap_or(limit) +} +#[inline] +pub fn try_osa_weight(a: &str, b: &str, limit: u32, weights: &DamerauWeights) -> Option { + try_osa_weight_iter(a.bytes(), b.bytes(), limit, weights) +} + +#[inline] +pub fn osa_weight_iter(a: I, b: I, limit: u32, weights: &DamerauWeights) -> u32 +where + I: IntoIterator, + D: DoubleEndedIterator + Clone, + T: PartialEq + Clone, +{ + try_osa_weight_iter(a, b, limit, weights).unwrap_or(limit) +} + +#[cfg(test)] +mod tests; diff --git a/stringmetrics/src/algorithms/osa_impl/implementation.rs b/stringmetrics/src/algorithms/osa_impl/implementation.rs new file mode 100644 index 0000000..112f060 --- /dev/null +++ b/stringmetrics/src/algorithms/osa_impl/implementation.rs @@ -0,0 +1,226 @@ +use crate::algorithms::lev_impl::LevState; +use crate::DamerauWeights; +use std::cmp::min; +use std::mem; + +#[inline] +pub fn try_osa_iter(a: I, b: I, limit: u32) -> Option +where + I: IntoIterator, + D: DoubleEndedIterator + Clone, + T: PartialEq + Clone, +{ + let state = LevState::new(a.into_iter(), b.into_iter()); + let LevState { + a_iter, + b_iter, + a_len, + b_len, + } = state; + + // Only check b_len because if a_len is 0, the loop won't happen + if b_len == 0 { + return Some(min(a_len, limit)); + } + + if b_len - a_len > limit { + return None; + } + if b_len - a_len >= limit { + return Some(limit); + } + + let mut last_cache: Vec = (1..=b_len).collect(); + let mut cache: Vec = vec![0; b_len as usize]; + let mut tmp_res = b_len; + let mut last_a: Option = None; + let mut last_b: Option = None; + + for (i, a_item) in a_iter.enumerate().take_while(|&(i, _)| i < a_len as usize) { + // Our "horizontal" iterations always start with the leftmost column, + // which is the insertion cost (or substitution above) + // temp_res is also our insertion cost base + let mut sub_base = i as u32; + tmp_res = sub_base + 1; + + // eprintln!("{i} {last_cache:?}"); + // eprint!("{tmp_res} "); + + // Go through and do our calculations. we need to preserve the "up left" + // (sub_base) and "left" (tmp_res) values, the rest can be overwritten + for (j, b_item) in b_iter + .clone() + .enumerate() + .take_while(|&(j, _)| j < b_len as usize) + { + let del_base = last_cache[j]; + + // Insertion costs and deletion costs are their bases + 1 + // i.e., the value to the left or above plus 1 + // Substitution cost is equal to the up-left (sub_base) cost if equal, + // otherwise up-left value + 1. + if a_item == b_item { + tmp_res = min(min(tmp_res, del_base) + 1, sub_base); + } else { + tmp_res = min(min(tmp_res, del_base), sub_base) + 1; + } + + // SAFETY: if we have gone through the loop once, these have values + if i > 0 + && j > 0 + && unsafe { a_item == last_b.clone().unwrap_unchecked() } + && unsafe { b_item == last_a.clone().unwrap_unchecked() } + { + // Evaluate transpose cost + tmp_res = min(tmp_res, last_cache[j - 1]); + } + + // As we shift to the right, our deletion square becomes our + // substitution square + sub_base = del_base; + + // Save our insertion cost for the next iteration + cache[j] = tmp_res; + + last_b = Some(b_item); + } + // eprintln!("{:?}\n", cache); + + if tmp_res > limit { + return None; + } + + last_a = Some(a_item); + mem::swap(&mut last_cache, &mut cache); + } + + Some(tmp_res) +} + +#[inline] +pub fn try_osa_weight_iter(a: I, b: I, limit: u32, weights: &DamerauWeights) -> Option +where + I: IntoIterator, + D: DoubleEndedIterator + Clone, + T: PartialEq + Clone, +{ + let mut weights = weights.clone(); + let state = LevState::new_weights(a.into_iter(), b.into_iter(), &mut weights); + let LevState { + a_iter, + b_iter, + a_len, + b_len, + } = state; + let DamerauWeights { + insertion: w_ins, + deletion: w_del, + substitution: w_sub, + transposition: w_tspn, + } = weights; + + // Only check b_len because if a_len is 0, the loop won't happen + if b_len == 0 { + return Some(min(a_len * w_del, limit)); + } + + if b_len - a_len > limit { + return None; + } + + if b_len - a_len >= limit { + return Some(limit); + } + + let equal_weights = w_ins == w_del && w_del == w_sub && w_sub == w_tspn; + + let mut last_cache: Vec = (w_ins..=(b_len * w_ins)).step_by(w_ins as usize).collect(); + dbg!(&last_cache); + let mut cache: Vec = vec![0; b_len as usize]; + let mut tmp_res = b_len * w_ins; + let mut tspn_base = [0u32; 2]; // This stores the leftmost moving column + let mut sub_base: u32; + let mut last_a: Option = None; + let mut last_b: Option = None; + + for (i, a_item) in a_iter.enumerate().take_while(|&(i, _)| i < a_len as usize) { + // Our "horizontal" iterations always start with the leftmost column, + // which is the insertion cost (or substitution above) + // temp_res is also our insertion cost base + sub_base = i as u32 * w_del; + tmp_res = sub_base + w_del; + tspn_base.swap(0, 1); + tspn_base[1] = sub_base; + + // dbg!(&(tspn_base,sub_base,tmp_res)); + + // Go through and do our calculations. we need to preserve the "up left" + // (sub_base) and "left" (tmp_res) values, the rest can be overwritten + for (j, b_item) in b_iter + .clone() + .enumerate() + .take_while(|&(j, _)| j < b_len as usize) + { + // dbg!((i, j, a_item==b_item)); + let del_base = last_cache[j]; + + // Insertion costs and deletion costs are their bases + 1 + // i.e., the value to the left or above plus 1 + // Substitution cost is equal to the up-left (sub_base) cost if equal, + // otherwise up-left value + 1. + if equal_weights { + if a_item == b_item { + tmp_res = min(min(tmp_res, del_base) + w_ins, sub_base); + } else { + tmp_res = min(min(tmp_res, del_base), sub_base) + w_ins; + } + } else if a_item == b_item { + tmp_res = min(min(tmp_res + w_ins, del_base + w_del), sub_base); + } else { + tmp_res = min(min(tmp_res + w_ins, del_base + w_del), sub_base + w_sub); + } + + // SAFETY: if we have gone through the loop once, these have values + if i > 0 + && j > 0 + && unsafe { a_item == last_b.clone().unwrap_unchecked() } + && unsafe { b_item == last_a.clone().unwrap_unchecked() } + { + // dbg!("match"); + let tspn_cost = dbg!( + w_tspn + + if j == 1 { + tspn_base[0] + } else { + last_cache[j - 1] + } + ); + // Evaluate transpose cost + tmp_res = min(tmp_res, tspn_cost); + } + + // As we shift to the right, our deletion square becomes our + // substitution square + sub_base = del_base; + + // Save our insertion cost for the next iteration + // tspn_base.swap(0, 1); + // tspn_base[1] = + cache[j] = tmp_res; + + last_b = Some(b_item); + } + + if tmp_res > limit.saturating_add(w_ins) { + return None; + } + + last_a = Some(a_item); + mem::swap(&mut last_cache, &mut cache); + } + + if tmp_res > limit { + return None; + } + Some(tmp_res) +} diff --git a/stringmetrics/src/algorithms/osa_impl/tests.rs b/stringmetrics/src/algorithms/osa_impl/tests.rs new file mode 100644 index 0000000..98e6df3 --- /dev/null +++ b/stringmetrics/src/algorithms/osa_impl/tests.rs @@ -0,0 +1,135 @@ +use super::*; + +// #[test] +// fn test_levweights_swap() { +// let mut w = LevWeights::new(10, 20, 30); +// w.swap(); +// let expected = LevWeights { +// insertion: 20, +// deletion: 10, +// substitution: 30, +// }; +// assert_eq!(w, expected); +// } + +#[test] +fn test_osa_equal() { + assert_eq!(osa_distance("abcdef", "abcdef"), 0); +} + +#[test] +fn test_osa_empty() { + assert_eq!(osa_distance("", ""), 0); + assert_eq!(osa_distance("abcdef", ""), 6); + assert_eq!(osa_distance("", "abcdef"), 6); +} + +#[test] +fn test_osa_basic() { + assert_eq!(osa_distance("abcd", "ab"), 2); + assert_eq!(osa_distance("ab", "abcd"), 2); + assert_eq!(osa_distance("abcd", "ad"), 2); + assert_eq!(osa_distance("abcd", "cd"), 2); + assert_eq!(osa_distance("abcd", "a"), 3); + assert_eq!(osa_distance("abcd", "c"), 3); + assert_eq!(osa_distance("abcd", "accd"), 1); + assert_eq!(osa_distance("kitten", "sitting"), 3); + assert_eq!(osa_distance("sitting", "kitten"), 3); + assert_eq!(osa_distance("not", "to a"), 3); + assert_eq!(osa_distance("to be a bee", "not to bee"), 6); +} + +#[test] +fn test_osa_trick_skips() { + // Try to trick the part that skips forward and backward + assert_eq!(osa_distance("abcd", "abcd"), 0); + assert_eq!(osa_distance("abcd", "ad"), 2); + assert_eq!(osa_distance("abcd", "cd"), 2); + assert_eq!(osa_distance("abcd", "a"), 3); + assert_eq!(osa_distance("abcd", "b"), 3); + assert_eq!(osa_distance("abcd", "c"), 3); + assert_eq!(osa_distance("abcd", "d"), 3); + assert_eq!(osa_distance("a", "abcd"), 3); + assert_eq!(osa_distance("d", "abcd"), 3); + assert_eq!(osa_distance("notate", "to ate"), 2); + assert_eq!(osa_distance("to ate", "notate"), 2); + assert_eq!(osa_distance("to be a", "not to"), 6); + assert_eq!(osa_distance("not to", "to be a"), 6); + assert_eq!(osa_distance("abccc", "accc"), 1); +} + +#[test] +fn test_osa_limit_one_empty() { + assert_eq!(osa_limit("abcdef", "", 3), 3); + assert_eq!(osa_limit("", "abcdef", 3), 3); + assert_eq!(osa_limit("abcdef", "", 8), 6); + assert_eq!(osa_limit("", "abcdef", 8), 6); +} + +#[test] +fn test_osa_limit() { + // Most of this is tested via damerau() + // just need to validate limits + assert_eq!(osa_limit("abcdef", "000000", 3), 3); + assert_eq!(osa_limit("ab", "0000", 3), 3); +} + +#[test] +fn test_osa_transpose() { + // Target the transpose cost + // assert_eq!(osa_distance("ab", "ba"), 1); + assert_eq!(osa_distance("ab", "bac"), 2); + assert_eq!(osa_distance("xcb", "abc"), 2); + assert_eq!(osa_distance("sitting", "sittign"), 1); + assert_eq!(osa_distance("sitting", "istting"), 1); + assert_eq!(osa_distance("sitting", "isttign"), 2); + assert_eq!(osa_distance("siting", "isteign"), 4); + assert_eq!(osa_distance("kitten", "kitetn"), 1); + // damerau will be different here + assert_eq!(osa_distance("abc", "ca"), 3); +} + +#[test] +fn test_osa_weights() { + assert_eq!( + osa_weight("ab", "ba", 50, &DamerauWeights::new(100, 100, 100, 5)), + 5 + ); +} + +// #[test] +// fn test_osa_weight_insertion() { +// let weights = LevWeights::new(10, 1, 1); +// assert_eq!(osa_weight("", "a", 100, &weights), 10); +// assert_eq!(osa_weight("a", "", 100, &weights), 1); +// assert_eq!(osa_weight("", "ab", 100, &weights), 20); +// assert_eq!(osa_weight("ab", "", 100, &weights), 2); +// assert_eq!(osa_weight("ab", "abcd", 100, &weights), 20); +// assert_eq!(osa_weight("kitten", "sitting", 100, &weights), 12); +// } + +// #[test] +// fn test_osa_weight_deletion() { +// let weights = LevWeights::new(1, 10, 1); +// assert_eq!(osa_weight("", "a", 100, &weights), 1); +// assert_eq!(osa_weight("a", "", 100, &weights), 10); +// assert_eq!(osa_weight("", "ab", 100, &weights), 2); +// assert_eq!(osa_weight("ab", "", 100, &weights), 20); +// assert_eq!(osa_weight("kitten", "sitting", 100, &weights), 3); + +// let weights = LevWeights::new(1, 10, 2); +// assert_eq!(osa_weight("abc", "ac", 100, &weights), 10); +// assert_eq!(osa_weight("abcd", "ac", 100, &weights), 20); +// } + +// #[test] +// fn test_osa_weight_substitution() { +// // Note that when substitution cost is high, the algorithm will prefer +// // a deletion and insertion +// let weights = LevWeights::new(10, 10, 5); +// assert_eq!(osa_weight("a", "b", 100, &weights), 5); +// let weights = LevWeights::new(10, 10, 2); +// assert_eq!(osa_weight("abcd", "acc", 100, &weights), 12); +// let weights = LevWeights::new(4, 3, 2); +// assert_eq!(osa_weight("kitten", "sitting", 100, &weights), 8); +// } diff --git a/stringmetrics/src/lib.rs b/stringmetrics/src/lib.rs index 85bed2e..0114682 100644 --- a/stringmetrics/src/lib.rs +++ b/stringmetrics/src/lib.rs @@ -4,12 +4,9 @@ //! algorithms to determine the similarity of two strings or sets. It currently //! includes a variety of implementations of [Levenshtein //! distance](https://en.wikipedia.org/wiki/Levenshtein_distance), [Hamming -//! distance](https://en.wikipedia.org/wiki/Hamming_distance), and [Jaccard -//! Similarity](https://en.wikipedia.org/wiki/Jaccard_index), with more string -//! metrics expected to be added in the future. It also includes helpful -//! tokenizers for things like splitting sentences into words. -//! -//! # Example +//! distance](https://en.wikipedia.org/wiki/Hamming_distance), OSA distance, and +//! [Jaccard Similarity](https://en.wikipedia.org/wiki/Jaccard_index), with more +//! metrics expected to be added in the future. //! //! ``` //! use stringmetrics::levenshtein; @@ -17,10 +14,12 @@ //! assert_eq!(levenshtein("kitten", "sitting"), 3); //! ``` //! -//! # Algorithm Descriptions +//! # String / Sequence Comparison Algorithms +//! +//! Hamming distance, levenshtein distance and optimal string alignment (OSA) +//! fall into the category of "string comparison algorithms", which give a rough +//! estimate of how similar two sequences (usually strings) are. //! -//! This section seeks to give an overview of the different algorithms contained -//! in this module. See individual functions for usage guidelines. //! //! ## Hamming Distance Algorithm //! @@ -47,9 +46,9 @@ // _(erm... I can't seem to get KaTeX working. Let me know on GitHub if you can // help!)_ //! -//! The funcition [levenshtein][crate::algorithms::levenshtein] implements the -//! following algorithm. Basically, the tool parses from top left to bottom -//! right to create a table like follows, for the classic example: +//! The funcition [levenshtein][crate::levenshtein] implements the following +//! algorithm. Basically, the tool parses from top left to bottom right to +//! create a table like follows, for the classic example: //! //! ```text //! j → 0 1 2 3 4 5 6 7 @@ -85,6 +84,10 @@ //! but adapted to use a single vector. Main memory usage is only that of a //! `Vec` in the same length as the shortest string. //! +//! There is also a generic function, named +//! [`levenshtein_limit_iter`](crate::levenshtein_limit_iter) and similar, that +//! can be used for any iterator. +//! //! Please note: this library eventually aims to replace the current algorithm //! with one that is more performant across varying lengths of strings. The //! interface will not change. @@ -147,9 +150,12 @@ //! since the cost would be much higher (4+3=7 when the substitution cost is //! only 2).) //! -//! ### Note on string comparisons //! -//! All string-based levenshtein algorithms use bytes rather than characters by +//! +//! +//! ## Note on string comparisons +//! +//! All string-based comparison algorithms use bytes rather than characters by //! default. This speeds things up significantly, and usually the difference is //! unimportant. However, if you are working with CJK character sets or emojis, //! you may prefer the somewhat more accurate (but slower) `chars()` usage. This @@ -173,6 +179,8 @@ //! crate](https://docs.rs/unicode-segmentation/latest/unicode_segmentation/) //! can be used to split on the iterable `graphemes(true)`. //! +//! # Set / Token Comparison Algorithms +//! //! ## Jaccard Similarity //! //! Jaccard similarity or the Jaccard Index of two sets is the number of items