7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -39,6 +39,7 @@ rustc-hash = { version = "1.1.0", default-features = false }
memchr = "2.4"
base64 = "0.22"
rmp-serde = "0.15"
+arrayvec = "0.7"
cssparser = { version = "0.34", optional = true }
selectors = { version = "0.26", optional = true }
precomputed-hash = "0.1"
6 changes: 4 additions & 2 deletions src/filters/fb_network_builder.rs
@@ -7,6 +7,7 @@ use flatbuffers::WIPOffset;
use crate::filters::fb_builder::EngineFlatBuilder;
use crate::filters::network::{FilterTokens, NetworkFilter};
use crate::filters::token_selector::TokenSelector;
+use crate::utils::TokensBuffer;

use crate::filters::network::NetworkFilterMaskHelper;
use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder;
@@ -134,6 +135,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
let mut optimizable = HashMap::<ShortHash, Vec<NetworkFilter>>::new();

let mut token_frequencies = TokenSelector::new(rule_list.filters.len());
+let mut tokens_buffer = TokensBuffer::default();

{
for network_filter in rule_list.filters {
@@ -157,7 +159,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
}
};

-let multi_tokens = network_filter.get_tokens_optimized();
+let multi_tokens = network_filter.get_tokens_optimized(&mut tokens_buffer);
match multi_tokens {
FilterTokens::Empty => {
// No tokens, add to fallback bucket (token 0)
@@ -171,7 +173,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
}
}
FilterTokens::Other(tokens) => {
-let best_token = token_frequencies.select_least_used_token(&tokens);
+let best_token = token_frequencies.select_least_used_token(tokens);
token_frequencies.record_usage(best_token);
store_filter(best_token);
}
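Note on the builder loop above: the scratch buffer is created once and reused for every filter, which is safe because get_tokens_optimized clears it on entry. A minimal sketch of the reuse pattern (names simplified; not the exact builder code):

let mut tokens_buffer = TokensBuffer::default();
for network_filter in filters {
    // The returned FilterTokens borrows tokens_buffer, so it must be
    // consumed before the next iteration reuses the buffer.
    match network_filter.get_tokens_optimized(&mut tokens_buffer) {
        FilterTokens::Other(tokens) => { /* bucket by least-used token */ }
        FilterTokens::OptDomains(domains) => { /* one bucket per domain hash */ }
        FilterTokens::Empty => { /* fallback bucket (token 0) */ }
    }
}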
50 changes: 25 additions & 25 deletions src/filters/network.rs
@@ -15,9 +15,7 @@ use crate::filters::abstract_network::{
use crate::lists::ParseOptions;
use crate::regex_manager::RegexManager;
use crate::request;
-use crate::utils::{self, Hash};
-
-pub(crate) const TOKENS_BUFFER_SIZE: usize = 200;
+use crate::utils::{self, Hash, TokensBuffer};

/// For now, only support `$removeparam` with simple alphanumeric/dash/underscore patterns.
static VALID_PARAM: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_\-]+$").unwrap());
@@ -312,10 +310,10 @@ pub enum FilterPart {
}

#[derive(Debug, PartialEq)]
-pub enum FilterTokens {
+pub enum FilterTokens<'a> {
[Collaborator] this is now also a breaking change 🥲 — see the migration sketch after this hunk.

Empty,
-OptDomains(Vec<Hash>),
-Other(Vec<Hash>),
+OptDomains(&'a [Hash]),
+Other(&'a [Hash]),
}

pub struct FilterPartIterator<'a> {
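To illustrate the breaking change flagged above: both variants now borrow from a caller-supplied buffer instead of owning a Vec<Hash>, so downstream code that kept the tokens alive must copy them out. A before/after sketch for a hypothetical caller:

// Before (0.11.x): the result owned its data.
// let tokens = filter.get_tokens_optimized();

// After: the caller owns the scratch buffer; the result borrows it.
let mut buffer = TokensBuffer::default();
let owned: Vec<Hash> = match filter.get_tokens_optimized(&mut buffer) {
    FilterTokens::Other(tokens) => tokens.to_vec(), // copy out to outlive `buffer`
    FilterTokens::OptDomains(domains) => domains.to_vec(),
    FilterTokens::Empty => Vec::new(),
};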
@@ -885,17 +883,21 @@ impl NetworkFilter {

#[deprecated(since = "0.11.1", note = "use get_tokens_optimized instead")]
pub fn get_tokens(&self) -> Vec<Vec<Hash>> {
-match self.get_tokens_optimized() {
+let mut tokens_buffer = TokensBuffer::default();
+match self.get_tokens_optimized(&mut tokens_buffer) {
FilterTokens::OptDomains(domains) => {
-domains.into_iter().map(|domain| vec![domain]).collect()
+domains.iter().map(|domain| vec![*domain]).collect()
}
-FilterTokens::Other(tokens) => vec![tokens],
+FilterTokens::Other(tokens) => vec![tokens.to_vec()],
FilterTokens::Empty => vec![],
}
}

-pub fn get_tokens_optimized(&self) -> FilterTokens {
-let mut tokens: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
+pub fn get_tokens_optimized<'a>(
+&'a self,
+tokens_buffer: &'a mut TokensBuffer,
+) -> FilterTokens<'a> {
+tokens_buffer.clear();

// If there is only one domain and no domain negation, we also use this
// domain as a token.
@@ -905,7 +907,7 @@ impl NetworkFilter {
{
if let Some(domains) = self.opt_domains.as_ref() {
if let Some(domain) = domains.first() {
-tokens.push(*domain)
+tokens_buffer.push(*domain);
}
}
}
@@ -918,7 +920,7 @@ impl NetworkFilter {
(self.is_plain() || self.is_regex()) && !self.is_right_anchor();
let skip_first_token = self.is_right_anchor();

-utils::tokenize_filter_to(f, skip_first_token, skip_last_token, &mut tokens);
+utils::tokenize_filter_to(f, skip_first_token, skip_last_token, tokens_buffer);
}
}
FilterPart::AnyOf(_) => (), // across AnyOf set of filters no single token is guaranteed to match to a request
@@ -928,45 +930,43 @@ impl NetworkFilter {
// Append tokens from hostname, if any
if !self.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) {
if let Some(hostname) = self.hostname.as_ref() {
-utils::tokenize_to(hostname, &mut tokens);
+utils::tokenize_to(hostname, tokens_buffer);
}
} else if let Some(hostname) = self.hostname.as_ref() {
// Find last dot to tokenize the prefix
let last_dot_pos = hostname.rfind('.');
if let Some(last_dot_pos) = last_dot_pos {
-utils::tokenize_to(&hostname[..last_dot_pos], &mut tokens);
+utils::tokenize_to(&hostname[..last_dot_pos], tokens_buffer);
}
}

-if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
+if tokens_buffer.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
if let Some(removeparam) = &self.modifier_option {
if VALID_PARAM.is_match(removeparam) {
-utils::tokenize_to(&removeparam.to_ascii_lowercase(), &mut tokens);
+utils::tokenize_to(&removeparam.to_ascii_lowercase(), tokens_buffer);
}
}
}

// If we got no tokens for the filter/hostname part, then we will dispatch
// this filter in multiple buckets based on the domains option.
-if tokens.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() {
+if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none()
+{
if let Some(opt_domains) = self.opt_domains.as_ref() {
if !opt_domains.is_empty() {
-return FilterTokens::OptDomains(opt_domains.clone());
+return FilterTokens::OptDomains(opt_domains);
}
}
FilterTokens::Empty
} else {
// Add optional token for protocol
if self.for_http() && !self.for_https() {
-tokens.push(utils::fast_hash("http"));
+tokens_buffer.push(utils::fast_hash("http"));
} else if self.for_https() && !self.for_http() {
-tokens.push(utils::fast_hash("https"));
+tokens_buffer.push(utils::fast_hash("https"));
}

-// Remake a vector to drop extra capacity.
-let mut t = Vec::with_capacity(tokens.len());
-t.extend(tokens);
-FilterTokens::Other(t)
+FilterTokens::Other(tokens_buffer.as_slice())
}
}
}
4 changes: 2 additions & 2 deletions src/request.rs
@@ -239,11 +239,11 @@ impl Request {
}

fn calculate_tokens(url_lower_cased: &str) -> Vec<utils::Hash> {
-let mut tokens = vec![];
+let mut tokens = utils::TokensBuffer::default();
utils::tokenize_pooled(url_lower_cased, &mut tokens);
// Add zero token as a fallback to wildcard rule bucket
tokens.push(0);
-tokens
+tokens.into_iter().collect()
}

#[cfg(test)]
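For context: the request path now tokenizes into the stack-allocated buffer and pays a single heap allocation only when converting to Vec at the end, since ArrayVec implements IntoIterator. A sketch of the same pattern (crate-internal, since tokenize_pooled is pub(crate)):

let mut tokens = utils::TokensBuffer::default(); // 256 slots, no heap allocation
utils::tokenize_pooled("https://example.com/ad.js", &mut tokens);
tokens.push(0); // always fits: the tokenizer keeps one slot free (see utils.rs below)
let heap_tokens: Vec<utils::Hash> = tokens.into_iter().collect(); // one allocation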
26 changes: 13 additions & 13 deletions src/utils.rs
@@ -6,6 +6,8 @@ use seahash::hash;
#[cfg(target_pointer_width = "32")]
use seahash::reference::hash;

+pub use arrayvec::ArrayVec;

pub type Hash = u64;

// A smaller version of Hash that is used in serialized format.
@@ -27,25 +29,23 @@ fn is_allowed_filter(ch: char) -> bool {
ch.is_alphanumeric() || ch == '%'
}

-pub(crate) const TOKENS_BUFFER_SIZE: usize = 128;
-pub(crate) const TOKENS_BUFFER_RESERVED: usize = 1;
-const TOKENS_MAX: usize = TOKENS_BUFFER_SIZE - TOKENS_BUFFER_RESERVED;
+pub type TokensBuffer = ArrayVec<Hash, 256>;
[Collaborator] doc comment would be useful here — one possible wording is sketched below.
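A possible doc comment per the suggestion above (wording illustrative, not part of the diff):

/// Fixed-capacity, stack-allocated scratch buffer for tokenization.
///
/// Backed by `ArrayVec`, so pushing never reallocates; the tokenizer
/// stops early instead of overflowing, keeping one slot free for the
/// zero "wildcard bucket" token appended in `Request::calculate_tokens`.
pub type TokensBuffer = ArrayVec<Hash, 256>;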


fn fast_tokenizer_no_regex(
pattern: &str,
is_allowed_code: &dyn Fn(char) -> bool,
skip_first_token: bool,
skip_last_token: bool,
-tokens_buffer: &mut Vec<Hash>,
+tokens_buffer: &mut TokensBuffer,
) {
// let mut tokens_buffer_index = 0;
let mut inside: bool = false;
let mut start = 0;
let mut preceding_ch: Option<char> = None; // Used to check if a '*' is not just before a token

for (i, c) in pattern.char_indices() {
-if tokens_buffer.len() >= TOKENS_MAX {
-return;
+if tokens_buffer.capacity() - tokens_buffer.len() <= 1 {
+return; // reserve one free slot for the zero token
}
if is_allowed_code(c) {
if !inside {
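The rewritten guard is equivalent to the old TOKENS_MAX arithmetic but reads directly off the remaining capacity: tokenization stops while one slot is still free, so the unchecked push(0) in Request::calculate_tokens can never hit a full ArrayVec (which would panic). A hypothetical crate-internal test of that invariant, not part of the diff:

#[test]
fn zero_token_slot_is_reserved() {
    // A pattern with far more tokens than the buffer's 256 slots.
    let long_pattern = "abc ".repeat(300);
    let mut buffer = TokensBuffer::default();
    tokenize_pooled(&long_pattern, &mut buffer);
    assert!(buffer.len() < buffer.capacity()); // at least one slot left
    buffer.push(0); // must not panic: the reserved slot is free
}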
@@ -75,17 +75,17 @@ fn fast_tokenizer_no_regex(
}
}

-pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
+pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut TokensBuffer) {
fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
}

pub fn tokenize(pattern: &str) -> Vec<Hash> {
-let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
+let mut tokens_buffer = TokensBuffer::default();
tokenize_to(pattern, &mut tokens_buffer);
-tokens_buffer
+tokens_buffer.into_iter().collect()
}

-pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
+pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut TokensBuffer) {
fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
}

@@ -95,21 +95,21 @@ pub(crate) fn tokenize_filter(
skip_first_token: bool,
skip_last_token: bool,
) -> Vec<Hash> {
-let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
+let mut tokens_buffer = TokensBuffer::default();
tokenize_filter_to(
pattern,
skip_first_token,
skip_last_token,
&mut tokens_buffer,
);
-tokens_buffer
+tokens_buffer.into_iter().collect()
}

pub(crate) fn tokenize_filter_to(
pattern: &str,
skip_first_token: bool,
skip_last_token: bool,
-tokens_buffer: &mut Vec<Hash>,
+tokens_buffer: &mut TokensBuffer,
) {
fast_tokenizer_no_regex(
pattern,
2 changes: 1 addition & 1 deletion tests/matching.rs
@@ -199,5 +199,5 @@ fn check_rule_matching_browserlike() {
let (blocked, passes) = bench_rule_matching_browserlike(&engine, &requests);
let msg = "The number of blocked/passed requests has changed. ".to_string()
+ "If this is expected, update the expected values in the test.";
-assert_eq!((blocked, passes), (106860, 136085), "{msg}");
+assert_eq!((blocked, passes), (106861, 136084), "{msg}");
}
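The one-request shift in expected counts is presumably a side effect of the larger token buffer (255 usable slots instead of 127), which lets very long URLs or filters contribute tokens that previously fell off the end — the diff itself doesn't state the cause.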
8 changes: 3 additions & 5 deletions tests/unit/filters/network.rs
@@ -1191,12 +1191,10 @@ mod parse_tests {
fn test_simple_pattern_tokenization() {
let rule = "||some.primewire.c*/sw$script,1p";
let filter = NetworkFilter::parse(rule, true, ParseOptions::default()).unwrap();
+let mut tokens_buffer = utils::TokensBuffer::default();
assert_eq!(
-filter.get_tokens_optimized(),
-FilterTokens::Other(vec![
-utils::fast_hash("some"),
-utils::fast_hash("primewire")
-])
+filter.get_tokens_optimized(&mut tokens_buffer),
+FilterTokens::Other(&[utils::fast_hash("some"), utils::fast_hash("primewire")])
);
}
}