6 changes: 4 additions & 2 deletions src/filters/fb_network_builder.rs
@@ -7,6 +7,7 @@ use flatbuffers::WIPOffset;
use crate::filters::fb_builder::EngineFlatBuilder;
use crate::filters::network::{FilterTokens, NetworkFilter};
use crate::filters::token_selector::TokenSelector;
use crate::utils::TokensBuffer;

use crate::filters::network::NetworkFilterMaskHelper;
use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder;
@@ -134,6 +135,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
let mut optimizable = HashMap::<ShortHash, Vec<NetworkFilter>>::new();

let mut token_frequencies = TokenSelector::new(rule_list.filters.len());
let mut tokens_buffer = TokensBuffer::default();

{
for network_filter in rule_list.filters {
@@ -157,7 +159,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
}
};

let multi_tokens = network_filter.get_tokens_optimized();
let multi_tokens = network_filter.get_tokens_optimized(&mut tokens_buffer);
match multi_tokens {
FilterTokens::Empty => {
// No tokens, add to fallback bucket (token 0)
@@ -171,7 +173,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
}
}
FilterTokens::Other(tokens) => {
let best_token = token_frequencies.select_least_used_token(&tokens);
let best_token = token_frequencies.select_least_used_token(tokens);
token_frequencies.record_usage(best_token);
store_filter(best_token);
}
50 changes: 25 additions & 25 deletions src/filters/network.rs
@@ -15,9 +15,7 @@ use crate::filters::abstract_network::{
use crate::lists::ParseOptions;
use crate::regex_manager::RegexManager;
use crate::request;
use crate::utils::{self, Hash};

pub(crate) const TOKENS_BUFFER_SIZE: usize = 200;
use crate::utils::{self, Hash, TokensBuffer};

/// For now, only support `$removeparam` with simple alphanumeric/dash/underscore patterns.
static VALID_PARAM: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_\-]+$").unwrap());
@@ -312,10 +310,10 @@ pub enum FilterPart {
}

#[derive(Debug, PartialEq)]
pub enum FilterTokens {
pub enum FilterTokens<'a> {
Empty,
OptDomains(Vec<Hash>),
Other(Vec<Hash>),
OptDomains(&'a [Hash]),
Other(&'a [Hash]),
}

pub struct FilterPartIterator<'a> {
@@ -885,17 +883,21 @@ impl NetworkFilter {

#[deprecated(since = "0.11.1", note = "use get_tokens_optimized instead")]
pub fn get_tokens(&self) -> Vec<Vec<Hash>> {
match self.get_tokens_optimized() {
let mut tokens_buffer = TokensBuffer::default();
match self.get_tokens_optimized(&mut tokens_buffer) {
FilterTokens::OptDomains(domains) => {
domains.into_iter().map(|domain| vec![domain]).collect()
domains.iter().map(|domain| vec![*domain]).collect()
}
FilterTokens::Other(tokens) => vec![tokens],
FilterTokens::Other(tokens) => vec![tokens.to_vec()],
FilterTokens::Empty => vec![],
}
}

pub fn get_tokens_optimized(&self) -> FilterTokens {
let mut tokens: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
pub fn get_tokens_optimized<'a>(
&'a self,
tokens_buffer: &'a mut TokensBuffer,
) -> FilterTokens<'a> {
tokens_buffer.clear();

// If there is only one domain and no domain negation, we also use this
// domain as a token.
@@ -905,7 +907,7 @@ impl NetworkFilter {
{
if let Some(domains) = self.opt_domains.as_ref() {
if let Some(domain) = domains.first() {
tokens.push(*domain)
tokens_buffer.push(*domain);
}
}
}
@@ -918,7 +920,7 @@ impl NetworkFilter {
(self.is_plain() || self.is_regex()) && !self.is_right_anchor();
let skip_first_token = self.is_right_anchor();

utils::tokenize_filter_to(f, skip_first_token, skip_last_token, &mut tokens);
utils::tokenize_filter_to(f, skip_first_token, skip_last_token, tokens_buffer);
}
}
FilterPart::AnyOf(_) => (), // across AnyOf set of filters no single token is guaranteed to match to a request
@@ -928,45 +930,43 @@ impl NetworkFilter {
// Append tokens from hostname, if any
if !self.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) {
if let Some(hostname) = self.hostname.as_ref() {
utils::tokenize_to(hostname, &mut tokens);
utils::tokenize_to(hostname, tokens_buffer);
}
} else if let Some(hostname) = self.hostname.as_ref() {
// Find last dot to tokenize the prefix
let last_dot_pos = hostname.rfind('.');
if let Some(last_dot_pos) = last_dot_pos {
utils::tokenize_to(&hostname[..last_dot_pos], &mut tokens);
utils::tokenize_to(&hostname[..last_dot_pos], tokens_buffer);
}
}

if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
if tokens_buffer.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
if let Some(removeparam) = &self.modifier_option {
if VALID_PARAM.is_match(removeparam) {
utils::tokenize_to(&removeparam.to_ascii_lowercase(), &mut tokens);
utils::tokenize_to(&removeparam.to_ascii_lowercase(), tokens_buffer);
}
}
}

// If we got no tokens for the filter/hostname part, then we will dispatch
// this filter in multiple buckets based on the domains option.
if tokens.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() {
if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none()
{
if let Some(opt_domains) = self.opt_domains.as_ref() {
if !opt_domains.is_empty() {
return FilterTokens::OptDomains(opt_domains.clone());
return FilterTokens::OptDomains(opt_domains);
}
}
FilterTokens::Empty
} else {
// Add optional token for protocol
if self.for_http() && !self.for_https() {
tokens.push(utils::fast_hash("http"));
tokens_buffer.push(utils::fast_hash("http"));
} else if self.for_https() && !self.for_http() {
tokens.push(utils::fast_hash("https"));
tokens_buffer.push(utils::fast_hash("https"));
}

// Remake a vector to drop extra capacity.
let mut t = Vec::with_capacity(tokens.len());
t.extend(tokens);
FilterTokens::Other(t)
FilterTokens::Other(tokens_buffer.as_slice())
}
}
}
54 changes: 54 additions & 0 deletions src/flatbuffers/unsafe_tools.rs
@@ -97,3 +97,57 @@ impl VerifiedFlatbufferMemory {
&self.raw_data[self.start..]
}
}

/// A stack-allocated vector that uses [T; MAX_SIZE] with Default initialization.
/// All elements are initialized to T::default(), and we track the logical size separately.
/// Note: a future impl can switch to using MaybeUninit with unsafe code for better efficiency.
pub struct StackVector<T, const MAX_SIZE: usize> {
data: [T; MAX_SIZE],
size: usize,
}

impl<T: Default + Copy, const MAX_SIZE: usize> Default for StackVector<T, MAX_SIZE> {
fn default() -> Self {
Self {
data: [T::default(); MAX_SIZE],
size: 0,
}
}
}

impl<T: Default, const MAX_SIZE: usize> StackVector<T, MAX_SIZE> {
pub fn push(&mut self, value: T) -> bool {
Collaborator:
Do you mind panicking instead of returning a bool?

Collaborator (Author):
I believe we shouldn't panic by default. If a caller needs that, it can use if (!push(..)) { panic!(...); }.

if self.size < MAX_SIZE {
self.data[self.size] = value;
self.size += 1;
true
} else {
false
}
}

pub fn is_empty(&self) -> bool {
self.size == 0
}

pub fn get_free_capacity(&self) -> usize {
MAX_SIZE - self.size
}

pub fn clear(&mut self) {
self.size = 0;
}

pub fn as_slice(&self) -> &[T] {
&self.data[..self.size]
}

pub fn into_vec(mut self) -> Vec<T> {
let mut v = Vec::with_capacity(self.size);
for i in 0..self.size {
v.push(std::mem::take(&mut self.data[i]));
}
self.size = 0;
v
}
}
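
For illustration, a minimal usage sketch of the new StackVector (the type alias, function, and values here are hypothetical, not part of this PR); it shows how a caller can react to push reporting overflow via its return value, as discussed in the thread above:

// Illustrative sketch only, assuming the StackVector API added in this diff.
type SmallBuffer = StackVector<u64, 4>;

fn collect_capped(values: &[u64]) -> Vec<u64> {
    let mut buffer = SmallBuffer::default();
    for &value in values {
        // push returns false instead of panicking when the buffer is full;
        // a caller that treats overflow as a bug can panic here itself.
        if !buffer.push(value) {
            break; // capacity reached, drop the remaining values
        }
    }
    // into_vec returns only the first `size` elements as a Vec.
    buffer.into_vec()
}
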
4 changes: 2 additions & 2 deletions src/request.rs
@@ -239,11 +239,11 @@ impl Request {
}

fn calculate_tokens(url_lower_cased: &str) -> Vec<utils::Hash> {
let mut tokens = vec![];
let mut tokens = utils::TokensBuffer::default();
utils::tokenize_pooled(url_lower_cased, &mut tokens);
// Add zero token as a fallback to wildcard rule bucket
tokens.push(0);
tokens
tokens.into_vec()
}

#[cfg(test)]
26 changes: 13 additions & 13 deletions src/utils.rs
@@ -6,6 +6,8 @@ use seahash::hash;
#[cfg(target_pointer_width = "32")]
use seahash::reference::hash;

use crate::flatbuffers::unsafe_tools::StackVector;

pub type Hash = u64;

// A smaller version of Hash that is used in serialized format.
@@ -27,25 +29,23 @@ fn is_allowed_filter(ch: char) -> bool {
ch.is_alphanumeric() || ch == '%'
}

pub(crate) const TOKENS_BUFFER_SIZE: usize = 128;
pub(crate) const TOKENS_BUFFER_RESERVED: usize = 1;
const TOKENS_MAX: usize = TOKENS_BUFFER_SIZE - TOKENS_BUFFER_RESERVED;
pub type TokensBuffer = StackVector<Hash, 256>;

fn fast_tokenizer_no_regex(
pattern: &str,
is_allowed_code: &dyn Fn(char) -> bool,
skip_first_token: bool,
skip_last_token: bool,
tokens_buffer: &mut Vec<Hash>,
tokens_buffer: &mut TokensBuffer,
) {
// let mut tokens_buffer_index = 0;
let mut inside: bool = false;
let mut start = 0;
let mut preceding_ch: Option<char> = None; // Used to check if a '*' is not just before a token

for (i, c) in pattern.char_indices() {
if tokens_buffer.len() >= TOKENS_MAX {
return;
if tokens_buffer.get_free_capacity() <= 1 {
return; // reserve one free slot for the zero token
}
if is_allowed_code(c) {
if !inside {
@@ -75,17 +75,17 @@
}
}

pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut TokensBuffer) {
fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
}

pub fn tokenize(pattern: &str) -> Vec<Hash> {
let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
let mut tokens_buffer = TokensBuffer::default();
tokenize_to(pattern, &mut tokens_buffer);
tokens_buffer
tokens_buffer.into_vec()
}

pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut TokensBuffer) {
fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
}

@@ -95,21 +95,21 @@ pub(crate) fn tokenize_filter(
skip_first_token: bool,
skip_last_token: bool,
) -> Vec<Hash> {
let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
let mut tokens_buffer = TokensBuffer::default();
tokenize_filter_to(
pattern,
skip_first_token,
skip_last_token,
&mut tokens_buffer,
);
tokens_buffer
tokens_buffer.into_vec()
}

pub(crate) fn tokenize_filter_to(
pattern: &str,
skip_first_token: bool,
skip_last_token: bool,
tokens_buffer: &mut Vec<Hash>,
tokens_buffer: &mut TokensBuffer,
) {
fast_tokenizer_no_regex(
pattern,
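
As a side note, a rough sketch (hypothetical, crate-internal helper; not part of this PR) of how the reserved slot in fast_tokenizer_no_regex and the zero-token fallback in Request::calculate_tokens fit together:

// Illustrative only; mirrors Request::calculate_tokens from this PR.
fn url_tokens_with_fallback(url_lower_cased: &str) -> Vec<Hash> {
    let mut tokens = TokensBuffer::default();
    // The tokenizer returns early once only one free slot is left ...
    tokenize_pooled(url_lower_cased, &mut tokens);
    // ... which is intended to leave room for the fallback zero token
    // that routes the request through the wildcard rule bucket.
    tokens.push(0);
    tokens.into_vec()
}
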
2 changes: 1 addition & 1 deletion tests/matching.rs
@@ -199,5 +199,5 @@ fn check_rule_matching_browserlike() {
let (blocked, passes) = bench_rule_matching_browserlike(&engine, &requests);
let msg = "The number of blocked/passed requests has changed. ".to_string()
+ "If this is expected, update the expected values in the test.";
assert_eq!((blocked, passes), (106860, 136085), "{msg}");
assert_eq!((blocked, passes), (106861, 136084), "{msg}");
Collaborator:
Why is this expected?

Collaborator (Author):
I mentioned it in the PR description: we hit the token limit for one rule. If you take master and increase the limits, you will get the same behavior.

Collaborator:
Never mind, the reason is in the description.

}
8 changes: 3 additions & 5 deletions tests/unit/filters/network.rs
@@ -1191,12 +1191,10 @@ mod parse_tests {
fn test_simple_pattern_tokenization() {
let rule = "||some.primewire.c*/sw$script,1p";
let filter = NetworkFilter::parse(rule, true, ParseOptions::default()).unwrap();
let mut tokens_buffer = utils::TokensBuffer::default();
assert_eq!(
filter.get_tokens_optimized(),
FilterTokens::Other(vec![
utils::fast_hash("some"),
utils::fast_hash("primewire")
])
filter.get_tokens_optimized(&mut tokens_buffer),
FilterTokens::Other(&[utils::fast_hash("some"), utils::fast_hash("primewire")])
);
}
}