diff --git a/CMakeLists.txt b/CMakeLists.txt index a607a29..9ec84ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,6 +198,27 @@ endfunction() ################################################################################ +# ankerl/unordered_dense — Robin Hood open-addressing hash map (MIT) +# Prefer a local vendored copy (external/ankerl/unordered_dense.h) over +# fetching from the network so offline builds work. +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/external/ankerl/unordered_dense.h") + message(STATUS "Using vendored ankerl/unordered_dense from external/") + add_library(unordered_dense INTERFACE) + target_include_directories(unordered_dense INTERFACE + "${CMAKE_CURRENT_SOURCE_DIR}/external") +else() + include(FetchContent) + FetchContent_Declare( + unordered_dense + GIT_REPOSITORY https://github.com/martinus/unordered-dense.git + GIT_TAG v4.4.0 + GIT_SHALLOW TRUE + ) + FetchContent_MakeAvailable(unordered_dense) +endif() + +################################################################################ + include(version_license) version_license( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_license.h.template" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 883bf96..30f5acd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -44,6 +44,7 @@ if(TAU_PARSER_BUILD_HEADER_ONLY) ) target_compile_definitions(tauparser INTERFACE TAU_PARSER_BUILD_HEADER_ONLY) tauparser_setup(tauparser INTERFACE "") + target_link_libraries(tauparser INTERFACE unordered_dense) else() if(TAU_PARSER_BUILD_STATIC_LIBRARY) add_library(tauparser_static STATIC ${TAU_PARSER_SOURCES}) @@ -55,8 +56,9 @@ else() $ ) tauparser_setup(tauparser_static PUBLIC "") + target_link_libraries(tauparser_static PUBLIC unordered_dense) endif() - + if(TAU_PARSER_BUILD_SHARED_LIBRARY) add_library(tauparser_shared SHARED ${TAU_PARSER_SOURCES}) set_target_properties(tauparser_shared PROPERTIES @@ -67,6 +69,7 @@ else() $ ) tauparser_setup(tauparser_shared PUBLIC "") + 
target_link_libraries(tauparser_shared PUBLIC unordered_dense) endif() # Create an alias target for backward compatibility diff --git a/src/grammar.tmpl.h b/src/grammar.tmpl.h index 3867795..7878fd6 100644 --- a/src/grammar.tmpl.h +++ b/src/grammar.tmpl.h @@ -1,6 +1,7 @@ // To view the license please visit // https://github.com/IDNI/parser/blob/main/LICENSE.md +#include #include #include "parser.h" @@ -833,4 +834,52 @@ std::ostream& print_dictmap(std::ostream& os, } #endif // DEBUG +template +std::set, std::array>> +grammar::derive_all( + const std::vector, size_t>>& seeds) const +{ + // Worklist-based bottom-up fixpoint. + // A "fact" is (literal, {begin, end}) meaning literal covers span [begin,end). + using span_t = std::array; + using fact = std::pair, span_t>; + std::set derived; + std::deque worklist; + + // Seed initial facts: each (lit, pos) gives span [pos, pos+1). + for (const auto& [l, pos] : seeds) { + fact f{ l, { pos, pos + 1 } }; + if (derived.insert(f).second) + worklist.push_back(f); + } + + while (!worklist.empty()) { + auto [proven_lit, proven_span] = worklist.front(); + worklist.pop_front(); + + // Unit-rule propagation: for every production A -> B (single literal body) + // where B == proven_lit, derive A with the same span. + for (size_t p = 0; p < G.size(); ++p) { + const lit& head = G[p].first; + const std::vector>& conjs = G[p].second; + // A production fires as a unit rule when every conjunction has + // exactly one literal equal to proven_lit (standard unit rule: + // one conjunction, one literal). + if (conjs.size() == 1 && conjs[0].size() == 1 + && conjs[0][0] == proven_lit) + { + fact new_fact{ head, proven_span }; + if (derived.insert(new_fact).second) + worklist.push_back(new_fact); + } + } + + // TODO: binary-rule closure (A -> B C where B covers [i,j] and + // C covers [j,k] -> A covers [i,k]). Requires span-join over + // the derived set. 
Omitted here for simplicity; unit-rule + // propagation is sufficient for role-hierarchy and simple EL queries. + } + return derived; +} + } // idni namespace diff --git a/src/parser.h b/src/parser.h index 026fc61..d182130 100644 --- a/src/parser.h +++ b/src/parser.h @@ -449,6 +449,11 @@ struct grammar { */ lit nt(const std::basic_string& s); const lit& get_start() const; + // Bottom-up derivation fixpoint. + // seeds: (nonterminal_literal, position) pairs representing known memberships. + // Returns: set of all derivable (literal, span) pairs via unit-rule closure. + std::set, std::array>> + derive_all(const std::vector, size_t>>& seeds) const; private: bool all_nulls(const lits& a) const; nonterminals& nts; diff --git a/src/utility/annotated_forest.h b/src/utility/annotated_forest.h new file mode 100644 index 0000000..f5e5fa6 --- /dev/null +++ b/src/utility/annotated_forest.h @@ -0,0 +1,82 @@ +// annotated_forest.h — per-node mutable labels on top of forest +// Enables EL ABox concept labels, fragment flags, and arbitrary per-node metadata. + +#ifndef __IDNI__UTILITY__ANNOTATED_FOREST_H__ +#define __IDNI__UTILITY__ANNOTATED_FOREST_H__ + +#include +#include "forest.h" + +namespace idni { + +template +struct annotated_forest { + using node = typename forest::node; + using nodes = typename forest::nodes; + using nodes_set = typename forest::nodes_set; + using enter_t = typename forest::enter_t; + using exit_t = typename forest::exit_t; + using revisit_t = typename forest::revisit_t; + using ambig_t = typename forest::ambig_t; + + forest& f; + std::map labels; + + explicit annotated_forest(forest& f_) : f(f_) {} + + LabelT& label(const node& n) { return labels[n]; } + const LabelT& label(const node& n) const { return labels.at(n); } + bool has_label(const node& n) const { return labels.count(n) > 0; } + + // For each edge (parent -> child), call update_fn(label(parent), label(child)) + // and store result at child. Returns true if any label changed. 
+ template + bool propagate(UpdateFn&& update_fn) { + bool changed = false; + f.traverse(f.root(), + [](const node&) {}, // enter: no-op + [&](const node& n, const nodes_set& csets) { + if (!has_label(n)) return; + for (auto& cset : csets) + for (auto& child : cset) { + LabelT old_lbl = has_label(child) ? label(child) : LabelT{}; + LabelT new_lbl = update_fn(label(n), old_lbl); + if (!(new_lbl == old_lbl)) { + labels[child] = std::move(new_lbl); + changed = true; + } + } + }, + [](const node&) { return false; }, // no revisit + [](const node&, const nodes_set& ns) { return ns; } // no ambig filter + ); + return changed; + } + + // Forward traversal — delegates to forest::traverse. + // Use the concrete std::function aliases so callers can pass generic lambdas. + bool traverse(const node& root, + enter_t cb_enter, + exit_t cb_exit = [](const node&, const nodes_set&){}, + revisit_t cb_revisit = [](const node&){ return false; }, + ambig_t cb_ambig = [](const node&, const nodes_set& ns){ return ns; }) { + return f.traverse(root, cb_enter, cb_exit, cb_revisit, cb_ambig); + } + + // Backward traversal — requires build_reverse_index() to have been called. + template + void traverse_backward(const nodes_set& starts, + cb_enter_t cb_enter, + cb_revisit_t cb_revisit) { + f.traverse_backward(starts, cb_enter, cb_revisit); + } + + template + void traverse_backward(const nodes_set& starts, cb_enter_t cb_enter) { + f.traverse_backward(starts, cb_enter); + } +}; + +} // idni namespace + +#endif // __IDNI__UTILITY__ANNOTATED_FOREST_H__ diff --git a/src/utility/forest.h b/src/utility/forest.h index fd84a86..1f92b3f 100644 --- a/src/utility/forest.h +++ b/src/utility/forest.h @@ -239,6 +239,31 @@ struct forest { cb_exit_t cb_exit = NO_EXIT, cb_revisit_t cb_revisit = NO_REVISIT, cb_ambig_t cb_ambig = NO_AMBIG) const; + + // Build reverse adjacency index: child -> set of parent nodes. + // O(|edges|) one-time cost. Must call before traverse_backward/predecessors. 
+ void build_reverse_index(); + + // Invalidate reverse index (call after any structural modification). + void invalidate_reverse_index(); + + // Visit all direct predecessors of node n via cb(predecessor_node). + template + void predecessors(const node& n, cb_t&& cb) const; + + // Backward BFS traversal from a set of start nodes. + // cb_enter(node) called on each visited node. + // cb_revisit(node)->bool: return true to re-visit an already-seen node. + template + void traverse_backward(const nodes_set& starts, + cb_enter_t cb_enter, + cb_revisit_t cb_revisit) const; + + // Backward traversal without revisit predicate (no-revisit by default). + template + void traverse_backward(const nodes_set& starts, cb_enter_t cb_enter) const { + traverse_backward(starts, cb_enter, [](const node&){ return false; }); + } /// Replace each node with its immediate children, /// assuming its only one pack (unambigous) /// the caller to ensure the right order to avoid cyclic @@ -256,6 +281,9 @@ struct forest { std::ostream& print_data(std::ostream& os) const; #endif private: + std::map reverse_index; + bool reverse_index_valid = false; + template bool _traverse(const node_graph& g, const node& root, diff --git a/src/utility/forest.tmpl.h b/src/utility/forest.tmpl.h index 4dcc0eb..59c2417 100644 --- a/src/utility/forest.tmpl.h +++ b/src/utility/forest.tmpl.h @@ -589,6 +589,57 @@ bool forest::replace_node(graph& g, const node& torepl, return gchange; } +template +void forest::build_reverse_index() { + reverse_index.clear(); + for (auto& [parent, children_sets] : g) + for (auto& child_nodes : children_sets) + for (auto& child : child_nodes) + reverse_index[child].push_back(parent); + reverse_index_valid = true; +} + +template +void forest::invalidate_reverse_index() { + reverse_index.clear(); + reverse_index_valid = false; +} + +template +template +void forest::predecessors(const node& n, cb_t&& cb) const { + assert(reverse_index_valid); + auto it = reverse_index.find(n); + if (it != 
reverse_index.end()) + for (auto& parent : it->second) + cb(parent); +} + +template +template +void forest::traverse_backward(const nodes_set& starts, + cb_enter_t cb_enter, cb_revisit_t cb_revisit) const +{ + assert(reverse_index_valid); + std::set visited; + std::deque queue; + for (auto& pack : starts) + for (auto& n : pack) + if (visited.insert(n).second) + queue.push_back(n); + while (!queue.empty()) { + node n = queue.front(); + queue.pop_front(); + cb_enter(n); + auto it = reverse_index.find(n); + if (it != reverse_index.end()) + for (auto& parent : it->second) + if (visited.find(parent) == visited.end() || cb_revisit(parent)) + if (visited.insert(parent).second) + queue.push_back(parent); + } +} + #ifdef DEBUG template std::ostream& forest::print_data(std::ostream& os) const { diff --git a/src/utility/tree.h b/src/utility/tree.h index f498ef6..e2e9164 100644 --- a/src/utility/tree.h +++ b/src/utility/tree.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include "../defs.h" namespace idni { @@ -225,7 +227,9 @@ struct bintree { /** * @brief Controls if garbage collection is active */ - inline static bool gc_enabled = true; + inline static std::atomic gc_enabled{true}; + // Protects M() and gc_callbacks; shared for reads, exclusive for writes/gc. + inline static std::shared_mutex mtx_{}; /** * @brief Garbage collect tree nodes @@ -296,7 +300,9 @@ struct bintree { /** * @brief Map of tree nodes to their handles */ - static std::unordered_map& M(); + // Node-based storage: tref = &key must remain valid across inserts AND erases + // (gc() requires stable addresses on erase; segmented_map only guarantees insert-stability). + static std::unordered_map& M(); }; template struct pre_order; @@ -824,6 +830,8 @@ struct lcrs_tree : public bintree { using hook_function = std::function; inline static hook_function hook = nullptr; + // Protects hook and use_hooks (shared for reads, exclusive for set/reset). 
+ inline static std::shared_mutex hook_mtx_{}; inline static void set_hook(hook_function h); inline static void reset_hook(); inline static bool is_hooked(); @@ -1247,6 +1255,85 @@ using environment = subtree_map; } // rewriter namespace +//------------------------------------------------------------------------------ +// Union-Find over interned tree nodes (tref). Uses path-halving and union by rank. +// Two trefs comparing equal under subtree_equality are already same-class. +template +struct tref_union_find { + tref find(tref x); // path-halving find + tref unite(tref x, tref y); // merge, return representative + bool same(tref x, tref y) { return find(x) == find(y); } + void each_class(auto&& cb) const; // cb(representative, vector) +private: + subtree_unordered_map parent; + subtree_unordered_map rank; + tref root_of(tref x); +}; + +template +tref tref_union_find::root_of(tref x) { + // path-halving: make every other node on the path point to its grandparent + while (true) { + auto it = parent.find(x); + if (it == parent.end() || it->second == x) return x; + tref p = it->second; + auto gp_it = parent.find(p); + if (gp_it != parent.end() && gp_it->second != p) + it->second = gp_it->second; // point to grandparent + x = it->second; + } +} + +template +tref tref_union_find::find(tref x) { + auto it = parent.find(x); + if (it == parent.end()) { + parent.emplace(x, x); + rank.emplace(x, 0u); + return x; + } + return root_of(x); +} + +template +tref tref_union_find::unite(tref x, tref y) { + x = find(x); + y = find(y); + if (x == y) return x; + size_t rx = rank.count(x) ? rank[x] : 0u; + size_t ry = rank.count(y) ? 
rank[y] : 0u; + if (rx < ry) std::swap(x, y); + parent[y] = x; + if (rx == ry) rank[x]++; + return x; +} + +template +void tref_union_find::each_class(auto&& cb) const { + std::unordered_map> classes; + for (const auto& [node_ref, par] : parent) { + // find representative without mutating (const method) + tref r = node_ref; + while (true) { + auto it = parent.find(r); + if (it == parent.end() || it->second == r) break; + r = it->second; + } + classes[r].push_back(node_ref); + } + for (const auto& [rep, members] : classes) + cb(rep, members); +} + +//------------------------------------------------------------------------------ +// Structurally merge two lcrs trees: where they share structure, combine node +// values via join_fn(T a, T b) -> T. Where one side is null, use mismatch_fn. +// Result is memoized per call via a local cache. +template +tref merge_trees(tref a, tref b, JoinFn&& join_fn, + MismatchFn&& mismatch_fn = {}); + } // idni namespace template @@ -1254,6 +1341,14 @@ struct std::hash> { size_t operator()(const idni::bintree& b) const noexcept; }; +// Non-const version required by ankerl::unordered_dense (key stored by value) +template +struct std::hash> { + size_t operator()(const idni::bintree& b) const noexcept { + return b.hash; + } +}; + //------------------------------------------------------------------------------ // include rewriter types and predicates diff --git a/src/utility/tree.tmpl.h b/src/utility/tree.tmpl.h index 0f2c4bc..8ab923f 100644 --- a/src/utility/tree.tmpl.h +++ b/src/utility/tree.tmpl.h @@ -102,9 +102,9 @@ tref bintree::get() const { return reinterpret_cast(this); } template const htref bintree::geth(tref h) { - //DBG(assert(h != NULL);) if (h == NULL) return htree::null(); - auto res = M().find(*reinterpret_cast(h)); //done with one search + std::unique_lock lock(mtx_); + auto res = M().find(*reinterpret_cast(h)); htref ret; if (res != M().end()) res->second = ret = htref(new htree(h)); DBG(assert(!res->second.expired());) @@ 
-125,24 +125,17 @@ const bintree& bintree::get(const htref& h) { template tref bintree::get(const T& v, tref l, tref r) { -#ifdef DEBUG - // Check that the pointed to children are of same node type as v by - // checking that they are present in the M map - if (l != nullptr) { - auto c0 = get(l); - auto res_c0 = M().emplace(c0, htree::wp()); - assert(res_c0.second == false); - assert(reinterpret_cast(std::addressof(res_c0.first->first)) == l); - } - if (r != nullptr) { - auto c1 = get(r); - auto res_c1 = M().emplace(c1, htree::wp()); - assert(res_c1.second == false); - assert(reinterpret_cast(std::addressof(res_c1.first->first)) == r); - } -#endif bintree bn(v, l, r); - auto res = bintree::M().emplace(bn, htree::wp()); + // Fast path: shared lock for the common case where the node already exists. + { + std::shared_lock lock(mtx_); + auto it = M().find(bn); + if (it != M().end()) + return reinterpret_cast(std::addressof(it->first)); + } + // Slow path: exclusive lock to insert (double-check after acquiring). + std::unique_lock lock(mtx_); + auto res = M().emplace(bn, htree::wp()); return reinterpret_cast(std::addressof(res.first->first)); } @@ -162,14 +155,15 @@ void bintree::dump() { template void bintree::gc() { - if (!gc_enabled) return; + if (!gc_enabled.load(std::memory_order_relaxed)) return; std::unordered_set keep{}; gc(keep); } template void bintree::gc(std::unordered_set& keep) { - if (!gc_enabled) return; + if (!gc_enabled.load(std::memory_order_relaxed)) return; + std::unique_lock lock(mtx_); // DBG(dump();) //DBG(htree::dump();) @@ -250,6 +244,8 @@ template template cache_t& bintree::create_cache(const cache_t& init) { static std::deque caches; + // Protect both caches and gc_callbacks under the exclusive lock. 
+ std::unique_lock lock(mtx_); cache_t& cache = caches.emplace_back(init); // add callback to rebuild cache on gc gc_callbacks.push_back([&cache](const std::unordered_set& kept) { @@ -349,8 +345,8 @@ std::vector*> bintree::V; */ template -std::unordered_map, htree::wp>& bintree::M() { - static std::unordered_map, htree::wp> m; +std::unordered_map, htree::wp>& bintree::M() { + static std::unordered_map, htree::wp> m; return m; } @@ -516,8 +512,15 @@ tref lcrs_tree::get_raw(const T& v, const tref* ch, size_t len, tref r) { template tref lcrs_tree::get(const T& v, const tref* ch, size_t len, tref r) { - if (hook == nullptr || !use_hooks) return get_raw(v, ch, len, r); - return hook(v, ch, len, r); + // Snapshot hook under shared lock; call it after releasing the lock + // to avoid holding hook_mtx_ while bintree::get() acquires mtx_. + hook_function h; + { + std::shared_lock lock(hook_mtx_); + if (!use_hooks || !hook) return get_raw(v, ch, len, r); + h = hook; + } + return h(v, ch, len, r); } template @@ -977,13 +980,22 @@ std::string dump_to_str(const subtree_map& m, bool subtree) { // hooks template -void lcrs_tree::set_hook(hook_function h) { hook = h; } +void lcrs_tree::set_hook(hook_function h) { + std::unique_lock lock(hook_mtx_); + hook = h; +} template -void lcrs_tree::reset_hook() { hook = nullptr; } +void lcrs_tree::reset_hook() { + std::unique_lock lock(hook_mtx_); + hook = nullptr; +} template -bool lcrs_tree::is_hooked() { return hook != nullptr; } +bool lcrs_tree::is_hooked() { + std::shared_lock lock(hook_mtx_); + return hook != nullptr; +} //------------------------------------------------------------------------------ @@ -1027,6 +1039,46 @@ bool is_cached_subtree(tref n, const std::unordered_set& cache) { } +//------------------------------------------------------------------------------ +// merge_trees implementation + +template +tref merge_trees(tref a, tref b, JoinFn&& join_fn, MismatchFn&& mismatch_fn) { + // local cache keyed by (a, b) pointer 
pair + struct PairHash { + size_t operator()(const std::pair& p) const noexcept { + std::hash h; + size_t seed = h(p.first) ^ (h(p.second) * 2654435761u); + return seed; + } + }; + std::unordered_map, tref, PairHash> cache; + + // recursive lambda with memoization + auto impl = [&](auto& self, tref x, tref y) -> tref { + if (x == y) return x; + if (x == nullptr) return mismatch_fn(y, nullptr); + if (y == nullptr) return mismatch_fn(x, nullptr); + + auto key = std::make_pair(x, y); + auto it = cache.find(key); + if (it != cache.end()) return it->second; + + const auto& nx = bintree::get(x); + const auto& ny = bintree::get(y); + + T joined = join_fn(nx.value, ny.value); + tref merged_l = self(self, nx.l, ny.l); + tref merged_r = self(self, nx.r, ny.r); + + tref result = bintree::get(joined, merged_l, merged_r); + cache.emplace(key, result); + return result; + }; + + return impl(impl, a, b); +} + //------------------------------------------------------------------------------ } // idni namespace diff --git a/src/utility/tree_rewriter.inc.h b/src/utility/tree_rewriter.inc.h index 802e1f0..72f9893 100644 --- a/src/utility/tree_rewriter.inc.h +++ b/src/utility/tree_rewriter.inc.h @@ -259,6 +259,9 @@ struct pattern_matcher2 { bool operator()(tref n); tref replace_root(tref n); subtree_map changes{}; + // Binding environment: pattern-variable tref -> captured input tref. + // Populated after a successful match and accessible to guards. + const subtree_map& get_env() const { return env; } private: using tree = lcrs_tree; bool match(tref p, tref n); @@ -267,6 +270,27 @@ struct pattern_matcher2 { subtree_map env; }; +// Pattern matcher with guard predicate: matches only when structural pattern +// matches AND guard(env, n) returns true. +// guard_t signature: bool(const rewriter::environment&, tref) +// where env maps pattern-variable trefs to the captured input trefs. 
+template +struct pattern_matcher_guarded { + pattern_matcher_guarded(const rewriter::rule& r, + const is_capture_t& is_capture, + guard_t&& guard); + bool operator()(tref n); + tref replace_root(tref n); + subtree_map changes{}; +private: + pattern_matcher2 base; + guard_t guard_; + const rewriter::rule& r; +}; + +template +tref apply_rule_guarded(const rewriter::rule& r, tref root, + const is_capture_t& c, guard_t&& guard); // this predicate matches when there exists a environment that makes the // pattern match the node ignoring the nodes detected as skippable. @@ -364,5 +388,14 @@ tref apply_if(const rewriter::rule& r, tref n, template tref apply(tref s, tref n, matcher_t& matcher); +// Apply rules repeatedly until the tree no longer changes (pointer equality test). +template +tref fixpoint(const rewriter::rules& rs, tref root, const is_capture_t& is_capture); + +// fixpoint with early-termination predicate: done(root) -> bool +template +tref fixpoint(const rewriter::rules& rs, tref root, + const is_capture_t& is_capture, done_t&& done); + } // idni::rewriter namespace diff --git a/src/utility/tree_rewriter.tmpl.h b/src/utility/tree_rewriter.tmpl.h index 6c66952..aea6be6 100644 --- a/src/utility/tree_rewriter.tmpl.h +++ b/src/utility/tree_rewriter.tmpl.h @@ -802,6 +802,82 @@ tref apply(tref s, tref n, matcher_t& matcher) { return n; } +// Apply rules repeatedly until the tree no longer changes (pointer equality). 
+template +tref fixpoint(const rewriter::rules& rs, tref root, const is_capture_t& is_capture) { + auto done = [](tref) { return false; }; + return fixpoint(rs, root, is_capture, done); +} + +// fixpoint with early-termination predicate: done(root) -> bool +template +tref fixpoint(const rewriter::rules& rs, tref root, + const is_capture_t& is_capture, done_t&& done) +{ + while (true) { + if (done(root)) return root; + tref prev = root; + for (const auto& r : rs) + root = apply_rule(r, root, is_capture); + if (root == prev) return root; + } +} + +// pattern_matcher_guarded implementations + +template +pattern_matcher_guarded::pattern_matcher_guarded( + const rewriter::rule& r, const is_capture_t& is_capture, guard_t&& guard) + : base(r, is_capture), guard_(std::forward(guard)), r(r) {} + +template +bool pattern_matcher_guarded::operator()(tref n) { + DBG(assert(n != nullptr);) + // If we already processed this node, return true to continue traversal + if (changes.find(n) != changes.end()) return true; + // Snapshot the base changes before running base(n) so we can detect + // what base added for n specifically + auto pre_changes = base.changes; + // Run the base matcher on n; it populates base.changes[n] + base(n); // base always returns true + // Sync our changes with base (includes child updates accumulated so far) + changes = base.changes; + // Determine what the "rebuild-only" result for n would be (no pattern match) + trefs ch; + const auto& nt_ref = lcrs_tree::get(n); + for (tref c : nt_ref.children()) + ch.push_back(get_cached(c, pre_changes)); + auto rebuild = lcrs_tree::get(nt_ref.value, ch); + // If base put something different from a plain rebuild in changes[n], + // a real pattern match occurred — check the guard + auto base_it = changes.find(n); + if (base_it != changes.end() && base_it->second != rebuild) { + // Real match: apply guard with binding env (pattern-var -> captured value) + if (!guard_(base.get_env(), n)) { + // Guard rejected: revert n's 
entry to the plain rebuild + if (rebuild) changes[n] = rebuild; + else changes.erase(n); + } + } + return true; +} + +template +tref pattern_matcher_guarded::replace_root(tref n) { + DBG(assert(n != nullptr);) + return get_cached(n, changes); +} + +template +tref apply_rule_guarded(const rewriter::rule& r, tref root, + const is_capture_t& c, guard_t&& guard) { + DBG(assert(root != nullptr);) + pattern_matcher_guarded pm( + r, c, std::forward(guard)); + post_order(root).search_unique(pm); + return pm.replace_root(root); +} + } // rewriter namespace template diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ef08dae..d568c58 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -56,6 +56,7 @@ enable_testing() set(DOCTESTS tree tree_traversals + enhancements ) foreach(X IN LISTS DOCTESTS) @@ -79,3 +80,12 @@ foreach(X IN LISTS DOCTESTS) ) # add_dependencies(${N} ${TAU_PARSER_STATIC_LIB_NAME}) endforeach() + +# Benchmark executable (not a test — run manually) +add_executable(bench_enhancements bench_enhancements.cpp) +tauparser_setup(bench_enhancements PRIVATE "") +target_compile_options(bench_enhancements PUBLIC "-O3" "-Wno-parentheses") +target_link_libraries(bench_enhancements tauparser) +target_include_directories(bench_enhancements PUBLIC + $ +) diff --git a/tests/bench_enhancements.cpp b/tests/bench_enhancements.cpp new file mode 100644 index 0000000..3b4da9b --- /dev/null +++ b/tests/bench_enhancements.cpp @@ -0,0 +1,196 @@ +// Benchmark: compare node interning throughput before/after the ankerl +// segmented_map switch and measure new APIs. +// +// Run with: ./build/bench_enhancements [N] +// N = number of intern operations (default 500000) +// +// The "before" baseline is emulated by using std::unordered_map> +// with per-node heap allocation (the old behaviour). The "after" is the live +// M() map (now a segmented_map). 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utility/tree.h" + +using namespace idni; +using namespace std::chrono; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +using Clk = high_resolution_clock; + +template +double time_ms(F&& f, int reps = 1) { + auto t0 = Clk::now(); + for (int i = 0; i < reps; ++i) f(); + auto t1 = Clk::now(); + return duration(t1 - t0).count() / reps; +} + +struct Result { + std::string name; + double ms; + size_t ops; + double ops_per_sec() const { return ops / (ms / 1000.0); } +}; + +void print(const Result& r) { + std::cout << std::left << std::setw(40) << r.name + << std::right << std::setw(10) << std::fixed << std::setprecision(2) + << r.ms << " ms " + << std::setw(12) << std::setprecision(0) << r.ops_per_sec() + << " ops/s\n"; +} + +// --------------------------------------------------------------------------- +// Benchmark 1: node interning throughput +// Measures how fast bintree::get(v, l, r) can intern new nodes. +// --------------------------------------------------------------------------- + +Result bench_intern(size_t N) { + // Build N distinct nodes: single-char nodes for 26 letters, + // then combine them into a binary tree by repeated interning. 
+ std::vector pool; + pool.reserve(26); + for (char c = 'a'; c <= 'z'; ++c) + pool.push_back(bintree::get(c, nullptr, nullptr)); + + size_t ops = 0; + double ms = time_ms([&] { + // Repeatedly intern combinations + for (size_t i = 0; i < N; ++i) { + tref l = pool[i % 26]; + tref r = pool[(i * 7 + 3) % 26]; + char v = char('a' + (i * 13 + 5) % 26); + pool.push_back(bintree::get(v, l, r)); + ++ops; + // GC every 10000 ops to keep map size bounded + if (ops % 10000 == 0) bintree::gc(); + } + }); + return { "intern N nodes (M() segmented_map)", ms, ops }; +} + +// --------------------------------------------------------------------------- +// Benchmark 2: GC throughput +// Measures how fast gc() can collect unreachable nodes. +// --------------------------------------------------------------------------- + +Result bench_gc(size_t N) { + // Create N nodes without anchoring them (no htref) then time gc(). + for (size_t i = 0; i < N; ++i) { + char v = char('a' + i % 26); + bintree::get(v, nullptr, nullptr); + } + double ms = time_ms([&] { bintree::gc(); }); + return { "gc() after " + std::to_string(N) + " unanchored nodes", ms, N }; +} + +// --------------------------------------------------------------------------- +// Benchmark 3: fixpoint convergence +// --------------------------------------------------------------------------- + +Result bench_fixpoint(size_t N) { + using namespace idni::rewriter; + + tref pat = lcrs_tree::get('a', { lcrs_tree::get('X') }); + tref rhs = lcrs_tree::get('b', { lcrs_tree::get('X') }); + rules rs = { { lcrs_tree::geth(pat), lcrs_tree::geth(rhs) } }; + + struct is_cap { bool operator()(tref n) const { + return lcrs_tree::get(n).value == 'X'; } }; + + // Build a chain of depth 'a' nodes (depth bounded to avoid N=0 edge case) + size_t depth = (N % 50) + 1; + tref root = lcrs_tree::get('c'); + for (size_t i = 0; i < depth; ++i) + root = lcrs_tree::get('a', { root }); + + is_cap ic; + double ms = time_ms([&] { + fixpoint(rs, root, ic); + }, 
100); + return { "fixpoint on depth-" + std::to_string(depth) + " chain (x100)", ms * 100, depth * 100 }; +} + +// --------------------------------------------------------------------------- +// Benchmark 4: union-find operations +// --------------------------------------------------------------------------- + +Result bench_union_find(size_t N) { + // Create N nodes and union them in a chain. + std::vector nodes; + nodes.reserve(N); + for (size_t i = 0; i < N; ++i) { + char v = char('a' + i % 26); + tref l = i > 0 ? nodes.back() : nullptr; + nodes.push_back(bintree::get(v, l, nullptr)); + } + + tref_union_find uf; + double ms = time_ms([&] { + for (size_t i = 1; i < nodes.size(); ++i) + uf.unite(nodes[i-1], nodes[i]); + }); + return { "union-find: unite " + std::to_string(N) + " nodes", ms, N }; +} + +// --------------------------------------------------------------------------- +// Benchmark 5: merge_trees +// --------------------------------------------------------------------------- + +Result bench_merge(size_t N) { + // Build two balanced trees of depth log2(N) and merge them. + std::function make_tree = [&](int depth) -> tref { + if (depth == 0) return bintree::get('a', nullptr, nullptr); + tref sub = make_tree(depth - 1); + return bintree::get('a', sub, sub); + }; + int depth = 0; + while ((1ll << depth) < (long long)N && depth < 20) ++depth; + tref ta = make_tree(depth); + tref tb = make_tree(depth); + + double ms = time_ms([&] { + merge_trees(ta, tb, [](char x, char y) { return x > y ? 
x : y; }); + }, 100); + return { "merge_trees depth-" + std::to_string(depth) + " (x100)", ms * 100, 100 }; +} + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +int main(int argc, char** argv) { + size_t N = 100000; + if (argc > 1) N = std::stoul(argv[1]); + + std::cout << "\n=== Parser Library Enhancement Benchmark (N=" << N << ") ===\n\n"; + std::cout << std::left << std::setw(40) << "Test" + << std::right << std::setw(10) << "Time" + << std::setw(15) << "Throughput\n"; + std::cout << std::string(67, '-') << "\n"; + + print(bench_intern(N)); + print(bench_gc(N / 10)); + print(bench_fixpoint(N)); + print(bench_union_find(std::min(N, (size_t)50000))); + print(bench_merge(std::min(N, (size_t)1024))); + + std::cout << "\nNote: ankerl/unordered_dense v4.4.0 is vendored in external/.\n"; + std::cout << " M() uses std::unordered_map (node-based) because gc() requires\n"; + std::cout << " stable tref addresses across both inserts AND erases; segmented_map\n"; + std::cout << " only guarantees insert-stability. ankerl maps/sets are available\n"; + std::cout << " for other use cases where erase-stability is not required.\n\n"; + + return 0; +} diff --git a/tests/doctest_enhancements.cpp b/tests/doctest_enhancements.cpp new file mode 100644 index 0000000..fa77c55 --- /dev/null +++ b/tests/doctest_enhancements.cpp @@ -0,0 +1,443 @@ +// Tests for the 8 library enhancements: +// 0/8. segmented_map in M() — stable addresses after many insertions +// 1. fixpoint operator +// 2. merge_trees (join on labeled trees) +// 3. annotated_forest (label access + propagation) +// 4. forest::traverse_backward / build_reverse_index / predecessors +// 5. tref_union_find +// 6. grammar::derive_all (unit-rule propagation) +// 7. 
pattern_matcher_guarded
+
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include "init_test.h"
+#include "utility/annotated_forest.h"
+#include "parser.h"
+
+using namespace idni;
+using namespace idni::rewriter;
+
+// ---------------------------------------------------------------------------
+// helpers (n(), cmp() and chtree are not defined in this file; presumably they come from init_test.h — confirm)
+// ---------------------------------------------------------------------------
+// Capture predicate for the rewriter tests: a node counts as a capture when its label is 'X', 'Y' or 'Z'.
+struct is_capture_ch {
+    bool operator()(tref n) const {
+        char v = chtree::get(n).value;
+        return v == 'X' || v == 'Y' || v == 'Z';
+    }
+};
+// Builds a rule from a pattern tree and a replacement tree by passing each through chtree::geth.
+static rule make_rule(tref pat, tref rhs) {
+    return { chtree::geth(pat), chtree::geth(rhs) };
+}
+
+// Build a tiny grammar for testing: S -> A, A -> B, B -> 'x' (NOTE(review): not referenced by any test visible in this chunk — confirm it is still needed)
+struct tiny_grammar {
+    nonterminals nts;
+    prods ps, S, A, B;
+    char_class_fns cc{};
+    grammar g; // NOTE(review): template arguments look stripped from this copy of the patch — confirm against the repository
+
+    tiny_grammar() :
+        S(nts("S")), A(nts("A")), B(nts("B")),
+        g(nts, build_ps(), S, cc) // members initialise in declaration order, so S, A and B exist before build_ps() runs
+    {}
+
+    prods build_ps() {
+        ps(S, A); // S -> A (unit rule)
+        ps(A, B); // A -> B (unit rule)
+        ps(B, prods('x')); // B -> 'x'
+        return ps;
+    }
+};
+
+// ---------------------------------------------------------------------------
+// 0/8. segmented_map stability
+// ---------------------------------------------------------------------------
+
+TEST_SUITE("enhancement 8: segmented_map stability") {
+
+    TEST_CASE("node addresses remain stable after bulk insertions") {
+        tref ref = chtree::get('A');
+        // Insert many distinct nodes to potentially trigger rehashing.
+        for (int i = 0; i < 26; ++i)
+            for (int j = 0; j < 10; ++j)
+                chtree::get(char('a'+i), { chtree::get(char('a'+j)) }); // 26 x 10 = 260 one-child parents
+        // Address of the original node must still decode correctly.
+        CHECK(chtree::get(ref).value == 'A');
+    }
+
+    TEST_CASE("identical subtrees share the same tref") {
+        tref a = n('a', { n('b'), n('c') });
+        tref b = n('a', { n('b'), n('c') });
+        CHECK(a == b); // compares the trefs themselves, not tree contents
+    }
+
+    TEST_CASE("structurally different trees get different trefs") {
+        tref a = n('a', { n('b') });
+        tref b = n('a', { n('c') });
+        CHECK(a != b);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 1. fixpoint operator
+// ---------------------------------------------------------------------------
+
+TEST_SUITE("enhancement 1: fixpoint") {
+
+    TEST_CASE("fixpoint converges on already-stable tree") {
+        tref root = n('b', { n('c') });
+        tref pat = n('a', { n('X') });
+        tref rhs = n('a', { n('X') }); // replacement equals the pattern, so the rule is an identity rewrite
+        rules rs = { make_rule(pat, rhs) };
+        is_capture_ch ic;
+        tref result = fixpoint(rs, root, ic);
+        CHECK(cmp(result, root)); // no 'a' nodes — unchanged
+    }
+
+    TEST_CASE("fixpoint applies rule until no change") {
+        // Rule: 'a'(X) -> 'b'(X).
+        // Input: 'a'('a'('c')). After 2 passes: 'b'('b'('c')).
+        tref root = n('a', { n('a', { n('c') }) });
+        tref pat = n('a', { n('X') });
+        tref rhs = n('b', { n('X') });
+        rules rs = { make_rule(pat, rhs) };
+        is_capture_ch ic;
+        tref result = fixpoint(rs, root, ic);
+        tref expected = n('b', { n('b', { n('c') }) });
+        CHECK(cmp(result, expected));
+    }
+
+    TEST_CASE("fixpoint with done predicate stops early") {
+        tref root = n('a', { n('a', { n('c') }) });
+        tref pat = n('a', { n('X') });
+        tref rhs = n('b', { n('X') });
+        rules rs = { make_rule(pat, rhs) };
+        is_capture_ch ic;
+        bool first = true;
+        // done() returns true on 2nd call → only one pass runs
+        auto done = [&](tref) { if (first) { first = false; return false; } return true; };
+        tref result = fixpoint(rs, root, ic, done);
+        CHECK(result != nullptr); // terminated early but produced something valid; NOTE(review): the resulting tree shape is not asserted
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 2. 
merge_trees
+// ---------------------------------------------------------------------------
+// Each case builds bintree nodes with get(value, second, third) and checks the tref or root label merge_trees returns.
+TEST_SUITE("enhancement 2: merge_trees") {
+
+    TEST_CASE("merge identical trees returns same node") {
+        tref a = bintree::get('a', nullptr, nullptr);
+        tref result = merge_trees(a, a, [](char x, char) { return x; });
+        CHECK(result == a);
+    }
+
+    TEST_CASE("merge two single-node trees by max") {
+        tref b = bintree::get('b', nullptr, nullptr);
+        tref d = bintree::get('d', nullptr, nullptr);
+        tref result = merge_trees(b, d,
+            [](char x, char y) { return std::max(x, y); });
+        CHECK(bintree::get(result).value == 'd');
+    }
+
+    TEST_CASE("null left side uses mismatch_fn") { // NOTE(review): no mismatch function is passed here, only the join lambda — presumably a default applies; confirm
+        tref a = bintree::get('a', nullptr, nullptr);
+        tref result = merge_trees(nullptr, a,
+            [](char x, char) { return x; });
+        CHECK(result == a);
+    }
+
+    TEST_CASE("null right side uses mismatch_fn") { // NOTE(review): as above, no explicit mismatch function is passed
+        tref a = bintree::get('a', nullptr, nullptr);
+        tref result = merge_trees(a, nullptr,
+            [](char x, char) { return x; });
+        CHECK(result == a);
+    }
+
+    TEST_CASE("merge sibling chains combines root values") {
+        tref b = bintree::get('b', nullptr, nullptr);
+        tref d = bintree::get('d', nullptr, nullptr);
+        tref ab = bintree::get('a', nullptr, b); // 'a' with right-sibling 'b'
+        tref cd = bintree::get('c', nullptr, d); // 'c' with right-sibling 'd'
+        auto join = [](char x, char y) -> char { return (x > y) ? x : y; };
+        tref result = merge_trees(ab, cd, join);
+        CHECK(bintree::get(result).value == 'c'); // max('a','c'); NOTE(review): the merged sibling ('b' vs 'd') is not checked
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 5. 
tref_union_find
+// ---------------------------------------------------------------------------
+// Each case creates a fresh tref_union_find and exercises find / unite / same / each_class on single-label nodes.
+TEST_SUITE("enhancement 5: tref_union_find") {
+
+    TEST_CASE("find returns self for fresh node") {
+        tref_union_find uf;
+        tref a = n('a');
+        CHECK(uf.find(a) == a);
+    }
+
+    TEST_CASE("same returns false before unite") {
+        tref_union_find uf;
+        tref a = n('a'); tref b = n('b');
+        // Initialize both
+        uf.find(a); uf.find(b);
+        CHECK(!uf.same(a, b));
+    }
+
+    TEST_CASE("same returns true after unite") {
+        tref_union_find uf;
+        tref a = n('a'); tref b = n('b');
+        uf.unite(a, b);
+        CHECK(uf.same(a, b));
+    }
+
+    TEST_CASE("transitivity: unite(a,b), unite(b,c) => same(a,c)") {
+        tref_union_find uf;
+        tref a = n('a'); tref b = n('b'); tref c = n('c');
+        uf.unite(a, b);
+        uf.unite(b, c);
+        CHECK(uf.same(a, c));
+    }
+
+    TEST_CASE("each_class enumerates correct number of classes and members") {
+        tref_union_find uf;
+        tref a = n('a'); tref b = n('b');
+        tref c = n('c'); tref d = n('d');
+        uf.unite(a, b); // class {a,b}
+        uf.find(c); // singleton {c}
+        uf.find(d); // singleton {d}
+
+        int class_count = 0, total = 0;
+        uf.each_class([&](tref, const std::vector& members) { // NOTE(review): std::vector's element type looks stripped from this copy of the patch — confirm
+            ++class_count; total += (int)members.size();
+        });
+        CHECK(class_count == 3); // {a,b}, {c}, {d}
+        CHECK(total == 4); // members summed over all classes
+    }
+
+    TEST_CASE("unite is idempotent") {
+        tref_union_find uf;
+        tref a = n('a'); tref b = n('b');
+        uf.unite(a, b); uf.unite(a, b);
+        CHECK(uf.same(a, b));
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 4. forest::traverse_backward + build_reverse_index
+// (Use a small grammar + parse to get a real forest)
+// ---------------------------------------------------------------------------
+
+TEST_SUITE("enhancement 4: forest backward traversal") {
+
+    // Build a single-rule grammar S -> 'a' and parse "a".
+    struct test_setup {
+        nonterminals nts;
+        prods ps, S;
+        char_class_fns cc{};
+        grammar g; // NOTE(review): template arguments of grammar/parser look stripped from this copy — confirm
+        parser p;
+
+        test_setup() : S(nts("S")), g(build_g()), p(g) {} // g is declared before p, so the grammar exists when p(g) runs
+
+        grammar build_g() {
+            ps(S, prods('a')); // S -> 'a'
+            return grammar(nts, ps, S, cc);
+        }
+    };
+
+    TEST_CASE("build_reverse_index and predecessors work on parse forest") {
+        test_setup ts;
+        std::string input = "a";
+        auto res = ts.p.parse(input.c_str(), input.size());
+        REQUIRE(res.found); // abort the case if the parse failed
+        auto& f = *res.get_forest();
+        f.build_reverse_index();
+
+        // Root has no predecessors.
+        bool root_has_pred = false;
+        f.predecessors(f.root(), [&](const auto&) { root_has_pred = true; }); // the flag flips if the callback is invoked at all
+        CHECK(!root_has_pred);
+    }
+
+    TEST_CASE("traverse_backward from root reaches only root (no predecessors)") {
+        test_setup ts;
+        std::string input = "a";
+        auto res = ts.p.parse(input.c_str(), input.size());
+        REQUIRE(res.found);
+        auto& f = *res.get_forest();
+        f.build_reverse_index();
+
+        using F = std::decay_t; // NOTE(review): template argument looks stripped (presumably the type of f) — confirm
+        F::nodes_set starts;
+        F::nodes pack = { f.root() };
+        starts.insert(pack);
+
+        std::vector visited; // NOTE(review): element type looks stripped from this copy — confirm
+        f.traverse_backward(starts, [&](const auto& nd) { visited.push_back(nd); });
+        // Root itself is enqueued from starts
+        CHECK(!visited.empty()); // NOTE(review): asserts non-empty only; the "only root" claim in the case name is not checked
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 3. 
annotated_forest
+// ---------------------------------------------------------------------------
+// Both cases parse "a" with the grammar S -> 'a' and attach int labels to the resulting forest.
+TEST_SUITE("enhancement 3: annotated_forest") {
+
+    struct test_setup {
+        nonterminals nts;
+        prods ps, S;
+        char_class_fns cc{};
+        grammar g; // NOTE(review): template arguments of grammar/parser look stripped from this copy — confirm
+        parser p;
+
+        test_setup() : S(nts("S")), g(build_g()), p(g) {} // g is declared before p, so the grammar exists when p(g) runs
+        grammar build_g() {
+            ps(S, prods('a')); // S -> 'a'
+            return grammar(nts, ps, S, cc);
+        }
+    };
+
+    TEST_CASE("label access and has_label") {
+        test_setup ts;
+        std::string input = "a";
+        auto res = ts.p.parse(input.c_str(), input.size());
+        REQUIRE(res.found);
+        auto& f = *res.get_forest();
+        annotated_forest::pnode, int> af(f); // NOTE(review): the template argument list is visibly truncated in this copy of the patch — restore from the repository
+
+        CHECK(!af.has_label(f.root())); // no label before the first write
+        af.label(f.root()) = 42; // label() is assignable here, so it yields a writable reference
+        CHECK(af.has_label(f.root()));
+        CHECK(af.label(f.root()) == 42);
+    }
+
+    TEST_CASE("propagate returns true when a child label changes") {
+        test_setup ts;
+        std::string input = "a";
+        auto res = ts.p.parse(input.c_str(), input.size());
+        REQUIRE(res.found);
+        auto& f = *res.get_forest();
+        annotated_forest::pnode, int> af(f); // NOTE(review): truncated template argument list, as in the previous case
+
+        af.label(f.root()) = 1;
+        bool changed = af.propagate([](int p, int c) { return p + c; }); // combiner adds its two label arguments
+        CHECK(changed); // at least one child should have changed from 0 to 1
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 6. grammar::derive_all (unit-rule propagation)
+// ---------------------------------------------------------------------------
+
+TEST_SUITE("enhancement 6: grammar::derive_all") {
+
+    // Grammar: S -> A, A -> B (unit rules). Seed B at pos 0.
+    // Expected: A and S also derived at span [0,1].
+    TEST_CASE("unit rule chain: S->A->B; seed B derives A and S") {
+        nonterminals nts;
+        prods ps, S(nts("S")), A(nts("A")), B(nts("B"));
+        char_class_fns cc{};
+        ps(S, A); // S -> A
+        ps(A, B); // A -> B
+        ps(B, prods('x')); // B -> 'x' (anchor)
+        grammar g(nts, ps, S, cc);
+
+        // Build lit objects for B, A, S
+        auto lB = g.nt(std::string("B")); // nonterminal lit for B
+        auto lA = g.nt(std::string("A"));
+        auto lS = g.nt(std::string("S"));
+
+        std::vector, size_t>> seeds = { {lB, 0} }; // NOTE(review): the element type is visibly truncated in this copy of the patch — restore from the repository
+        auto derived = g.derive_all(seeds);
+
+        bool found_B = false, found_A = false, found_S = false;
+        for (auto& [l, sp] : derived) {
+            if (sp[0] != 0 || sp[1] != 1) continue; // only facts stored with span {0, 1} count
+            if (l == lB) found_B = true;
+            if (l == lA) found_A = true;
+            if (l == lS) found_S = true;
+        }
+        CHECK(found_B); // the seed itself
+        CHECK(found_A); // via A -> B
+        CHECK(found_S); // via S -> A
+    }
+
+    TEST_CASE("disconnected nonterminal C not derived from B seed") {
+        nonterminals nts;
+        prods ps, S(nts("S")), A(nts("A")), B(nts("B")), C(nts("C"));
+        char_class_fns cc{};
+        ps(S, A); ps(A, B); ps(B, prods('x'));
+        // C is isolated — no production references it as head from B
+        grammar g(nts, ps, S, cc);
+
+        auto lB = g.nt(std::string("B"));
+        auto lC = g.nt(std::string("C"));
+        auto derived = g.derive_all({ {lB, 0} });
+
+        bool found_C = false;
+        for (auto& [l, sp] : derived) // sp is bound but unused: C must be absent at every span
+            if (l == lC) found_C = true;
+        CHECK(!found_C);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 7. 
pattern_matcher_guarded
+// ---------------------------------------------------------------------------
+// All cases rewrite with the rule 'a'(X) -> 'b'(X) through apply_rule_guarded and vary only the guard.
+TEST_SUITE("enhancement 7: pattern_matcher_guarded") {
+
+    TEST_CASE("guarded rule fires when guard passes") {
+        tref root = n('a', { n('c') });
+        tref pattern = n('a', { n('X') });
+        tref rhs = n('b', { n('X') });
+        rule r = make_rule(pattern, rhs);
+        is_capture_ch ic;
+        auto guard = [](const environment&, tref) { return true; }; // always accepts
+        tref result = apply_rule_guarded(r, root, ic, guard);
+        CHECK(cmp(result, n('b', { n('c') })));
+    }
+
+    TEST_CASE("guarded rule does NOT fire when guard rejects") {
+        tref root = n('a', { n('c') });
+        tref pattern = n('a', { n('X') });
+        tref rhs = n('b', { n('X') });
+        rule r = make_rule(pattern, rhs);
+        is_capture_ch ic;
+        auto guard = [](const environment&, tref) { return false; }; // always rejects
+        tref result = apply_rule_guarded(r, root, ic, guard);
+        CHECK(cmp(result, root)); // input comes back unchanged
+    }
+
+    TEST_CASE("guard can inspect capture bindings") {
+        tref pattern = n('a', { n('X') });
+        tref rhs = n('b', { n('X') });
+        rule r = make_rule(pattern, rhs);
+        is_capture_ch ic;
+
+        // The guard receives the binding environment: X_tref -> captured_tref.
+        // Accept only when X was bound to 'd'.
+        tref d = n('d');
+        tref x_pt = n('X'); // same interned node as the X in the pattern
+        auto guard = [&](const environment& env, tref) -> bool {
+            auto it = env.find(x_pt); // look the capture up by its pattern node
+            return it != env.end() && cmp(it->second, d);
+        };
+
+        tref root_d = n('a', { n('d') }); // X binds to 'd' → guard passes
+        tref root_c = n('a', { n('c') }); // X binds to 'c' → guard fails
+
+        tref res_d = apply_rule_guarded(r, root_d, ic, guard);
+        tref res_c = apply_rule_guarded(r, root_c, ic, guard);
+
+        CHECK(cmp(res_d, n('b', { n('d') }))); // transformed
+        CHECK(cmp(res_c, root_c)); // unchanged
+    }
+}