Skip to content

Commit a2b408e

Browse files
Merge branch 'main' into dev/asolovev_spmd_cpu
2 parents cd72243 + 97f37b5 commit a2b408e

File tree

6 files changed

+313
-40
lines changed

6 files changed

+313
-40
lines changed

.ci/env/apt.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ function add_repo {
3232

3333
function install_dpcpp {
3434
# DPC++ compiler version monitored by Renovate and sets exact value available via apt
35-
sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp=2025.3.0-639 intel-oneapi-runtime-libs
35+
sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp=2025.3.1-760 intel-oneapi-runtime-libs
3636
}
3737

3838
function install_tbb {

MODULE.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ module(name = "onedal")
1919
bazel_dep(name = "platforms", version = "1.0.0")
2020
bazel_dep(name = "bazel_skylib", version = "1.8.2")
2121
bazel_dep(name = "rules_cc", version = "0.2.13")
22-
bazel_dep(name = "fmt", version = "12.0.0")
22+
bazel_dep(name = "fmt", version = "12.1.0")
2323

2424
declare_onedal_config = use_repo_rule("@onedal//dev/bazel/config:config.bzl", "declare_onedal_config")
2525
declare_onedal_config(name = "config",)

cpp/daal/src/algorithms/covariance/covariance_impl.i

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,7 @@ services::Status finalizeCovariance(size_t nFeatures, algorithmFPType nObservati
674674
}
675675

676676
/* Calculate resulting mean vector */
677+
PRAGMA_OMP_SIMD
677678
for (size_t i = 0; i < nFeatures; i++)
678679
{
679680
mean[i] = sums[i] * invNObservations;
@@ -690,6 +691,7 @@ services::Status finalizeCovariance(size_t nFeatures, algorithmFPType nObservati
690691

691692
for (size_t i = 0; i < nFeatures; i++)
692693
{
694+
PRAGMA_OMP_SIMD
693695
for (size_t j = 0; j < i; j++)
694696
{
695697
cov[i * nFeatures + j] = crossProduct[i * nFeatures + j] * diagInvSqrts[i] * diagInvSqrts[j];
@@ -702,6 +704,7 @@ services::Status finalizeCovariance(size_t nFeatures, algorithmFPType nObservati
702704
/* Calculate resulting covariance matrix */
703705
for (size_t i = 0; i < nFeatures; i++)
704706
{
707+
PRAGMA_OMP_SIMD
705708
for (size_t j = 0; j <= i; j++)
706709
{
707710
cov[i * nFeatures + j] = crossProduct[i * nFeatures + j] * multiplier;
@@ -712,6 +715,7 @@ services::Status finalizeCovariance(size_t nFeatures, algorithmFPType nObservati
712715
/* Copy results into symmetric upper triangle */
713716
for (size_t i = 0; i < nFeatures; i++)
714717
{
718+
PRAGMA_OMP_SIMD
715719
for (size_t j = 0; j < i; j++)
716720
{
717721
cov[j * nFeatures + i] = cov[i * nFeatures + j];

cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,22 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTreesWithoutCon
425425
return Status();
426426
}
427427

428+
/*
429+
// Predict by one [sub-]tree in parallel for all rows in data set.
430+
// The data is split into blocks and each block is processed in parallel.
431+
//
432+
// @param[in] aX Pointer to the input data set
433+
// @param[in] aNode Pointer to the tree node array
434+
// @param[in] treeSize Number of nodes in the [sub-]tree
435+
// @param[in] nBlocks Number of data blocks
436+
// @param[in] nCols Number of features in the data set
437+
// @param[in] blockSize Number of rows in one data block
438+
// @param[in] residualSize Number of rows in the last data block (if any)
439+
// @param[out] prob Pointer to the class probabilities array
440+
// @param[in] iTree Index of the tree in the forest
441+
//
442+
// @return Status of computations
443+
*/
428444
template <typename algorithmFPType, CpuType cpu>
429445
Status PredictClassificationTask<algorithmFPType, cpu>::parallelPredict(const algorithmFPType * const aX, const DecisionTreeNode * const aNode,
430446
const size_t treeSize, const size_t nBlocks, const size_t nCols,
@@ -498,6 +514,21 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTreeCommon(cons
498514
return Status();
499515
}
500516

517+
/*
518+
// Predict by one [sub-]tree for a block of data points.
519+
// Generic template implementation for all supported data types and various instruction set architectures.
520+
//
521+
// @param[in] x Pointer to the input data block
522+
// @param[in] sizeOfBlock Number of rows in the data block
523+
// @param[in] nCols Number of features in the data set
524+
// @param[in] tFI Pointer to the array of feature indices for tree nodes
525+
// @param[in] tLC Pointer to the array of left child indices for split nodes, or class indices for leaf nodes
526+
// @param[in] tFV Pointer to the array of feature values for nodes, i.e. the values that define the splits (x[k][tFI[j]] > tFV[j])
527+
// @param[out] prob Pointer to the class probabilities array
528+
// @param[in] iTree Index of the tree in the forest
529+
//
530+
// @return Status of computations
531+
*/
501532
template <typename algorithmFPType, CpuType cpu>
502533
DAAL_FORCEINLINE Status PredictClassificationTask<algorithmFPType, cpu>::predictByTree(const algorithmFPType * const x, const size_t sizeOfBlock,
503534
const size_t nCols, const featureIndexType * const tFI,
@@ -510,6 +541,21 @@ DAAL_FORCEINLINE Status PredictClassificationTask<algorithmFPType, cpu>::predict
510541

511542
#if defined(__AVX512F__) && defined(DAAL_INTEL_CPP_COMPILER)
512543

544+
/*
545+
// Predict by one [sub-]tree for a block of data points.
546+
// Template specialization for single precision data and AVX512 ISA
547+
//
548+
// @param[in] x Pointer to the input data block
549+
// @param[in] sizeOfBlock Number of rows in the data block
550+
// @param[in] nCols Number of features in the data set
551+
// @param[in] feat_idx Pointer to the array of feature indices for tree nodes
552+
// @param[in] left_son Pointer to the array of left child indices for split nodes, or class indices for leaf nodes
553+
// @param[in] split_point Pointer to the array of feature values for nodes, i.e. the values that define the splits (x[k][feat_idx[j]] > split_point[k][j])
554+
// @param[out] resPtr Pointer to the class probabilities array
555+
// @param[in] iTree Index of the tree in the forest
556+
//
557+
// @return Status of computations
558+
*/
513559
template <>
514560
DAAL_FORCEINLINE Status PredictClassificationTask<float, avx512>::predictByTree(const float * const x, const size_t sizeOfBlock, const size_t nCols,
515561
const featureIndexType * const feat_idx,
@@ -572,6 +618,21 @@ DAAL_FORCEINLINE Status PredictClassificationTask<float, avx512>::predictByTree(
572618
}
573619
}
574620

621+
/*
622+
// Predict by one [sub-]tree for a block of data points.
623+
// Template specialization for double precision data and AVX512 ISA
624+
//
625+
// @param[in] x Pointer to the input data block
626+
// @param[in] sizeOfBlock Number of rows in the data block
627+
// @param[in] nCols Number of features in the data set
628+
// @param[in] feat_idx Pointer to the array of feature indices for tree nodes
629+
// @param[in] left_son Pointer to the array of left child indices for split nodes, or class indices for leaf nodes
630+
// @param[in] split_point Pointer to the array of feature values for nodes, i.e. the values that define the splits (x[k][feat_idx[j]] > split_point[k][j])
631+
// @param[out] resPtr Pointer to the class probabilities array
632+
// @param[in] iTree Index of the tree in the forest
633+
//
634+
// @return Status of computations
635+
*/
575636
template <>
576637
DAAL_FORCEINLINE Status PredictClassificationTask<double, avx512>::predictByTree(const double * const x, const size_t sizeOfBlock, const size_t nCols,
577638
const featureIndexType * const feat_idx,
@@ -631,14 +692,23 @@ DAAL_FORCEINLINE Status PredictClassificationTask<double, avx512>::predictByTree
631692
}
632693
#endif // if defined(__AVX512F__) && defined(DAAL_INTEL_CPP_COMPILER)
633694

695+
/*
696+
// Predict by all trees for all rows in data set.
697+
//
698+
// Parallelism is organized in two levels:
699+
//
700+
// 1) Data set is split into blocks and the outer parallel loop processes the blocks.
701+
// 2) For each block, the inner parallel loop computes the predictions in parallel
702+
// for each row across all trees in the forest.
703+
*/
634704
template <typename algorithmFPType, CpuType cpu>
635705
Status PredictClassificationTask<algorithmFPType, cpu>::predictByAllTrees(const size_t nTreesTotal, const DimType & dim)
636706
{
637707
WriteOnlyRows<algorithmFPType, cpu> resBD(_res, 0, dim.nRowsTotal);
638708
DAAL_CHECK_BLOCK_STATUS(resBD);
639709
WriteOnlyRows<algorithmFPType, cpu> probBD(_prob, 0, dim.nRowsTotal);
640710
DAAL_CHECK_BLOCK_STATUS(probBD);
641-
const bool bUseTLS(_nClasses > s_cMaxClassesBufSize);
711+
const bool bUseTLS(_nClasses > s_cMaxClassesBufSize); //// check if dynamically allocated local storage is needed
642712
const size_t nCols(_data->getNumberOfColumns());
643713
daal::SafeStatus safeStat;
644714
algorithmFPType * const probPtr = probBD.get();
@@ -665,6 +735,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByAllTrees(const
665735
}
666736
else
667737
{
738+
// Dynamically allocated thread-local storage for class counters
668739
ClassesCounterTls lsData(_nClasses);
669740
daal::threader_for(dim.nDataBlocks, dim.nDataBlocks, [&](const size_t iBlock) {
670741
const size_t iStartRow = iBlock * dim.nRowsInBlock;
@@ -1002,6 +1073,31 @@ DAAL_FORCEINLINE Status PredictClassificationTask<float, avx512>::predictOneRowB
10021073
}
10031074
#endif
10041075

1076+
/*
1077+
// Predicts classes for all input data points using all trees in the forest.
1078+
//
1079+
// The computations are done in two steps:
1080+
//
1081+
// Variant A (disabled due to performance issues):
1082+
// 1) Parallel prediction over trees:
1083+
// Each thread processes a subset of trees and accumulates the results in
1084+
// thread-local storage.
1085+
// 2) The results from different threads are merged by summing the probabilities
1086+
// for each class across all threads and computing the final predictions
1087+
// as the class with the maximum probability.
1088+
//
1089+
// Variant B (used currently):
1090+
// 1) Sequential prediction over trees:
1091+
// The trees are processed one after another, with each tree's predictions being
1092+
// computed and accumulated before moving on to the next tree.
1093+
// When computing the predictions for each tree, parallelism is used over data points.
1094+
// 2) After all trees have been processed, the final predictions are computed
1095+
// as the class with the maximum accumulated probability.
1096+
//
1097+
// @param[in] nTreesTotal Total number of trees in the forest
1098+
//
1099+
// @return Status of computation
1100+
*/
10051101
template <typename algorithmFPType, CpuType cpu>
10061102
Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTrees(size_t nTreesTotal)
10071103
{
@@ -1037,8 +1133,13 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
10371133
// (excessive memory and CPU resources usage), especially on systems with high number of cores
10381134
if (false)
10391135
{
1136+
// Variant A: Parallel prediction over trees with thread-local storage
1137+
1138+
// Use thread local storage to accumulate results from different trees:
1139+
// Each thread stores the results for all rows of the input data, but only for the subset of trees processed by the thread.
10401140
daal::static_tls<algorithmFPType *> tlsData([=]() { return service_scalable_calloc<algorithmFPType, cpu>(_nClasses * nRowsOfRes); });
10411141

1142+
// Parallel prediction over trees
10421143
daal::static_threader_for(numberOfTrees, [&, nCols](const size_t iTree, size_t tid) {
10431144
const size_t treeSize = _aTree[iTree]->getNumberOfRows();
10441145
const DecisionTreeNode * const aNode = (const DecisionTreeNode *)(*_aTree[iTree]).getArray();
@@ -1049,13 +1150,15 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
10491150
const size_t localBlockSize = 256; // TODO: Why can't this be the class value _blockSize?
10501151
const size_t nBlocks = nRowsOfRes / localBlockSize + !!(nRowsOfRes % localBlockSize);
10511152

1153+
// Merge results from different threads by summing the class counters for each class across all threads
10521154
daal::threader_for(nBlocks, nBlocks, [&](const size_t iBlock) {
10531155
const size_t begin = iBlock * localBlockSize;
10541156
const size_t end = services::internal::min<cpu, size_t>(nRowsOfRes, begin + localBlockSize);
10551157

10561158
services::internal::service_memset_seq<algorithmFPType, cpu>(commonBufVal + begin * _nClasses, algorithmFPType(0),
10571159
(end - begin) * _nClasses);
10581160

1161+
// Sum class counters results from different threads
10591162
for (size_t tid = 0; tid < nThreads; ++tid)
10601163
{
10611164
algorithmFPType * buf = tlsData.local(tid);
@@ -1071,6 +1174,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
10711174

10721175
if (prob != nullptr)
10731176
{
1177+
// Normalize to get class probabilities
10741178
for (size_t i = begin; i < end; ++i)
10751179
{
10761180
algorithmFPType sum(0);
@@ -1103,12 +1207,14 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
11031207
}
11041208
else
11051209
{
1210+
// Variant B: Sequential prediction over trees with parallelism over data points
11061211
services::internal::service_memset<algorithmFPType, cpu>(commonBufVal, algorithmFPType(0), nRowsOfRes * _nClasses);
11071212

11081213
for (size_t iTree = 0; iTree < numberOfTrees; ++iTree)
11091214
{
11101215
const size_t treeSize = _aTree[iTree]->getNumberOfRows();
11111216
const DecisionTreeNode * const aNode = (const DecisionTreeNode *)(*_aTree[iTree]).getArray();
1217+
// Predict using the current tree, split the work over data points in parallel
11121218
parallelPredict(aX, aNode, treeSize, nBlocks, nCols, _blockSize, residualSize, commonBufVal, iTree);
11131219
}
11141220
if (prob != nullptr || res != nullptr)
@@ -1122,6 +1228,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
11221228

11231229
if (prob != nullptr)
11241230
{
1231+
// Normalize to get class probabilities
11251232
for (size_t i = begin; i < end; ++i)
11261233
{
11271234
algorithmFPType sum(0);
@@ -1143,6 +1250,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
11431250

11441251
if (res != nullptr)
11451252
{
1253+
// Determine predicted classes as those with maximum votes/probabilities
11461254
for (size_t i = begin; i < end; ++i)
11471255
{
11481256
res[i] = algorithmFPType(getMaxClass(commonBufVal + i * _nClasses));

0 commit comments

Comments
 (0)