@@ -425,6 +425,22 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTreesWithoutCon
425425 return Status ();
426426}
427427
428+ /*
429+ // Predict by one [sub-]tree in parallel for all rows in data set.
430+ // The data is split into blocks and each block is processed in parallel.
431+ //
432+ // @param[in] aX Pointer to the input data set
433+ // @param[in] aNode Pointer to the tree node array
434+ // @param[in] treeSize Number of nodes in the [sub-]tree
435+ // @param[in] nBlocks Number of data blocks
436+ // @param[in] nCols Number of features in the data set
437+ // @param[in] blockSize Number of rows in one data block
438+ // @param[in] residualSize Number of rows in the last data block (if any)
439+ // @param[out] prob Pointer to the class probabilities array
440+ // @param[in] iTree Index of the tree in the forest
441+ //
442+ // @return Status of computations
443+ */
428444template <typename algorithmFPType, CpuType cpu>
429445Status PredictClassificationTask<algorithmFPType, cpu>::parallelPredict(const algorithmFPType * const aX, const DecisionTreeNode * const aNode,
430446 const size_t treeSize, const size_t nBlocks, const size_t nCols,
@@ -498,6 +514,21 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTreeCommon(cons
498514 return Status ();
499515}
500516
517+ /*
518+ // Predict by one [sub-]tree for a block of data points.
519+ // Generic template implementation for all supported data types and various instruction set architectures.
520+ //
521+ // @param[in] x Pointer to the input data block
522+ // @param[in] sizeOfBlock Number of rows in the data block
523+ // @param[in] nCols Number of features in the data set
524+ // @param[in] tFI Pointer to the array of feature indices for tree nodes
525+ // @param[in] tLC Pointer to the array of left child indices for split nodes, or class indices for leaf nodes
526+ // @param[in] tFV Pointer to the array of feature values for nodes, i.e. the values that define the splits (x[k][tFI[j]] > tFV[j])
527+ // @param[out] prob Pointer to the class probabilities array
528+ // @param[in] iTree Index of the tree in the forest
529+ //
530+ // @return Status of computations
531+ */
501532template <typename algorithmFPType, CpuType cpu>
502533DAAL_FORCEINLINE Status PredictClassificationTask<algorithmFPType, cpu>::predictByTree(const algorithmFPType * const x, const size_t sizeOfBlock,
503534 const size_t nCols, const featureIndexType * const tFI,
@@ -510,6 +541,21 @@ DAAL_FORCEINLINE Status PredictClassificationTask<algorithmFPType, cpu>::predict
510541
511542#if defined(__AVX512F__) && defined(DAAL_INTEL_CPP_COMPILER)
512543
544+ /*
545+ // Predict by one [sub-]tree for a block of data points.
546+ // Template specialization for single precision data and AVX512 ISA
547+ //
548+ // @param[in] x Pointer to the input data block
549+ // @param[in] sizeOfBlock Number of rows in the data block
550+ // @param[in] nCols Number of features in the data set
551+ // @param[in] feat_idx Pointer to the array of feature indices for tree nodes
552+ // @param[in] left_son Pointer to the array of left child indices for split nodes, or class indices for leaf nodes
553+ // @param[in] split_point Pointer to the array of feature values for nodes, i.e. the values that define the splits (x[k][feat_idx[j]] > split_point[k][j])
554+ // @param[out] resPtr Pointer to the class probabilities array
555+ // @param[in] iTree Index of the tree in the forest
556+ //
557+ // @return Status of computations
558+ */
513559template <>
514560DAAL_FORCEINLINE Status PredictClassificationTask<float , avx512>::predictByTree(const float * const x, const size_t sizeOfBlock, const size_t nCols,
515561 const featureIndexType * const feat_idx,
@@ -572,6 +618,21 @@ DAAL_FORCEINLINE Status PredictClassificationTask<float, avx512>::predictByTree(
572618 }
573619}
574620
621+ /*
622+ // Predict by one [sub-]tree for a block of data points.
623+ // Template specialization for double precision data and AVX512 ISA
624+ //
625+ // @param[in] x Pointer to the input data block
626+ // @param[in] sizeOfBlock Number of rows in the data block
627+ // @param[in] nCols Number of features in the data set
628+ // @param[in] feat_idx Pointer to the array of feature indices for tree nodes
629+ // @param[in] left_son Pointer to the array of left child indices for split nodes, or class indices for leaf nodes
630+ // @param[in] split_point Pointer to the array of feature values for nodes, i.e. the values that define the splits (x[k][feat_idx[j]] > split_point[k][j])
631+ // @param[out] resPtr Pointer to the class probabilities array
632+ // @param[in] iTree Index of the tree in the forest
633+ //
634+ // @return Status of computations
635+ */
575636template <>
576637DAAL_FORCEINLINE Status PredictClassificationTask<double , avx512>::predictByTree(const double * const x, const size_t sizeOfBlock, const size_t nCols,
577638 const featureIndexType * const feat_idx,
@@ -631,14 +692,23 @@ DAAL_FORCEINLINE Status PredictClassificationTask<double, avx512>::predictByTree
631692}
632693#endif // if defined(__AVX512F__) && defined(DAAL_INTEL_CPP_COMPILER)
633694
695+ /*
696+ // Predict by all trees for all rows in data set.
697+ //
698+ // Parallelism is organized in two levels:
699+ //
700+ // 1) Data set is split into blocks and the outer parallel loop processes the blocks.
701+ // 2) For each block, the inner parallel loop computes the predictions in parallel
702+ // for each row across all trees in the forest.
703+ */
634704template <typename algorithmFPType, CpuType cpu>
635705Status PredictClassificationTask<algorithmFPType, cpu>::predictByAllTrees(const size_t nTreesTotal, const DimType & dim)
636706{
637707 WriteOnlyRows<algorithmFPType, cpu> resBD (_res, 0 , dim.nRowsTotal );
638708 DAAL_CHECK_BLOCK_STATUS (resBD);
639709 WriteOnlyRows<algorithmFPType, cpu> probBD (_prob, 0 , dim.nRowsTotal );
640710 DAAL_CHECK_BLOCK_STATUS (probBD);
641- const bool bUseTLS (_nClasses > s_cMaxClassesBufSize);
711+ const bool bUseTLS (_nClasses > s_cMaxClassesBufSize); // // check if dynamically allocated local storage is needed
642712 const size_t nCols (_data->getNumberOfColumns ());
643713 daal::SafeStatus safeStat;
644714 algorithmFPType * const probPtr = probBD.get ();
@@ -665,6 +735,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByAllTrees(const
665735 }
666736 else
667737 {
738+ // Dynamically allocated thread-local storage for class counters
668739 ClassesCounterTls lsData (_nClasses);
669740 daal::threader_for (dim.nDataBlocks , dim.nDataBlocks , [&](const size_t iBlock) {
670741 const size_t iStartRow = iBlock * dim.nRowsInBlock ;
@@ -1002,6 +1073,31 @@ DAAL_FORCEINLINE Status PredictClassificationTask<float, avx512>::predictOneRowB
10021073}
10031074#endif
10041075
1076+ /*
1077+ // Predicts classes for all input data points using all trees in the forest.
1078+ //
1079+ // The computations are done in two steps:
1080+ //
1081+ // Variant A (disabled due to performance issues):
1082+ // 1) Parallel prediction over trees:
1083+ // Each thread processes a subset of trees and accumulates the results in
1084+ // thread-local storage.
1085+ // 2) The results from different threads are merged by summing the probabilities
1086+ // for each class across all threads and computing the final predictions
1087+ // as the class with the maximum probability.
1088+ //
1089+ // Variant B (used currently):
1090+ // 1) Sequential prediction over trees:
1091+ // The trees are processed one after another, with each tree's predictions being
1092+ // computed and accumulated before moving on to the next tree.
1093+ // When computing the predictions for each tree, parallelism is used over data points.
1094+ // 2) After all trees have been processed, the final predictions are computed
1095+ // as the class with the maximum accumulated probability.
1096+ //
1097+ // @param[in] nTreesTotal Total number of trees in the forest
1098+ //
1099+ // @return Status of computation
1100+ */
10051101template <typename algorithmFPType, CpuType cpu>
10061102Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTrees(size_t nTreesTotal)
10071103{
@@ -1037,8 +1133,13 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
10371133 // (excessive memory and CPU resources usage), especially on systems with high number of cores
10381134 if (false )
10391135 {
1136+ // Variant A: Parallel prediction over trees with thread-local storage
1137+
1138+ // Use thread local storage to accumulate results from different trees:
1139+ // Each thread stores the results for all rows of the input data, but only for the subset of trees processed by the thread.
10401140 daal::static_tls<algorithmFPType *> tlsData ([=]() { return service_scalable_calloc<algorithmFPType, cpu>(_nClasses * nRowsOfRes); });
10411141
1142+ // Parallel prediction over trees
10421143 daal::static_threader_for (numberOfTrees, [&, nCols](const size_t iTree, size_t tid) {
10431144 const size_t treeSize = _aTree[iTree]->getNumberOfRows ();
10441145 const DecisionTreeNode * const aNode = (const DecisionTreeNode *)(*_aTree[iTree]).getArray ();
@@ -1049,13 +1150,15 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
10491150 const size_t localBlockSize = 256 ; // TODO: Why can't this be the class value _blockSize?
10501151 const size_t nBlocks = nRowsOfRes / localBlockSize + !!(nRowsOfRes % localBlockSize);
10511152
1153+ // Merge results from different threads by summing the class counters for each class across all threads
10521154 daal::threader_for (nBlocks, nBlocks, [&](const size_t iBlock) {
10531155 const size_t begin = iBlock * localBlockSize;
10541156 const size_t end = services::internal::min<cpu, size_t >(nRowsOfRes, begin + localBlockSize);
10551157
10561158 services::internal::service_memset_seq<algorithmFPType, cpu>(commonBufVal + begin * _nClasses, algorithmFPType (0 ),
10571159 (end - begin) * _nClasses);
10581160
1161+ // Sum class counters results from different threads
10591162 for (size_t tid = 0 ; tid < nThreads; ++tid)
10601163 {
10611164 algorithmFPType * buf = tlsData.local (tid);
@@ -1071,6 +1174,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
10711174
10721175 if (prob != nullptr )
10731176 {
1177+ // Normalize to get class probabilities
10741178 for (size_t i = begin; i < end; ++i)
10751179 {
10761180 algorithmFPType sum (0 );
@@ -1103,12 +1207,14 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
11031207 }
11041208 else
11051209 {
1210+ // Variant B: Sequential prediction over trees with parallelism over data points
11061211 services::internal::service_memset<algorithmFPType, cpu>(commonBufVal, algorithmFPType (0 ), nRowsOfRes * _nClasses);
11071212
11081213 for (size_t iTree = 0 ; iTree < numberOfTrees; ++iTree)
11091214 {
11101215 const size_t treeSize = _aTree[iTree]->getNumberOfRows ();
11111216 const DecisionTreeNode * const aNode = (const DecisionTreeNode *)(*_aTree[iTree]).getArray ();
1217+ // Predict using the current tree, split the work over data points in parallel
11121218 parallelPredict (aX, aNode, treeSize, nBlocks, nCols, _blockSize, residualSize, commonBufVal, iTree);
11131219 }
11141220 if (prob != nullptr || res != nullptr )
@@ -1122,6 +1228,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
11221228
11231229 if (prob != nullptr )
11241230 {
1231+ // Normalize to get class probabilities
11251232 for (size_t i = begin; i < end; ++i)
11261233 {
11271234 algorithmFPType sum (0 );
@@ -1143,6 +1250,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
11431250
11441251 if (res != nullptr )
11451252 {
1253+ // Determine predicted classes as those with maximum votes/probabilities
11461254 for (size_t i = begin; i < end; ++i)
11471255 {
11481256 res[i] = algorithmFPType (getMaxClass (commonBufVal + i * _nClasses));
0 commit comments