179 lines
4.8 KiB
Cython
179 lines
4.8 KiB
Cython
# Authors: The scikit-learn developers
|
|
# SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
# See _partitioner.pyx for details.
|
|
|
|
from ..utils._typedefs cimport (
|
|
float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
|
|
)
|
|
from ._splitter cimport SplitRecord
|
|
|
|
|
|
# Mitigate precision differences between 32 bit and 64 bit
|
|
cdef float32_t FEATURE_THRESHOLD = 1e-7
|
|
|
|
|
|
# We provide here the abstract interface for a Partitioner that would be
|
|
# theoretically shared between the Dense and Sparse partitioners. However,
|
|
# we leave it commented out for now as it is not used in the current
|
|
# implementation due to the performance hit from vtable lookups when using
|
|
# inheritance based polymorphism. It is left here for future reference.
|
|
#
|
|
# Note: Instead, in `_splitter.pyx`, we define a fused type that can be used
|
|
# to represent both the dense and sparse partitioners.
|
|
#
|
|
# cdef class BasePartitioner:
|
|
# cdef intp_t[::1] samples
|
|
# cdef float32_t[::1] feature_values
|
|
# cdef intp_t start
|
|
# cdef intp_t end
|
|
# cdef intp_t n_missing
|
|
# cdef const uint8_t[::1] missing_values_in_feature_mask
|
|
|
|
# cdef void sort_samples_and_feature_values(
|
|
# self, intp_t current_feature
|
|
# ) noexcept nogil
|
|
# cdef void init_node_split(
|
|
# self,
|
|
# intp_t start,
|
|
# intp_t end
|
|
# ) noexcept nogil
|
|
# cdef void find_min_max(
|
|
# self,
|
|
# intp_t current_feature,
|
|
# float32_t* min_feature_value_out,
|
|
# float32_t* max_feature_value_out,
|
|
# ) noexcept nogil
|
|
# cdef void next_p(
|
|
# self,
|
|
# intp_t* p_prev,
|
|
# intp_t* p
|
|
# ) noexcept nogil
|
|
# cdef intp_t partition_samples(
|
|
# self,
|
|
# float64_t current_threshold
|
|
# ) noexcept nogil
|
|
# cdef void partition_samples_final(
|
|
# self,
|
|
# intp_t best_pos,
|
|
# float64_t best_threshold,
|
|
# intp_t best_feature,
|
|
# intp_t n_missing,
|
|
# ) noexcept nogil
|
|
|
|
|
|
cdef class DensePartitioner:
|
|
"""Partitioner specialized for dense data.
|
|
|
|
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
|
|
"""
|
|
cdef const float32_t[:, :] X
|
|
cdef intp_t[::1] samples
|
|
cdef float32_t[::1] feature_values
|
|
cdef intp_t start
|
|
cdef intp_t end
|
|
cdef intp_t n_missing
|
|
cdef const uint8_t[::1] missing_values_in_feature_mask
|
|
|
|
cdef void sort_samples_and_feature_values(
|
|
self, intp_t current_feature
|
|
) noexcept nogil
|
|
cdef void init_node_split(
|
|
self,
|
|
intp_t start,
|
|
intp_t end
|
|
) noexcept nogil
|
|
cdef void find_min_max(
|
|
self,
|
|
intp_t current_feature,
|
|
float32_t* min_feature_value_out,
|
|
float32_t* max_feature_value_out,
|
|
) noexcept nogil
|
|
cdef void next_p(
|
|
self,
|
|
intp_t* p_prev,
|
|
intp_t* p
|
|
) noexcept nogil
|
|
cdef intp_t partition_samples(
|
|
self,
|
|
float64_t current_threshold
|
|
) noexcept nogil
|
|
cdef void partition_samples_final(
|
|
self,
|
|
intp_t best_pos,
|
|
float64_t best_threshold,
|
|
intp_t best_feature,
|
|
intp_t n_missing,
|
|
) noexcept nogil
|
|
|
|
|
|
cdef class SparsePartitioner:
|
|
"""Partitioner specialized for sparse CSC data.
|
|
|
|
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
|
|
"""
|
|
cdef const float32_t[::1] X_data
|
|
cdef const int32_t[::1] X_indices
|
|
cdef const int32_t[::1] X_indptr
|
|
cdef intp_t n_total_samples
|
|
cdef intp_t[::1] index_to_samples
|
|
cdef intp_t[::1] sorted_samples
|
|
cdef intp_t start_positive
|
|
cdef intp_t end_negative
|
|
cdef bint is_samples_sorted
|
|
|
|
cdef intp_t[::1] samples
|
|
cdef float32_t[::1] feature_values
|
|
cdef intp_t start
|
|
cdef intp_t end
|
|
cdef intp_t n_missing
|
|
cdef const uint8_t[::1] missing_values_in_feature_mask
|
|
|
|
cdef void sort_samples_and_feature_values(
|
|
self, intp_t current_feature
|
|
) noexcept nogil
|
|
cdef void init_node_split(
|
|
self,
|
|
intp_t start,
|
|
intp_t end
|
|
) noexcept nogil
|
|
cdef void find_min_max(
|
|
self,
|
|
intp_t current_feature,
|
|
float32_t* min_feature_value_out,
|
|
float32_t* max_feature_value_out,
|
|
) noexcept nogil
|
|
cdef void next_p(
|
|
self,
|
|
intp_t* p_prev,
|
|
intp_t* p
|
|
) noexcept nogil
|
|
cdef intp_t partition_samples(
|
|
self,
|
|
float64_t current_threshold
|
|
) noexcept nogil
|
|
cdef void partition_samples_final(
|
|
self,
|
|
intp_t best_pos,
|
|
float64_t best_threshold,
|
|
intp_t best_feature,
|
|
intp_t n_missing,
|
|
) noexcept nogil
|
|
|
|
cdef void extract_nnz(
|
|
self,
|
|
intp_t feature
|
|
) noexcept nogil
|
|
cdef intp_t _partition(
|
|
self,
|
|
float64_t threshold,
|
|
intp_t zero_pos
|
|
) noexcept nogil
|
|
|
|
|
|
cdef void shift_missing_values_to_left_if_required(
|
|
SplitRecord* best,
|
|
intp_t[::1] samples,
|
|
intp_t end,
|
|
) noexcept nogil
|