Skip to content

Commit 5972247

Browse files
committed
perf(dataset): optimize to_binned() with SIMD unpacking and parallel conversion
Rewrote PackedDataset::to_binned() to leverage the SIMD 4-bit unpacking kernel efficiently: - Pre-allocate full output buffer once (eliminates per-feature allocations) - Use copy_from_slice() for U8 features (direct memcpy) - Use unpack_to_buffer() for packed features (SIMD-accelerated) - Parallel unpacking via Rayon when num_features >= 8 Added benchmark suite for conversion at 10k×20, 100k×20, 100k×100 scales to validate performance gains.
1 parent 1e71337 commit 5972247

4 files changed

Lines changed: 194 additions & 57 deletions

File tree

.gitignore

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,28 @@ Thumbs.db
8181
.Spotlight-V100
8282
.Trashes
8383
resources/*
84-
catboost_info
84+
catboost_info
85+
86+
87+
# Python
88+
__pycache__/
89+
*.py[cod]
90+
*$py.class
91+
*.so
92+
*.pyd
93+
.Python
94+
build/
95+
develop-eggs/
96+
dist/
97+
downloads/
98+
eggs/
99+
.eggs/
100+
lib/
101+
lib64/
102+
parts/
103+
sdist/
104+
var/
105+
wheels/
106+
*.egg-info/
107+
.installed.cfg
108+
*.egg

benches/profile.rs

Lines changed: 129 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ fn to_treeboost_dataset(
148148
fn benchmark_prediction_components(c: &mut Criterion) {
149149
let mut group = c.benchmark_group("PredictionProfile");
150150
group.measurement_time(Duration::from_secs(5));
151-
group.sample_size(100);
151+
group.sample_size(20);
152152

153153
let num_features = 10;
154154
let num_rounds = 50;
@@ -157,7 +157,8 @@ fn benchmark_prediction_components(c: &mut Criterion) {
157157

158158
// Train model
159159
let (train_features, train_targets) = generate_data(train_rows, num_features, 42);
160-
let train_data = to_treeboost_dataset(&train_features, &train_targets, train_rows, num_features);
160+
let train_data =
161+
to_treeboost_dataset(&train_features, &train_targets, train_rows, num_features);
161162
let config = GBDTConfig::new()
162163
.with_num_rounds(num_rounds)
163164
.with_max_depth(6)
@@ -235,7 +236,7 @@ fn benchmark_histogram_building(c: &mut Criterion) {
235236

236237
let mut group = c.benchmark_group("HistogramProfile");
237238
group.measurement_time(Duration::from_secs(5));
238-
group.sample_size(50);
239+
group.sample_size(20);
239240

240241
let num_rows = 100_000;
241242
let num_features = 10;
@@ -256,15 +257,11 @@ fn benchmark_histogram_building(c: &mut Criterion) {
256257
let builder = HistogramBuilder::new();
257258

258259
group.bench_function("histogram_contiguous_100k", |b| {
259-
b.iter(|| {
260-
black_box(builder.build(&dataset, &contiguous_rows, &gradients, &hessians))
261-
});
260+
b.iter(|| black_box(builder.build(&dataset, &contiguous_rows, &gradients, &hessians)));
262261
});
263262

264263
group.bench_function("histogram_sparse_50k", |b| {
265-
b.iter(|| {
266-
black_box(builder.build(&dataset, &sparse_rows, &gradients, &hessians))
267-
});
264+
b.iter(|| black_box(builder.build(&dataset, &sparse_rows, &gradients, &hessians)));
268265
});
269266

270267
// Single feature histogram - the innermost hot loop
@@ -361,10 +358,8 @@ fn benchmark_histogram_building(c: &mut Criterion) {
361358
for i in 0..remainder {
362359
let idx = *contiguous_rows.get_unchecked(base + i);
363360
let bin = *feature_col.get_unchecked(idx) as usize;
364-
bins.get_unchecked_mut(bin).accumulate(
365-
*gradients.get_unchecked(idx),
366-
*hessians.get_unchecked(idx),
367-
);
361+
bins.get_unchecked_mut(bin)
362+
.accumulate(*gradients.get_unchecked(idx), *hessians.get_unchecked(idx));
368363
}
369364
}
370365
black_box(hist)
@@ -373,49 +368,78 @@ fn benchmark_histogram_building(c: &mut Criterion) {
373368

374369
// Sparse data benchmarks (90% zeros - common in text/recommender systems)
375370
let (sparse_features, sparse_targets) = generate_sparse_data(num_rows, num_features, 0.9, 42);
376-
let sparse_dataset = to_treeboost_dataset(&sparse_features, &sparse_targets, num_rows, num_features);
371+
let sparse_dataset =
372+
to_treeboost_dataset(&sparse_features, &sparse_targets, num_rows, num_features);
377373
let sparse_gradients: Vec<f32> = (0..num_rows).map(|i| (i as f32) * 0.001).collect();
378374
let sparse_hessians: Vec<f32> = vec![1.0; num_rows];
379375

380376
// Report sparsity detected
381377
let num_sparse = sparse_dataset.num_sparse_features();
382-
eprintln!("Sparse dataset: {}/{} features detected as sparse", num_sparse, num_features);
378+
eprintln!(
379+
"Sparse dataset: {}/{} features detected as sparse",
380+
num_sparse, num_features
381+
);
383382

384383
group.bench_function("histogram_sparse_data_contiguous_100k", |b| {
385384
b.iter(|| {
386-
black_box(builder.build(&sparse_dataset, &contiguous_rows, &sparse_gradients, &sparse_hessians))
385+
black_box(builder.build(
386+
&sparse_dataset,
387+
&contiguous_rows,
388+
&sparse_gradients,
389+
&sparse_hessians,
390+
))
387391
});
388392
});
389393

390394
group.bench_function("histogram_sparse_data_indexed_50k", |b| {
391395
b.iter(|| {
392-
black_box(builder.build(&sparse_dataset, &sparse_rows, &sparse_gradients, &sparse_hessians))
396+
black_box(builder.build(
397+
&sparse_dataset,
398+
&sparse_rows,
399+
&sparse_gradients,
400+
&sparse_hessians,
401+
))
393402
});
394403
});
395404

396405
// 95% sparse (even more extreme - text data)
397-
let (very_sparse_features, very_sparse_targets) = generate_sparse_data(num_rows, num_features, 0.95, 42);
398-
let very_sparse_dataset = to_treeboost_dataset(&very_sparse_features, &very_sparse_targets, num_rows, num_features);
406+
let (very_sparse_features, very_sparse_targets) =
407+
generate_sparse_data(num_rows, num_features, 0.95, 42);
408+
let very_sparse_dataset = to_treeboost_dataset(
409+
&very_sparse_features,
410+
&very_sparse_targets,
411+
num_rows,
412+
num_features,
413+
);
399414
let num_very_sparse = very_sparse_dataset.num_sparse_features();
400-
eprintln!("Very sparse dataset: {}/{} features detected as sparse", num_very_sparse, num_features);
415+
eprintln!(
416+
"Very sparse dataset: {}/{} features detected as sparse",
417+
num_very_sparse, num_features
418+
);
401419

402420
group.bench_function("histogram_very_sparse_contiguous_100k", |b| {
403421
b.iter(|| {
404-
black_box(builder.build(&very_sparse_dataset, &contiguous_rows, &sparse_gradients, &sparse_hessians))
422+
black_box(builder.build(
423+
&very_sparse_dataset,
424+
&contiguous_rows,
425+
&sparse_gradients,
426+
&sparse_hessians,
427+
))
405428
});
406429
});
407430

408431
// Benchmark a single tree grow (the main per-round cost)
409432
group.bench_function("single_tree_grow_100k", |b| {
410-
use treeboost::tree::TreeGrower;
411433
use treeboost::loss::{LossFunction, MseLoss};
434+
use treeboost::tree::TreeGrower;
412435

413436
let loss = MseLoss::new();
414437
let predictions = vec![0.0f32; num_rows];
415438
let targets_f32: Vec<f32> = targets.iter().map(|&t| t as f32).collect();
416439

417440
// Pre-compute gradients/hessians
418-
let (grads, hess): (Vec<f32>, Vec<f32>) = targets_f32.iter()
441+
let (grads, hess): (Vec<f32>, Vec<f32>) = targets_f32
442+
.iter()
419443
.zip(&predictions)
420444
.map(|(&t, &p)| loss.gradient_hessian(t, p))
421445
.unzip();
@@ -428,9 +452,7 @@ fn benchmark_histogram_building(c: &mut Criterion) {
428452

429453
let row_indices: Vec<usize> = (0..num_rows).collect();
430454

431-
b.iter(|| {
432-
black_box(grower.grow_with_indices(&dataset, &grads, &hess, &row_indices))
433-
});
455+
b.iter(|| black_box(grower.grow_with_indices(&dataset, &grads, &hess, &row_indices)));
434456
});
435457

436458
group.finish();
@@ -647,14 +669,15 @@ fn benchmark_efb(c: &mut Criterion) {
647669

648670
let mut group = c.benchmark_group("EFBProfile");
649671
group.measurement_time(Duration::from_secs(5));
650-
group.sample_size(50);
672+
group.sample_size(20);
651673

652674
// Test case: 10 categories with 10 one-hot bins each = 100 features → should bundle to 10
653675
let num_rows = 100_000;
654676
let num_categories = 10;
655677
let bins_per_category = 10;
656678

657-
let (features, targets, feature_info) = generate_onehot_data(num_rows, num_categories, bins_per_category, 42);
679+
let (features, targets, feature_info) =
680+
generate_onehot_data(num_rows, num_categories, bins_per_category, 42);
658681
let total_features = num_categories * bins_per_category;
659682

660683
let dataset = BinnedDataset::new(num_rows, features, targets, feature_info);
@@ -678,9 +701,7 @@ fn benchmark_efb(c: &mut Criterion) {
678701
);
679702

680703
group.bench_function("bundled_dataset_creation", |b| {
681-
b.iter(|| {
682-
black_box(BundledDataset::from_dataset(&dataset, &bundling))
683-
});
704+
b.iter(|| black_box(BundledDataset::from_dataset(&dataset, &bundling)));
684705
});
685706

686707
let bundled = BundledDataset::from_dataset(&dataset, &bundling);
@@ -698,9 +719,7 @@ fn benchmark_efb(c: &mut Criterion) {
698719
let builder = HistogramBuilder::new();
699720

700721
group.bench_function("histogram_original_100_features", |b| {
701-
b.iter(|| {
702-
black_box(builder.build(&dataset, &row_indices, &gradients, &hessians))
703-
});
722+
b.iter(|| black_box(builder.build(&dataset, &row_indices, &gradients, &hessians)));
704723
});
705724

706725
// For bundled histogram, we build on each bundle column
@@ -744,22 +763,22 @@ fn benchmark_efb(c: &mut Criterion) {
744763
);
745764

746765
group.bench_function("histogram_original_500_features", |b| {
747-
b.iter(|| {
748-
black_box(builder.build(&dataset_medium, &row_indices, &gradients, &hessians))
749-
});
766+
b.iter(|| black_box(builder.build(&dataset_medium, &row_indices, &gradients, &hessians)));
750767
});
751768

752769
group.finish();
753770
}
754771

755772
fn benchmark_split_finding(c: &mut Criterion) {
756773
use treeboost::histogram::{Histogram, NodeHistograms};
774+
use treeboost::kernel::{
775+
find_best_split, find_best_split_scalar, find_best_split_simd, has_avx2,
776+
};
757777
use treeboost::tree::SplitFinder;
758-
use treeboost::kernel::{find_best_split_scalar, find_best_split_simd, find_best_split, has_avx2};
759778

760779
let mut group = c.benchmark_group("SplitFindingProfile");
761780
group.measurement_time(Duration::from_secs(5));
762-
group.sample_size(100);
781+
group.sample_size(20);
763782

764783
// Report SIMD availability
765784
eprintln!("AVX2 available: {}", has_avx2());
@@ -809,9 +828,7 @@ fn benchmark_split_finding(c: &mut Criterion) {
809828
group.bench_function("kernel_scalar_single_feature", |b| {
810829
b.iter(|| {
811830
black_box(find_best_split_scalar(
812-
&grads, &hess, &counts,
813-
total_g, total_h, total_n,
814-
1.0, 1, 1.0,
831+
&grads, &hess, &counts, total_g, total_h, total_n, 1.0, 1, 1.0,
815832
))
816833
});
817834
});
@@ -823,9 +840,7 @@ fn benchmark_split_finding(c: &mut Criterion) {
823840
b.iter(|| {
824841
black_box(unsafe {
825842
find_best_split_simd(
826-
&grads, &hess, &counts,
827-
total_g, total_h, total_n,
828-
1.0, 1, 1.0,
843+
&grads, &hess, &counts, total_g, total_h, total_n, 1.0, 1, 1.0,
829844
)
830845
})
831846
});
@@ -836,9 +851,7 @@ fn benchmark_split_finding(c: &mut Criterion) {
836851
group.bench_function("kernel_dispatched_single_feature", |b| {
837852
b.iter(|| {
838853
black_box(find_best_split(
839-
&grads, &hess, &counts,
840-
total_g, total_h, total_n,
841-
1.0, 1, 1.0,
854+
&grads, &hess, &counts, total_g, total_h, total_n, 1.0, 1, 1.0,
842855
))
843856
});
844857
});
@@ -880,13 +893,23 @@ fn benchmark_split_finding(c: &mut Criterion) {
880893

881894
group.bench_function("splitfinder_20_features_no_entropy", |b| {
882895
b.iter(|| {
883-
black_box(finder_no_entropy.find_best_split(&multi_histograms, total_g, total_h, total_n))
896+
black_box(finder_no_entropy.find_best_split(
897+
&multi_histograms,
898+
total_g,
899+
total_h,
900+
total_n,
901+
))
884902
});
885903
});
886904

887905
group.bench_function("splitfinder_20_features_with_entropy", |b| {
888906
b.iter(|| {
889-
black_box(finder_with_entropy.find_best_split(&multi_histograms, total_g, total_h, total_n))
907+
black_box(finder_with_entropy.find_best_split(
908+
&multi_histograms,
909+
total_g,
910+
total_h,
911+
total_n,
912+
))
890913
});
891914
});
892915

@@ -899,15 +922,20 @@ fn benchmark_split_finding(c: &mut Criterion) {
899922

900923
group.bench_function("splitfinder_100_features_no_entropy", |b| {
901924
b.iter(|| {
902-
black_box(finder_no_entropy.find_best_split(&wide_histograms, total_g, total_h, total_n))
925+
black_box(finder_no_entropy.find_best_split(
926+
&wide_histograms,
927+
total_g,
928+
total_h,
929+
total_n,
930+
))
903931
});
904932
});
905933

906934
group.finish();
907935
}
908936

909937
fn benchmark_4bit_unpacking(c: &mut Criterion) {
910-
use treeboost::dataset::packed::PackedColumn;
938+
use treeboost::dataset::packed::{PackedColumn, PackedDataset};
911939

912940
let mut group = c.benchmark_group("4bit_unpacking");
913941
group.measurement_time(Duration::from_secs(5));
@@ -956,7 +984,55 @@ fn benchmark_4bit_unpacking(c: &mut Criterion) {
956984
});
957985

958986
group.finish();
987+
988+
// Benchmark to_binned() conversion (PackedDataset -> BinnedDataset)
989+
let mut group = c.benchmark_group("packed_to_binned");
990+
group.measurement_time(Duration::from_secs(5));
991+
group.sample_size(20);
992+
993+
for (num_rows, num_features) in [(10_000, 20), (100_000, 20), (100_000, 100)] {
994+
// Create a BinnedDataset with low-cardinality features (packable)
995+
let mut features = Vec::with_capacity(num_rows * num_features);
996+
for _ in 0..num_features {
997+
for r in 0..num_rows {
998+
features.push((r % 16) as u8);
999+
}
1000+
}
1001+
let targets: Vec<f32> = (0..num_rows).map(|i| i as f32).collect();
1002+
let feature_info: Vec<FeatureInfo> = (0..num_features)
1003+
.map(|f| FeatureInfo {
1004+
name: format!("f{}", f),
1005+
feature_type: FeatureType::Categorical,
1006+
num_bins: 16,
1007+
bin_boundaries: vec![],
1008+
})
1009+
.collect();
1010+
1011+
let binned = BinnedDataset::new(num_rows, features, targets, feature_info);
1012+
let packed = PackedDataset::from_binned(&binned);
1013+
1014+
eprintln!(
1015+
"PackedDataset {num_rows}x{num_features}: {:.1}% memory savings",
1016+
packed.memory_savings() * 100.0
1017+
);
1018+
1019+
group.bench_function(format!("to_binned_{}x{}", num_rows, num_features), |b| {
1020+
b.iter(|| {
1021+
black_box(packed.to_binned());
1022+
});
1023+
});
1024+
}
1025+
1026+
group.finish();
9591027
}
9601028

961-
criterion_group!(benches, benchmark_prediction_components, benchmark_histogram_building, benchmark_training_components, benchmark_efb, benchmark_split_finding, benchmark_4bit_unpacking);
1029+
criterion_group!(
1030+
benches,
1031+
benchmark_prediction_components,
1032+
benchmark_histogram_building,
1033+
benchmark_training_components,
1034+
benchmark_efb,
1035+
benchmark_split_finding,
1036+
benchmark_4bit_unpacking
1037+
);
9621038
criterion_main!(benches);
-995 KB
Binary file not shown.

0 commit comments

Comments
 (0)