-
Notifications
You must be signed in to change notification settings - Fork 421
Open
Description
Dividing an `xtensor` container by an integer scalar (`vOutput = vInput / 2`) gives much lower performance than a naive implementation with `std::transform`, while other operations such as multiplication (`vOutput = vInput * 2`) and maximum (`vOutput = xt::maximum(vInput1, vInput2)`) perform similarly to `std::transform`. We build with `xsimd` enabled.
Comparing the speed of `xtensor` vs `std::transform` for different operations:
- `/2`: `xtensor` is 20-40x slower
- `/2.0`: `xtensor` is 2x slower
- `*2`: `xtensor` is 10% slower
- `max`: `xtensor` is the same speed
Benchmarks:
// Baseline for the integer-division benchmark: divides every element of a
// uint16_t tensor by the int literal 2 using std::transform over the tensor's
// iterators, writing into a second tensor of the same shape.
// NOTE(review): the fill helper presumably produces random values in [0, 100]
// and the shape is presumably 2000x2000 (per the names) - neither is visible here.
static void Xtensor_Uint16_2000x2000_DivideBy2_StdTransform(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    // Stateless, so building it once outside the timed loop changes nothing.
    const auto vHalve = [](auto&& aInputValue) { return aInputValue / 2; };

    for (auto _ : aState) {
        std::transform(vSource.begin(), vSource.end(), vResult.begin(), vHalve);
    }
}
// Integer-division benchmark using the xtensor expression `tensor / 2`,
// assigned into a pre-allocated uint16_t tensor of the same shape on every
// timed iteration.
// NOTE(review): the fill helper presumably produces random values in [0, 100] -
// its definition is not visible here.
static void Xtensor_Uint16_2000x2000_DivideBy2_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    for (auto _ : aState) {
        vResult = vSource / 2;
    }
}
// Baseline for the floating-point-division benchmark: divides every element
// of a uint16_t tensor by the double literal 2.0 using std::transform. The
// lambda yields a double, which is converted back to uint16_t when it is
// stored into the output tensor.
// NOTE(review): the fill helper presumably produces random values in [0, 100] -
// its definition is not visible here.
static void Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    // Stateless, so building it once outside the timed loop changes nothing.
    const auto vHalveAsDouble = [](auto&& aInputValue) { return aInputValue / 2.0; };

    for (auto _ : aState) {
        std::transform(vSource.begin(), vSource.end(), vResult.begin(), vHalveAsDouble);
    }
}
// Floating-point-division benchmark using the xtensor expression
// `tensor / 2.0`, assigned into a pre-allocated uint16_t tensor of the same
// shape on every timed iteration.
// NOTE(review): the fill helper presumably produces random values in [0, 100] -
// its definition is not visible here.
static void Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    for (auto _ : aState) {
        vResult = vSource / 2.0;
    }
}
// Baseline for the multiplication benchmark: multiplies every element of a
// uint16_t tensor by the int literal 2 using std::transform over the tensor's
// iterators, writing into a second tensor of the same shape.
// NOTE(review): the fill helper presumably produces random values in [0, 100] -
// its definition is not visible here.
static void Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    // Stateless, so building it once outside the timed loop changes nothing.
    const auto vDouble = [](auto&& aInputValue) { return aInputValue * 2; };

    for (auto _ : aState) {
        std::transform(vSource.begin(), vSource.end(), vResult.begin(), vDouble);
    }
}
// Multiplication benchmark using the xtensor expression `tensor * 2`,
// assigned into a pre-allocated uint16_t tensor of the same shape on every
// timed iteration.
// NOTE(review): the fill helper presumably produces random values in [0, 100] -
// its definition is not visible here.
static void Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    for (auto _ : aState) {
        vResult = vSource * 2;
    }
}
// Baseline for the element-wise maximum benchmark: computes
// max(vInput1[i], vInput2[i]) for every element of two uint16_t tensors using
// std::transform, writing into a third tensor of the same shape.
//
// Uses the two-input-range overload of std::transform rather than a unary
// lambda that advances a captured iterator: std::transform does not guarantee
// in-order application of the functor, so a stateful lambda has no guaranteed
// pairing of elements. It also avoids copying an iterator on each
// post-increment.
// NOTE(review): the fill helper presumably produces random values in [0, 100] -
// its definition is not visible here.
static void Xtensor_Uint16_2000x2000_Maximum_StdTransform(benchmark::State& aState) {
    auto vInput1 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vInput2 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput1);
    generateRandomInt16From0To100(vInput2);
    for (auto _ : aState) {
        std::transform(vInput1.begin(), vInput1.end(), vInput2.begin(), vOutput.begin(),
                       [](const auto& aInput1Value, const auto& aInput2Value) {
                           return std::max(aInput1Value, aInput2Value);
                       });
    }
}
// Element-wise maximum benchmark using xt::maximum on two uint16_t tensors,
// assigned into a pre-allocated tensor of the same shape on every timed
// iteration.
// NOTE(review): the fill helper presumably produces random values in [0, 100] -
// its definition is not visible here.
static void Xtensor_Uint16_2000x2000_Maximum_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vLhs = Tensor::from_shape(cContainerAssignShape);
    Tensor vRhs = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vLhs);
    generateRandomInt16From0To100(vRhs);

    for (auto _ : aState) {
        vResult = xt::maximum(vLhs, vRhs);
    }
}
Results on Ubuntu:
---------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform 114483 ns 114483 ns 6016
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor 4295418 ns 4295440 ns 165
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform 442543 ns 442541 ns 1596
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor 821435 ns 821429 ns 837
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform 115849 ns 115845 ns 5901
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor 131595 ns 131594 ns 5328
Xtensor_Uint16_2000x2000_Maximum_StdTransform 204465 ns 203952 ns 3156
Xtensor_Uint16_2000x2000_Maximum_Xtensor 198696 ns 198692 ns 3466
Results on Windows:
---------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform 764377 ns 767299 ns 1120
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor 14637306 ns 14687500 ns 50
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform 2954759 ns 2966054 ns 289
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor 5484534 ns 5503462 ns 451
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform 759787 ns 767299 ns 896
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor 888914 ns 889369 ns 896
Xtensor_Uint16_2000x2000_Maximum_StdTransform 993174 ns 976562 ns 640
Xtensor_Uint16_2000x2000_Maximum_Xtensor 985171 ns 1000977 ns 640
emmenlauemmenlau
Metadata
Metadata
Assignees
Labels
No labels