Dividing xtensor by a scalar is x20-40 times slower than using std::transform

Dividing an `xtensor` container by an integer scalar (`vOutput = vInput / 2`) is much slower than a naive implementation with `std::transform`, whereas other operations such as multiplication (`vOutput = vInput * 2`) and element-wise maximum (`vOutput = xt::maximum(vInput1, vInput2)`) perform about the same as `std::transform`. We build with `xsimd` enabled.

Speed of `xtensor` relative to `std::transform` for each operation:
- `/2`: `xtensor` is 20-40x slower
- `/2.0`: `xtensor` is about 2x slower
- `*2`: `xtensor` is about 10% slower
- `max`: `xtensor` is the same speed

Benchmarks:
```cpp
// Baseline for integer division: divides every element of a uint16_t tensor by the
// int literal 2 using std::transform over the tensors' begin()/end() iterators.
// Both tensors are created and the input is filled once, outside the timed loop,
// so each iteration times only the std::transform call.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_DivideBy2_StdTransform(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills vInput with random values in [0, 100].
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        // `aInputValue / 2` is evaluated as int (integer promotion of uint16_t) and is
        // converted back to uint16_t when std::transform stores it into vOutput.
        std::transform(vInput.begin(), vInput.end(), vOutput.begin(), [](auto&& aInputValue) { return aInputValue / 2; });
    }
}

// xtensor counterpart of the integer-division baseline: assigns the expression
// `vInput / 2` to a pre-allocated uint16_t tensor. This is the case the issue
// reports as 20-40x slower than the std::transform version.
// Both tensors are created and the input is filled once, outside the timed loop.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_DivideBy2_Xtensor(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills vInput with random values in [0, 100].
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        // The divisor is an int literal while the elements are uint16_t.
        // NOTE(review): the expression's value type is presumably int, narrowed to
        // uint16_t on assignment — confirm against xtensor's type-promotion rules.
        vOutput = vInput / 2;
    }
}

// Baseline for floating-point division: divides every element of a uint16_t tensor
// by the double literal 2.0 using std::transform over the tensors' iterators.
// Both tensors are created and the input is filled once, outside the timed loop,
// so each iteration times only the std::transform call.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills vInput with random values in [0, 100].
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        // `aInputValue / 2.0` is evaluated as double; the result is converted
        // (fraction discarded) to uint16_t when std::transform stores it into vOutput.
        std::transform(vInput.begin(), vInput.end(), vOutput.begin(), [](auto&& aInputValue) { return aInputValue / 2.0; });
    }
}

// xtensor counterpart of the floating-point-division baseline: assigns the
// expression `vInput / 2.0` to a pre-allocated uint16_t tensor. The issue reports
// this as roughly 2x slower than the std::transform version.
// Both tensors are created and the input is filled once, outside the timed loop.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills vInput with random values in [0, 100].
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        // The divisor is a double literal while the elements are uint16_t.
        // NOTE(review): the expression's value type is presumably double, narrowed to
        // uint16_t on assignment — confirm against xtensor's type-promotion rules.
        vOutput = vInput / 2.0;
    }
}

// Baseline for integer multiplication: multiplies every element of a uint16_t
// tensor by the int literal 2 using std::transform over the tensors' iterators.
// Both tensors are created and the input is filled once, outside the timed loop,
// so each iteration times only the std::transform call.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills vInput with random values in [0, 100].
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        // `aInputValue * 2` is evaluated as int (integer promotion of uint16_t) and is
        // converted back to uint16_t when std::transform stores it into vOutput.
        std::transform(vInput.begin(), vInput.end(), vOutput.begin(), [](auto&& aInputValue) { return aInputValue * 2; });
    }
}

// xtensor counterpart of the integer-multiplication baseline: assigns the
// expression `vInput * 2` to a pre-allocated uint16_t tensor. The issue reports
// this as only about 10% slower than the std::transform version, in contrast to
// the integer-division case.
// Both tensors are created and the input is filled once, outside the timed loop.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills vInput with random values in [0, 100].
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        // Same operand types as the integer-division benchmark (uint16_t tensor, int scalar);
        // only the operator differs.
        vOutput = vInput * 2;
    }
}

// Baseline for element-wise maximum: writes max(vInput1[i], vInput2[i]) into vOutput
// using the single-range std::transform plus a lambda that walks vInput2 itself.
// All three tensors are created and both inputs are filled once, outside the timed loop.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_Maximum_StdTransform(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput1 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vInput2 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills each input with random values in [0, 100].
    generateRandomInt16From0To100(vInput1);
    generateRandomInt16From0To100(vInput2);

    for (auto _ : aState) {
        // Restart the second input's iterator for every benchmark iteration.
        auto vInput2It = vInput2.begin();
        // The lambda captures vInput2It by reference and post-increments it on each call,
        // so elements are paired correctly only if the lambda is invoked in iteration order.
        // NOTE(review): std::transform does not formally guarantee in-order application;
        // the two-input-range overload of std::transform would avoid relying on it.
        std::transform(vInput1.begin(), vInput1.end(), vOutput.begin(), [&vInput2It](auto&& aInput1Value) { return std::max(aInput1Value, *vInput2It++); });
    }
}

// xtensor counterpart of the element-wise-maximum baseline: assigns
// `xt::maximum(vInput1, vInput2)` to a pre-allocated uint16_t tensor. The issue
// reports this as the same speed as the std::transform version.
// All three tensors are created and both inputs are filled once, outside the timed loop.
// NOTE(review): vOutput is never passed to benchmark::DoNotOptimize / ClobberMemory;
// confirm the compiler is not eliding or hoisting work across iterations.
static void Xtensor_Uint16_2000x2000_Maximum_Xtensor(benchmark::State& aState) {
    // cContainerAssignShape is defined outside this snippet (presumably 2000x2000, per the benchmark name).
    auto vInput1 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vInput2 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    // Helper not shown here; presumably fills each input with random values in [0, 100].
    generateRandomInt16From0To100(vInput1);
    generateRandomInt16From0To100(vInput2);

    for (auto _ : aState) {
        // Both operands are uint16_t tensors, so no scalar of a different type is involved here.
        vOutput = xt::maximum(vInput1, vInput2);
    }
}
```

Results on ubuntu:
```
---------------------------------------------------------------------------------------------------------------
Benchmark                                                                     Time             CPU   Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform                                       114483 ns       114483 ns         6016
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor                                           4295418 ns      4295440 ns          165
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform                                 442543 ns       442541 ns         1596
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor                                      821435 ns       821429 ns          837
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform                                     115849 ns       115845 ns         5901
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor                                          131595 ns       131594 ns         5328
Xtensor_Uint16_2000x2000_Maximum_StdTransform                                         204465 ns       203952 ns         3156
Xtensor_Uint16_2000x2000_Maximum_Xtensor                                              198696 ns       198692 ns         3466
```

Results on windows:
```
---------------------------------------------------------------------------------------------------------------
Benchmark                                                                     Time             CPU   Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform                          764377 ns       767299 ns         1120
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor                             14637306 ns     14687500 ns           50
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform                   2954759 ns      2966054 ns          289
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor                        5484534 ns      5503462 ns          451
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform                        759787 ns       767299 ns          896
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor                             888914 ns       889369 ns          896
Xtensor_Uint16_2000x2000_Maximum_StdTransform                            993174 ns       976562 ns          640
Xtensor_Uint16_2000x2000_Maximum_Xtensor                                 985171 ns      1000977 ns          640
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Dividing xtensor by a scalar is x20-40 times slower than using std::transform #2849

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Dividing xtensor by a scalar is x20-40 times slower than using std::transform #2849

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions