Skip to content

Dividing xtensor by a scalar is x20-40 times slower than using std::transform #2849

@vakokako

Description

@vakokako

Dividing an xtensor container by an integer scalar (vOutput = vInput / 2) is dramatically slower than a naive implementation with std::transform, whereas other operations such as multiplication (vOutput = vInput * 2) and element-wise maximum (vOutput = xt::maximum(vInput1, vInput2)) perform about the same as std::transform. We build with xsimd enabled.

Comparing speed of xtensor vs std::transform for different operations:

  • /2: xtensor is x20-40 slower
  • /2.0: xtensor is x2 slower
  • *2: xtensor is 10% slower
  • max: xtensor is same speed

Benchmarks:

static void Xtensor_Uint16_2000x2000_DivideBy2_StdTransform(benchmark::State& aState) {
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        std::transform(vInput.begin(), vInput.end(), vOutput.begin(), [](auto&& aInputValue) { return aInputValue / 2; });
    }
}

static void Xtensor_Uint16_2000x2000_DivideBy2_Xtensor(benchmark::State& aState) {
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        vOutput = vInput / 2;
    }
}

static void Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform(benchmark::State& aState) {
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        std::transform(vInput.begin(), vInput.end(), vOutput.begin(), [](auto&& aInputValue) { return aInputValue / 2.0; });
    }
}

static void Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor(benchmark::State& aState) {
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        vOutput = vInput / 2.0;
    }
}

static void Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform(benchmark::State& aState) {
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        std::transform(vInput.begin(), vInput.end(), vOutput.begin(), [](auto&& aInputValue) { return aInputValue * 2; });
    }
}

static void Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor(benchmark::State& aState) {
    auto vInput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput);

    for (auto _ : aState) {
        vOutput = vInput * 2;
    }
}

static void Xtensor_Uint16_2000x2000_Maximum_StdTransform(benchmark::State& aState) {
    auto vInput1 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vInput2 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput1);
    generateRandomInt16From0To100(vInput2);

    for (auto _ : aState) {
        auto vInput2It = vInput2.begin();
        std::transform(vInput1.begin(), vInput1.end(), vOutput.begin(), [&vInput2It](auto&& aInput1Value) { return std::max(aInput1Value, *vInput2It++); });
    }
}

static void Xtensor_Uint16_2000x2000_Maximum_Xtensor(benchmark::State& aState) {
    auto vInput1 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vInput2 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput1);
    generateRandomInt16From0To100(vInput2);

    for (auto _ : aState) {
        vOutput = xt::maximum(vInput1, vInput2);
    }
}

Results on ubuntu:

---------------------------------------------------------------------------------------------------------------
Benchmark                                                                     Time             CPU   Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform                                       114483 ns       114483 ns         6016
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor                                           4295418 ns      4295440 ns          165
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform                                 442543 ns       442541 ns         1596
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor                                      821435 ns       821429 ns          837
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform                                     115849 ns       115845 ns         5901
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor                                          131595 ns       131594 ns         5328
Xtensor_Uint16_2000x2000_Maximum_StdTransform                                         204465 ns       203952 ns         3156
Xtensor_Uint16_2000x2000_Maximum_Xtensor                                              198696 ns       198692 ns         3466

Results on windows:

---------------------------------------------------------------------------------------------------------------
Benchmark                                                                     Time             CPU   Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform                          764377 ns       767299 ns         1120
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor                             14637306 ns     14687500 ns           50
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform                   2954759 ns      2966054 ns          289
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor                        5484534 ns      5503462 ns          451
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform                        759787 ns       767299 ns          896
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor                             888914 ns       889369 ns          896
Xtensor_Uint16_2000x2000_Maximum_StdTransform                            993174 ns       976562 ns          640
Xtensor_Uint16_2000x2000_Maximum_Xtensor                                 985171 ns      1000977 ns          640

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions