1 change: 0 additions & 1 deletion benchmarks/cpp/CMakeLists.txt
@@ -37,7 +37,6 @@ function(add_benchmark test_name test_src)
add_dependencies(benchmarks ${test_name})
endfunction()

add_benchmark(gptSessionBenchmark gptSessionBenchmark.cpp)
add_benchmark(bertBenchmark bertBenchmark.cpp)
add_benchmark(gptManagerBenchmark gptManagerBenchmark.cpp)
add_benchmark(disaggServerBenchmark disaggServerBenchmark.cpp)
27 changes: 0 additions & 27 deletions benchmarks/cpp/README.md
@@ -316,36 +316,9 @@ For detailed usage, you can do the following
cd cpp/build

# You can directly execute the binary for help information
./benchmarks/gptSessionBenchmark --help
./benchmarks/bertBenchmark --help
```

Take GPT-350M as an example for a single GPU:

```
./benchmarks/gptSessionBenchmark \
--engine_dir "../../benchmarks/gpt_350m/" \
--batch_size "1" \
--input_output_len "60,20"

# Expected output:
# [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 40.81
```
Take GPT-175B as an example for multiple GPUs:
```
mpirun -n 8 ./benchmarks/gptSessionBenchmark \
--engine_dir "../../benchmarks/gpt_175b/" \
--batch_size "1" \
--input_output_len "60,20"

# Expected output:
# [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 792.14
```

If you want to obtain context and generation logits, you can build an engine with `--gather_context_logits` and `--gather_generation_logits`, respectively. Enabling `--gather_all_token_logits` enables both of them.
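
A minimal sketch of such a build, assuming the `trtllm-build` entry point and an illustrative checkpoint directory (older TensorRT-LLM releases used per-model `build.py` scripts instead; check your version):

```
# Hedged sketch: the build entry point and checkpoint layout vary across
# TensorRT-LLM releases; the logits flags are the ones named above.
trtllm-build --checkpoint_dir ./gpt_350m_checkpoint \
             --output_dir ../../benchmarks/gpt_350m/ \
             --gather_context_logits \
             --gather_generation_logits
```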

If you want to get the logits, you can run gptSessionBenchmark with `--print_all_logits`. This prints a large number of logit values and can noticeably impact performance.
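
For instance, reusing the single-GPU command from above (engine path is illustrative):

```
./benchmarks/gptSessionBenchmark \
    --engine_dir "../../benchmarks/gpt_350m/" \
    --batch_size "1" \
    --input_output_len "60,20" \
    --print_all_logits
```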

*Please note that the expected outputs in this document are for reference only; actual performance numbers depend on the GPU you're using.*


22 changes: 4 additions & 18 deletions benchmarks/cpp/bertBenchmark.cpp
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/memoryUtils.h"

#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/rawEngine.h"
@@ -23,36 +23,22 @@
#include "tensorrt_llm/runtime/worldConfig.h"

#include <NvInfer.h>
#include <chrono>
#include <cxxopts.hpp>
#include <nlohmann/json.hpp>

#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <nlohmann/json.hpp>
#include <sstream>
#include <string>

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace trt = nvinfer1;

namespace
{
// follows https://github.com/NVIDIA/TensorRT/blob/release/8.6/samples/common/sampleEngines.cpp
std::vector<uint8_t> loadEngine(std::string const& enginePath)
{
    // Open the serialized engine file in binary mode; fail fast on a bad path.
    std::ifstream engineFile(enginePath, std::ios::binary);
    TLLM_CHECK_WITH_INFO(engineFile.good(), std::string("Error opening engine file: " + enginePath));
    // Seek to the end to determine the engine size, then rewind for reading.
    engineFile.seekg(0, std::ifstream::end);
    auto const size = engineFile.tellg();
    engineFile.seekg(0, std::ifstream::beg);

    // Read the entire engine into a byte buffer for later deserialization.
    std::vector<uint8_t> engineBlob(size);
    engineFile.read(reinterpret_cast<char*>(engineBlob.data()), size);
    TLLM_CHECK_WITH_INFO(engineFile.good(), std::string("Error loading engine file: " + enginePath));
    return engineBlob;
}

std::string engineFilename(
std::filesystem::path const& dataPath, WorldConfig const& worldConfig, std::string const& model)
1 change: 0 additions & 1 deletion benchmarks/cpp/disaggServerBenchmark.cpp
@@ -22,7 +22,6 @@
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/generationConfig.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"