1 change: 0 additions & 1 deletion benchmarks/cpp/CMakeLists.txt
@@ -37,7 +37,6 @@ function(add_benchmark test_name test_src)
add_dependencies(benchmarks ${test_name})
endfunction()

add_benchmark(gptSessionBenchmark gptSessionBenchmark.cpp)
add_benchmark(bertBenchmark bertBenchmark.cpp)
add_benchmark(gptManagerBenchmark gptManagerBenchmark.cpp)
add_benchmark(disaggServerBenchmark disaggServerBenchmark.cpp)
27 changes: 0 additions & 27 deletions benchmarks/cpp/README.md
@@ -316,36 +316,9 @@ For detailed usage, you can do the following
cd cpp/build

# You can directly execute the binary for help information
./benchmarks/gptSessionBenchmark --help
./benchmarks/bertBenchmark --help
```

Take GPT-350M as an example for a single GPU:

```
./benchmarks/gptSessionBenchmark \
--engine_dir "../../benchmarks/gpt_350m/" \
--batch_size "1" \
--input_output_len "60,20"

# Expected output:
# [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 40.81
```
Take GPT-175B as an example for multiple GPUs:
```
mpirun -n 8 ./benchmarks/gptSessionBenchmark \
--engine_dir "../../benchmarks/gpt_175b/" \
--batch_size "1" \
--input_output_len "60,20"

# Expected output:
# [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 792.14
```

If you want to obtain context and generation logits, you can build an engine with `--gather_context_logits` and `--gather_generation_logits`, respectively. Enabling `--gather_all_token_logits` enables both of them.
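
A minimal sketch of such a build, assuming the `trtllm-build` entry point and an illustrative checkpoint directory (older TensorRT-LLM releases used per-model `build.py` scripts instead; check your version):

```
# Hedged sketch: the build entry point and checkpoint layout vary across
# TensorRT-LLM releases; the logits flags are the ones named above.
trtllm-build --checkpoint_dir ./gpt_350m_checkpoint \
             --output_dir ../../benchmarks/gpt_350m/ \
             --gather_context_logits \
             --gather_generation_logits
```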

If you want to get the logits, you can run gptSessionBenchmark with `--print_all_logits`. This prints a large number of logit values and can noticeably impact performance.
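
For instance, reusing the single-GPU command from above (engine path is illustrative):

```
./benchmarks/gptSessionBenchmark \
    --engine_dir "../../benchmarks/gpt_350m/" \
    --batch_size "1" \
    --input_output_len "60,20" \
    --print_all_logits
```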

*Please note that the expected outputs in this document are for reference only; actual performance numbers depend on the GPU you're using.*


22 changes: 4 additions & 18 deletions benchmarks/cpp/bertBenchmark.cpp
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/memoryUtils.h"

#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/rawEngine.h"
@@ -23,36 +23,22 @@
#include "tensorrt_llm/runtime/worldConfig.h"

#include <NvInfer.h>
#include <chrono>
#include <cxxopts.hpp>
#include <nlohmann/json.hpp>

#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <nlohmann/json.hpp>
#include <sstream>
#include <string>

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace trt = nvinfer1;

namespace
{
// follows https://github.com/NVIDIA/TensorRT/blob/release/8.6/samples/common/sampleEngines.cpp
std::vector<uint8_t> loadEngine(std::string const& enginePath)
{
    // Open the serialized engine file in binary mode; fail fast on a bad path.
    std::ifstream engineFile(enginePath, std::ios::binary);
    TLLM_CHECK_WITH_INFO(engineFile.good(), std::string("Error opening engine file: " + enginePath));
    // Seek to the end to determine the engine size, then rewind for reading.
    engineFile.seekg(0, std::ifstream::end);
    auto const size = engineFile.tellg();
    engineFile.seekg(0, std::ifstream::beg);

    // Read the entire engine into a byte buffer for later deserialization.
    std::vector<uint8_t> engineBlob(size);
    engineFile.read(reinterpret_cast<char*>(engineBlob.data()), size);
    TLLM_CHECK_WITH_INFO(engineFile.good(), std::string("Error loading engine file: " + enginePath));
    return engineBlob;
}

std::string engineFilename(
std::filesystem::path const& dataPath, WorldConfig const& worldConfig, std::string const& model)
1 change: 0 additions & 1 deletion benchmarks/cpp/disaggServerBenchmark.cpp
@@ -22,7 +22,6 @@
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/generationConfig.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"