Commit d9b5deb

Merge branch 'main' into enable-ngram
Signed-off-by: Mike Iovine <miovine@nvidia.com>
2 parents 3964da4 + baece56

64 files changed, +763 -285 lines


.coderabbit.yaml

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ reviews:
   related_prs: true
   suggested_labels: true
   suggested_reviewers: true
-  auto_assign_reviewers: true
   poem: false
   auto_review:
     drafts: true

3rdparty/xgrammar

Submodule xgrammar updated 173 files

cpp/tensorrt_llm/batch_manager/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ set(SRCS

 file(GLOB_RECURSE XGRAMMAR_SRCS "${3RDPARTY_DIR}/xgrammar/cpp/*.cc")
 list(FILTER XGRAMMAR_SRCS EXCLUDE REGEX
-     "${3RDPARTY_DIR}/xgrammar/cpp/pybind/.*\\.cc")
+     "${3RDPARTY_DIR}/xgrammar/cpp/nanobind/.*\\.cc")
 list(APPEND SRCS ${XGRAMMAR_SRCS})

 if(NOT WIN32)

cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp

Lines changed: 48 additions & 14 deletions
@@ -18,8 +18,10 @@
 #include "tensorrt_llm/batch_manager/guidedDecoder.h"
 #include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/kernels/logitsBitmask.h"

+#include <nlohmann/json.hpp>
 #include <xgrammar/xgrammar.h>

 using namespace tensorrt_llm::runtime;

@@ -41,20 +43,23 @@ GuidedDecoder::GuidedDecoder(executor::GuidedDecodingConfig const& guidedDecodin
     if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
     {
         mXGrammarMatchers.resize(mMaxNumSequences);
+        xgrammar::VocabType vocabType = xgrammar::VocabType::RAW;
+        bool addPrefixSpace = false;
         auto const& tokenizerStr = guidedDecodingConfig.getTokenizerStr();
         if (tokenizerStr)
         {
-            auto const& tokenizerInfo = xgrammar::TokenizerInfo::FromHuggingFace(
-                guidedDecodingConfig.getEncodedVocab().value(), guidedDecodingConfig.getTokenizerStr().value(),
-                mVocabSizePadded, guidedDecodingConfig.getStopTokenIds());
-            mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo);
-        }
-        else
-        {
-            auto const& tokenizerInfo = xgrammar::TokenizerInfo(guidedDecodingConfig.getEncodedVocab().value(),
-                xgrammar::VocabType::RAW, mVocabSizePadded, guidedDecodingConfig.getStopTokenIds());
-            mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo);
+            auto const& metadata = xgrammar::TokenizerInfo::DetectMetadataFromHF(tokenizerStr.value());
+            auto const& metadataJson = nlohmann::json::parse(metadata);
+            vocabType = metadataJson.at("vocab_type").template get<xgrammar::VocabType>();
+            addPrefixSpace = metadataJson.at("add_prefix_space").template get<bool>();
         }
+        auto const& tokenizerInfo = xgrammar::TokenizerInfo(guidedDecodingConfig.getEncodedVocab().value(), vocabType,
+            mVocabSizePadded, guidedDecodingConfig.getStopTokenIds(), addPrefixSpace);
+
+        auto const cacheLimitGb = common::getFloatEnv("XGRAMMAR_CACHE_LIMIT_GB");
+        mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo, /*max_threads=*/8,
+            /*cache_enabled=*/true,
+            /*cache_limit_bytes=*/static_cast<long long>(cacheLimitGb.value_or(1.0f) * 1024 * 1024 * 1024));

         auto const logitsPtrDtype = BufferDataType{mLogitsDtype, false, true};
         auto constexpr bitmaskDtype = TRTDataType<BitmaskT>::value;

@@ -89,27 +94,56 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests)
         // The request is in the first context forward step (considering kv cache reuse).
         auto const& guideType = guidedDecodingParams->getGuideType();
         auto const& guide = guidedDecodingParams->getGuide();
-        if (guideType == executor::GuidedDecodingParams::GuideType::kJSON)
+        switch (guideType)
+        {
+        case executor::GuidedDecodingParams::GuideType::kJSON:
         {
             mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
                 mXGrammarCompiler->CompileBuiltinJSONGrammar());
+            break;
         }
-        else if (guideType == executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA)
+        case executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA:
         {
             mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
                 mXGrammarCompiler->CompileJSONSchema(guide.value()));
+            break;
         }
-        else if (guideType == executor::GuidedDecodingParams::GuideType::kREGEX)
+        case executor::GuidedDecodingParams::GuideType::kREGEX:
         {
             auto const& grammar = xgrammar::Grammar::FromRegex(guide.value());
             mXGrammarMatchers.at(seqSlot)
                 = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            break;
         }
-        else if (guideType == executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR)
+        case executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR:
         {
             auto const& grammar = xgrammar::Grammar::FromEBNF(guide.value());
             mXGrammarMatchers.at(seqSlot)
                 = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            break;
+        }
+        case executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG:
+        {
+            auto const& structuralTagParametersJson = nlohmann::json::parse(guide.value());
+            auto const& structuralTagItemsJson
+                = structuralTagParametersJson.at("structures").template get<std::vector<nlohmann::json>>();
+            std::vector<xgrammar::StructuralTagItem> structuralTagItems;
+            for (auto const& s : structuralTagItemsJson)
+            {
+                structuralTagItems.emplace_back(
+                    xgrammar::StructuralTagItem{s.at("begin").template get<std::string>(),
+                        s.at("schema").dump(), s.at("end").template get<std::string>()});
+            }
+            auto const& triggers
+                = structuralTagParametersJson.at("triggers").template get<std::vector<std::string>>();
+            mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
+                mXGrammarCompiler->CompileStructuralTag(structuralTagItems, triggers));
+            break;
+        }
+        default:
+        {
+            TLLM_THROW("Unsupported guide type.");
+        }
         }
     }
     else if (llmReq->isGenerationInProgressState())
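
Note: the new kSTRUCTURAL_TAG branch above reads the guide string as a JSON object with a "structures" array (each entry carrying "begin", "schema", and "end") and a "triggers" array of strings; nothing else is consumed. A minimal sketch of such a payload follows, embedded in a C++ raw string; the tag names and schema contents are hypothetical and only the structures/triggers layout is taken from the parsing code in this commit.

// Hypothetical structural-tag guide payload for GuideType::kSTRUCTURAL_TAG.
// Only the "structures"/"triggers" shape mirrors the parsing code above;
// the concrete begin/end tags and schema are illustrative.
#include <string>

std::string const structuralTagGuide = R"({
  "structures": [
    {
      "begin": "<function=get_weather>",
      "schema": {"type": "object", "properties": {"city": {"type": "string"}}},
      "end": "</function>"
    }
  ],
  "triggers": ["<function="]
})";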

cpp/tensorrt_llm/common/envUtils.cpp

Lines changed: 11 additions & 0 deletions
@@ -50,6 +50,17 @@ std::optional<size_t> getUInt64Env(char const* name)
     return {val};
 };

+std::optional<float> getFloatEnv(char const* name)
+{
+    char const* const env = std::getenv(name);
+    if (env == nullptr)
+    {
+        return std::nullopt;
+    }
+    float const val = std::stof(env);
+    return {val};
+}
+
 std::optional<std::string> getStrEnv(char const* name)
 {
     char const* const env = std::getenv(name);
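
Taken together with the GuidedDecoder change above, this helper lets XGRAMMAR_CACHE_LIMIT_GB (in gigabytes) size the xgrammar compiler cache, falling back to 1 GB when the variable is unset. Below is a small self-contained sketch of that conversion, assuming only what the two diffs show; the xgrammarCacheLimitBytes wrapper is a hypothetical name for illustration, and std::stof will throw if the variable holds a non-numeric value.

// Sketch: convert the optional XGRAMMAR_CACHE_LIMIT_GB environment variable
// to a byte count, defaulting to 1 GB (1024^3 bytes) as in GuidedDecoder.
#include <cstdlib>
#include <optional>
#include <string>

std::optional<float> getFloatEnv(char const* name)
{
    char const* const env = std::getenv(name);
    if (env == nullptr)
    {
        return std::nullopt;
    }
    return std::stof(env);
}

long long xgrammarCacheLimitBytes()
{
    auto const cacheLimitGb = getFloatEnv("XGRAMMAR_CACHE_LIMIT_GB");
    return static_cast<long long>(cacheLimitGb.value_or(1.0f) * 1024 * 1024 * 1024);
}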

cpp/tensorrt_llm/common/envUtils.h

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ std::optional<int32_t> getIntEnv(char const* name);

 std::optional<size_t> getUInt64Env(char const* name);

+std::optional<float> getFloatEnv(char const* name);
+
 bool getBoolEnv(char const* name);

 // XQA kernels (optimized kernels for generation phase).

cpp/tensorrt_llm/executor/executorImpl.cpp

Lines changed: 0 additions & 3 deletions
@@ -1621,9 +1621,6 @@ std::tuple<Executor::Impl::RequestList, double> Executor::Impl::fetchNewRequests
     TLLM_CHECK_WITH_INFO(mModel->hasGuidedDecoder(),
         "Request is specified with GuidedDecodingParams, but GuidedDecoder is not setup. Please "
         "provide a valid GuidedDecodingConfig to setup GuidedDecoder.");
-    TLLM_CHECK_WITH_INFO(newReq->getGuidedDecodingParams()->getGuideType()
-            != executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG,
-        "Structural tag is not supported for guided decoding in C++ Executor.");
 }

 if (mModel->getWorldConfig().isLastPipelineParallelRank() && newReq->hasAdditionalOutputs())

docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md

Lines changed: 4 additions & 4 deletions
@@ -76,12 +76,12 @@ In the example below, two context servers are launched on ports 8001 and 8002, a

 ```shell
 # Launching context servers
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_ctx0 &
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_ctx1 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --kv_cache_free_gpu_memory_fraction 0.15 &> output_ctx0 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --kv_cache_free_gpu_memory_fraction 0.15 &> output_ctx1 &

 # Launching generation servers
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_gen0 &
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8004 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_gen1 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --kv_cache_free_gpu_memory_fraction 0.15 &> output_gen0 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8004 --kv_cache_free_gpu_memory_fraction 0.15 &> output_gen1 &

 # Launching disaggregated server
 trtllm-serve disaggregated -c disagg_config.yaml

docs/source/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ docker run -d --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
     TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True \
     trtllm-serve /config/models/maverick \
     --host 0.0.0.0 --port 8000 \
-    --backend pytorch --tp_size 8 --ep_size 1 \
+    --tp_size 8 --ep_size 1 \
     --trust_remote_code --extra_llm_api_options c.yaml \
     --kv_cache_free_gpu_memory_fraction 0.75"
 ```

docs/source/commands/trtllm-serve.rst

Lines changed: 5 additions & 6 deletions
@@ -27,7 +27,7 @@ The following abbreviated command syntax shows the commonly used arguments to st

 .. code-block:: bash

-    trtllm-serve <model> [--backend pytorch --tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]
+    trtllm-serve <model> [--tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]

 For the full syntax and argument descriptions, refer to :ref:`syntax`.

@@ -90,8 +90,7 @@ Then, start the server with the configuration file:
 .. code-block:: bash

     trtllm-serve Qwen/Qwen2-VL-7B-Instruct \
-        --extra_llm_api_options ./extra-llm-api-config.yml \
-        --backend pytorch
+        --extra_llm_api_options ./extra-llm-api-config.yml

 Multimodal Chat API
 ~~~~~~~~~~~~~~~~~~~

@@ -213,7 +212,7 @@ You can deploy `DeepSeek-V3 <https://huggingface.co/deepseek-ai/DeepSeek-V3>`_ m
         --container-image=<CONTAINER_IMG> \
         --container-mounts=/workspace:/workspace \
         --container-workdir /workspace \
-        bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --backend pytorch --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"
+        bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"

 See `the source code <https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/llmapi/trtllm-llmapi-launch>`_ of ``trtllm-llmapi-launch`` for more details.

@@ -232,7 +231,7 @@ Metrics Endpoint

 The ``/metrics`` endpoint provides runtime-iteration statistics such as GPU memory use and inflight-batching details.
 For the TensorRT backend, these statistics are enabled by default.
-However, for the PyTorch backend, specified with the ``--backend pytorch`` argument, you must explicitly enable iteration statistics logging by setting the `enable_iter_perf_stats` field in a YAML configuration file as shown in the following example:
+However, for the PyTorch backend, you must explicitly enable iteration statistics logging by setting the `enable_iter_perf_stats` field in a YAML configuration file as shown in the following example:

 .. code-block:: yaml

@@ -246,7 +245,7 @@ Then start the server and specify the ``--extra_llm_api_options`` argument with

     trtllm-serve <model> \
         --extra_llm_api_options <path-to-extra-llm-api-config.yml> \
-        [--backend pytorch --tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]
+        [--tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]

 After at least one inference request is sent to the server, you can fetch the runtime-iteration statistics by polling the `/metrics` endpoint:
