Commit d9b5deb

Merge branch 'main' into enable-ngram
Signed-off-by: Mike Iovine <miovine@nvidia.com>
2 parents 3964da4 + baece56

64 files changed, +763 -285 lines


.coderabbit.yaml

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ reviews:
   related_prs: true
   suggested_labels: true
   suggested_reviewers: true
-  auto_assign_reviewers: true
   poem: false
   auto_review:
     drafts: true

3rdparty/xgrammar

Submodule xgrammar updated 173 files

cpp/tensorrt_llm/batch_manager/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ set(SRCS

 file(GLOB_RECURSE XGRAMMAR_SRCS "${3RDPARTY_DIR}/xgrammar/cpp/*.cc")
 list(FILTER XGRAMMAR_SRCS EXCLUDE REGEX
-     "${3RDPARTY_DIR}/xgrammar/cpp/pybind/.*\\.cc")
+     "${3RDPARTY_DIR}/xgrammar/cpp/nanobind/.*\\.cc")
 list(APPEND SRCS ${XGRAMMAR_SRCS})

 if(NOT WIN32)

cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp

Lines changed: 48 additions & 14 deletions
@@ -18,8 +18,10 @@
 #include "tensorrt_llm/batch_manager/guidedDecoder.h"
 #include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/kernels/logitsBitmask.h"

+#include <nlohmann/json.hpp>
 #include <xgrammar/xgrammar.h>

 using namespace tensorrt_llm::runtime;

@@ -41,20 +43,23 @@ GuidedDecoder::GuidedDecoder(executor::GuidedDecodingConfig const& guidedDecodin
     if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
     {
         mXGrammarMatchers.resize(mMaxNumSequences);
+        xgrammar::VocabType vocabType = xgrammar::VocabType::RAW;
+        bool addPrefixSpace = false;
         auto const& tokenizerStr = guidedDecodingConfig.getTokenizerStr();
         if (tokenizerStr)
         {
-            auto const& tokenizerInfo = xgrammar::TokenizerInfo::FromHuggingFace(
-                guidedDecodingConfig.getEncodedVocab().value(), guidedDecodingConfig.getTokenizerStr().value(),
-                mVocabSizePadded, guidedDecodingConfig.getStopTokenIds());
-            mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo);
-        }
-        else
-        {
-            auto const& tokenizerInfo = xgrammar::TokenizerInfo(guidedDecodingConfig.getEncodedVocab().value(),
-                xgrammar::VocabType::RAW, mVocabSizePadded, guidedDecodingConfig.getStopTokenIds());
-            mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo);
+            auto const& metadata = xgrammar::TokenizerInfo::DetectMetadataFromHF(tokenizerStr.value());
+            auto const& metadataJson = nlohmann::json::parse(metadata);
+            vocabType = metadataJson.at("vocab_type").template get<xgrammar::VocabType>();
+            addPrefixSpace = metadataJson.at("add_prefix_space").template get<bool>();
         }
+        auto const& tokenizerInfo = xgrammar::TokenizerInfo(guidedDecodingConfig.getEncodedVocab().value(), vocabType,
+            mVocabSizePadded, guidedDecodingConfig.getStopTokenIds(), addPrefixSpace);
+
+        auto const cacheLimitGb = common::getFloatEnv("XGRAMMAR_CACHE_LIMIT_GB");
+        mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo, /*max_threads=*/8,
+            /*cache_enabled=*/true,
+            /*cache_limit_bytes=*/static_cast<long long>(cacheLimitGb.value_or(1.0f) * 1024 * 1024 * 1024));

         auto const logitsPtrDtype = BufferDataType{mLogitsDtype, false, true};
         auto constexpr bitmaskDtype = TRTDataType<BitmaskT>::value;

@@ -89,27 +94,56 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests)
         // The request is in the first context forward step (considering kv cache reuse).
         auto const& guideType = guidedDecodingParams->getGuideType();
         auto const& guide = guidedDecodingParams->getGuide();
-        if (guideType == executor::GuidedDecodingParams::GuideType::kJSON)
+        switch (guideType)
+        {
+        case executor::GuidedDecodingParams::GuideType::kJSON:
         {
             mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
                 mXGrammarCompiler->CompileBuiltinJSONGrammar());
+            break;
         }
-        else if (guideType == executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA)
+        case executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA:
         {
             mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
                 mXGrammarCompiler->CompileJSONSchema(guide.value()));
+            break;
         }
-        else if (guideType == executor::GuidedDecodingParams::GuideType::kREGEX)
+        case executor::GuidedDecodingParams::GuideType::kREGEX:
         {
             auto const& grammar = xgrammar::Grammar::FromRegex(guide.value());
             mXGrammarMatchers.at(seqSlot)
                 = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            break;
         }
-        else if (guideType == executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR)
+        case executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR:
         {
             auto const& grammar = xgrammar::Grammar::FromEBNF(guide.value());
             mXGrammarMatchers.at(seqSlot)
                 = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            break;
+        }
+        case executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG:
+        {
+            auto const& structuralTagParametersJson = nlohmann::json::parse(guide.value());
+            auto const& structuralTagItemsJson
+                = structuralTagParametersJson.at("structures").template get<std::vector<nlohmann::json>>();
+            std::vector<xgrammar::StructuralTagItem> structuralTagItems;
+            for (auto const& s : structuralTagItemsJson)
+            {
+                structuralTagItems.emplace_back(
+                    xgrammar::StructuralTagItem{s.at("begin").template get<std::string>(),
+                        s.at("schema").dump(), s.at("end").template get<std::string>()});
+            }
+            auto const& triggers
+                = structuralTagParametersJson.at("triggers").template get<std::vector<std::string>>();
+            mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
+                mXGrammarCompiler->CompileStructuralTag(structuralTagItems, triggers));
+            break;
+        }
+        default:
+        {
+            TLLM_THROW("Unsupported guide type.");
+        }
         }
     }
     else if (llmReq->isGenerationInProgressState())
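
Note: the new kSTRUCTURAL_TAG branch above reads the guide string as a JSON object with a "structures" array (each entry carrying "begin", "schema", and "end") and a "triggers" array of strings; nothing else is consumed. A minimal sketch of such a payload follows, embedded in a C++ raw string; the tag names and schema contents are hypothetical and only the structures/triggers layout is taken from the parsing code in this commit.

// Hypothetical structural-tag guide payload for GuideType::kSTRUCTURAL_TAG.
// Only the "structures"/"triggers" shape mirrors the parsing code above;
// the concrete begin/end tags and schema are illustrative.
#include <string>

std::string const structuralTagGuide = R"({
  "structures": [
    {
      "begin": "<function=get_weather>",
      "schema": {"type": "object", "properties": {"city": {"type": "string"}}},
      "end": "</function>"
    }
  ],
  "triggers": ["<function="]
})";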

cpp/tensorrt_llm/common/envUtils.cpp

Lines changed: 11 additions & 0 deletions
@@ -50,6 +50,17 @@ std::optional<size_t> getUInt64Env(char const* name)
     return {val};
 };

+std::optional<float> getFloatEnv(char const* name)
+{
+    char const* const env = std::getenv(name);
+    if (env == nullptr)
+    {
+        return std::nullopt;
+    }
+    float const val = std::stof(env);
+    return {val};
+}
+
 std::optional<std::string> getStrEnv(char const* name)
 {
     char const* const env = std::getenv(name);
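
Taken together with the GuidedDecoder change above, this helper lets XGRAMMAR_CACHE_LIMIT_GB (in gigabytes) size the xgrammar compiler cache, falling back to 1 GB when the variable is unset. Below is a small self-contained sketch of that conversion, assuming only what the two diffs show; the xgrammarCacheLimitBytes wrapper is a hypothetical name for illustration, and std::stof will throw if the variable holds a non-numeric value.

// Sketch: convert the optional XGRAMMAR_CACHE_LIMIT_GB environment variable
// to a byte count, defaulting to 1 GB (1024^3 bytes) as in GuidedDecoder.
#include <cstdlib>
#include <optional>
#include <string>

std::optional<float> getFloatEnv(char const* name)
{
    char const* const env = std::getenv(name);
    if (env == nullptr)
    {
        return std::nullopt;
    }
    return std::stof(env);
}

long long xgrammarCacheLimitBytes()
{
    auto const cacheLimitGb = getFloatEnv("XGRAMMAR_CACHE_LIMIT_GB");
    return static_cast<long long>(cacheLimitGb.value_or(1.0f) * 1024 * 1024 * 1024);
}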

cpp/tensorrt_llm/common/envUtils.h

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ std::optional<int32_t> getIntEnv(char const* name);

 std::optional<size_t> getUInt64Env(char const* name);

+std::optional<float> getFloatEnv(char const* name);
+
 bool getBoolEnv(char const* name);

 // XQA kernels (optimized kernels for generation phase).

cpp/tensorrt_llm/executor/executorImpl.cpp

Lines changed: 0 additions & 3 deletions
@@ -1621,9 +1621,6 @@ std::tuple<Executor::Impl::RequestList, double> Executor::Impl::fetchNewRequests
     TLLM_CHECK_WITH_INFO(mModel->hasGuidedDecoder(),
         "Request is specified with GuidedDecodingParams, but GuidedDecoder is not setup. Please "
         "provide a valid GuidedDecodingConfig to setup GuidedDecoder.");
-    TLLM_CHECK_WITH_INFO(newReq->getGuidedDecodingParams()->getGuideType()
-            != executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG,
-        "Structural tag is not supported for guided decoding in C++ Executor.");
 }

 if (mModel->getWorldConfig().isLastPipelineParallelRank() && newReq->hasAdditionalOutputs())

docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md

Lines changed: 4 additions & 4 deletions
@@ -76,12 +76,12 @@ In the example below, two context servers are launched on ports 8001 and 8002, a

 ```shell
 # Launching context servers
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_ctx0 &
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_ctx1 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --kv_cache_free_gpu_memory_fraction 0.15 &> output_ctx0 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --kv_cache_free_gpu_memory_fraction 0.15 &> output_ctx1 &

 # Launching generation servers
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_gen0 &
-trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8004 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_gen1 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --kv_cache_free_gpu_memory_fraction 0.15 &> output_gen0 &
+trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8004 --kv_cache_free_gpu_memory_fraction 0.15 &> output_gen1 &

 # Launching disaggregated server
 trtllm-serve disaggregated -c disagg_config.yaml

docs/source/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ docker run -d --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
     TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True \
     trtllm-serve /config/models/maverick \
     --host 0.0.0.0 --port 8000 \
-    --backend pytorch --tp_size 8 --ep_size 1 \
+    --tp_size 8 --ep_size 1 \
     --trust_remote_code --extra_llm_api_options c.yaml \
     --kv_cache_free_gpu_memory_fraction 0.75"
 ```

docs/source/commands/trtllm-serve.rst

Lines changed: 5 additions & 6 deletions
@@ -27,7 +27,7 @@ The following abbreviated command syntax shows the commonly used arguments to st

 .. code-block:: bash

-    trtllm-serve <model> [--backend pytorch --tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]
+    trtllm-serve <model> [--tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]

 For the full syntax and argument descriptions, refer to :ref:`syntax`.

@@ -90,8 +90,7 @@ Then, start the server with the configuration file:
 .. code-block:: bash

     trtllm-serve Qwen/Qwen2-VL-7B-Instruct \
-        --extra_llm_api_options ./extra-llm-api-config.yml \
-        --backend pytorch
+        --extra_llm_api_options ./extra-llm-api-config.yml

 Multimodal Chat API
 ~~~~~~~~~~~~~~~~~~~

@@ -213,7 +212,7 @@ You can deploy `DeepSeek-V3 <https://huggingface.co/deepseek-ai/DeepSeek-V3>`_ m
         --container-image=<CONTAINER_IMG> \
         --container-mounts=/workspace:/workspace \
         --container-workdir /workspace \
-        bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --backend pytorch --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"
+        bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"

 See `the source code <https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/llmapi/trtllm-llmapi-launch>`_ of ``trtllm-llmapi-launch`` for more details.

@@ -232,7 +231,7 @@ Metrics Endpoint

 The ``/metrics`` endpoint provides runtime-iteration statistics such as GPU memory use and inflight-batching details.
 For the TensorRT backend, these statistics are enabled by default.
-However, for the PyTorch backend, specified with the ``--backend pytorch`` argument, you must explicitly enable iteration statistics logging by setting the `enable_iter_perf_stats` field in a YAML configuration file as shown in the following example:
+However, for the PyTorch backend, you must explicitly enable iteration statistics logging by setting the `enable_iter_perf_stats` field in a YAML configuration file as shown in the following example:

 .. code-block:: yaml

@@ -246,7 +245,7 @@ Then start the server and specify the ``--extra_llm_api_options`` argument with

     trtllm-serve <model> \
         --extra_llm_api_options <path-to-extra-llm-api-config.yml> \
-        [--backend pytorch --tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]
+        [--tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]

 After at least one inference request is sent to the server, you can fetch the runtime-iteration statistics by polling the `/metrics` endpoint:
