Skip to content

Commit c27ee60

Browse files
committed
Release v1.11
2 parents 08a85d3 + 4df8488 commit c27ee60

File tree

118 files changed

+8107
-3165
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

118 files changed

+8107
-3165
lines changed

.github/workflows/deploy_nightly_docs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
runs-on: ubuntu-latest
1717
steps:
1818
- name: Download artifact
19-
uses: actions/download-artifact@v3
19+
uses: actions/download-artifact@v4.1.7
2020
with:
2121
name: "te_docs"
2222
path: "html"

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
run: |
3232
sudo apt-get update
3333
sudo apt-get install pip -y
34-
pip install torch
34+
pip install torch numpy
3535
export PYTHON_ONLY=1
3636
export TE_PATH=.
3737
bash ./qa/L0_pytorch_lint/test.sh

.github/workflows/trigger-ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ jobs:
3333
|| github.actor == 'Oleg-Goncharov'
3434
|| github.actor == 'phu0ngng'
3535
|| github.actor == 'xrennvidia'
36+
|| github.actor == 'yaox12'
37+
|| github.actor == 'huanghua1994'
38+
|| github.actor == 'mgoldfarb-nvidia'
3639
)
3740
steps:
3841
- name: Check if comment is issued by authorized person

README.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,15 @@ To install the latest stable version of Transformer Engine,
174174
175175
pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
176176
177-
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch).
177+
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch,paddle).
178+
179+
Alternatively, the package can be directly installed from `Transformer Engine's PyPI <https://pypi.org/project/transformer-engine/>`_, e.g.
180+
181+
.. code-block:: bash
182+
183+
pip install transformer_engine[pytorch]
184+
185+
To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch,paddle]). Transformer Engine ships wheels for the core library as well as the PaddlePaddle extensions. Source distributions are shipped for the JAX and PyTorch extensions.
178186

179187
From source
180188
^^^^^^^^^^^

benchmarks/attention/benchmark_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def parse_results(per_cudnn, per_flash, model):
156156
df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"] = t_cudnn_avg.sum() / 1e6
157157

158158
if per_flash > 0:
159-
t_flash_all = df[df["Name"].str.contains("void flash")]["Duration (ns)"].to_numpy()
159+
t_flash_all = df[df["Name"].str.contains("flash")]["Duration (ns)"].to_numpy()
160160
t_flash_all = t_flash_all.reshape(-1, per_flash)
161161
t_flash_avg = np.average(t_flash_all, axis=0)
162162
df_times.loc[row, "FlashAttention Kernels (fwd)"] = t_flash_avg[0] / 1e6

build_tools/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.10.0
1+
1.11.0

build_tools/build_ext.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,12 @@ def run(self) -> None:
106106
if isinstance(ext, CMakeExtension):
107107
print(f"Building CMake extension {ext.name}")
108108
# Set up incremental builds for CMake extensions
109-
setup_dir = Path(__file__).resolve().parent.parent
110-
build_dir = setup_dir / "build" / "cmake"
109+
build_dir = os.getenv("NVTE_CMAKE_BUILD_DIR")
110+
if build_dir:
111+
build_dir = Path(build_dir).resolve()
112+
else:
113+
root_dir = Path(__file__).resolve().parent.parent
114+
build_dir = root_dir / "build" / "cmake"
111115

112116
# Ensure the directory exists
113117
build_dir.mkdir(parents=True, exist_ok=True)

build_tools/pytorch.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010

1111
from .utils import (
1212
all_files_in_dir,
13-
cuda_version,
13+
cuda_archs,
1414
cuda_path,
15+
cuda_version,
1516
)
1617

1718

@@ -48,8 +49,6 @@ def setup_pytorch_extension(
4849
]
4950
nvcc_flags = [
5051
"-O3",
51-
"-gencode",
52-
"arch=compute_70,code=sm_70",
5352
"-U__CUDA_NO_HALF_OPERATORS__",
5453
"-U__CUDA_NO_HALF_CONVERSIONS__",
5554
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
@@ -61,6 +60,11 @@ def setup_pytorch_extension(
6160
"--use_fast_math",
6261
]
6362

63+
cuda_architectures = cuda_archs()
64+
65+
if "70" in cuda_architectures:
66+
nvcc_flags.extend(["-gencode", "arch=compute_70,code=sm_70"])
67+
6468
# Version-dependent CUDA options
6569
try:
6670
version = cuda_version()
@@ -73,17 +77,18 @@ def setup_pytorch_extension(
7377
(
7478
"--threads",
7579
os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
76-
"-gencode",
77-
"arch=compute_80,code=sm_80",
78-
"-gencode",
79-
"arch=compute_90,code=sm_90",
8080
)
8181
)
8282

83+
if "80" in cuda_architectures:
84+
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
85+
if "90" in cuda_architectures:
86+
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
87+
8388
# Libraries
8489
library_dirs = []
8590
libraries = []
86-
if os.getenv("NVTE_UB_WITH_MPI"):
91+
if bool(int(os.getenv("NVTE_UB_WITH_MPI", 0))):
8792
assert (
8893
os.getenv("MPI_HOME") is not None
8994
), "MPI_HOME must be set when compiling with NVTE_UB_WITH_MPI=1"

build_tools/utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66

77
import functools
88
import glob
9+
import importlib
910
import os
1011
import re
1112
import shutil
1213
import subprocess
1314
import sys
14-
import importlib
1515
from pathlib import Path
1616
from subprocess import CalledProcessError
1717
from typing import List, Optional, Tuple, Union
@@ -188,6 +188,11 @@ def cuda_path() -> Tuple[str, str]:
188188
return cuda_home, nvcc_bin
189189

190190

191+
@functools.lru_cache(maxsize=None)
192+
def cuda_archs() -> str:
193+
return os.getenv("NVTE_CUDA_ARCHS", "70;80;89;90")
194+
195+
191196
def cuda_version() -> Tuple[int, ...]:
192197
"""CUDA Toolkit version as a (major, minor) tuple."""
193198
# Query NVCC for version info

docs/_templates/layout.html

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
{% extends "!layout.html" %}
2+
3+
{% block extrahead %}
4+
5+
<script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js"></script>
6+
7+
{% endblock %}
8+
29
{% block sidebartitle %} {{ super() }}
310

411
<style>
@@ -83,8 +90,10 @@
8390
}
8491
</style>
8592

86-
{%- if nvidia_analytics_id %}
87-
<script type="text/javascript">_satellite.pageBottom();</script>
88-
{%- endif %}
93+
{% endblock %}
94+
95+
{% block footer %}
96+
97+
<script type="text/javascript">if (typeof _satellite !== "undefined"){ _satellite.pageBottom();}</script>
8998

9099
{% endblock %}

0 commit comments

Comments
 (0)