Skip to content

Commit c27ee60

Browse files
committed
Release v1.11
2 parents 08a85d3 + 4df8488 commit c27ee60

File tree

118 files changed

+8107
-3165
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

118 files changed

+8107
-3165
lines changed

.github/workflows/deploy_nightly_docs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
runs-on: ubuntu-latest
1717
steps:
1818
- name: Download artifact
19-
uses: actions/download-artifact@v3
19+
uses: actions/download-artifact@v4.1.7
2020
with:
2121
name: "te_docs"
2222
path: "html"

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
run: |
3232
sudo apt-get update
3333
sudo apt-get install pip -y
34-
pip install torch
34+
pip install torch numpy
3535
export PYTHON_ONLY=1
3636
export TE_PATH=.
3737
bash ./qa/L0_pytorch_lint/test.sh

.github/workflows/trigger-ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ jobs:
3333
|| github.actor == 'Oleg-Goncharov'
3434
|| github.actor == 'phu0ngng'
3535
|| github.actor == 'xrennvidia'
36+
|| github.actor == 'yaox12'
37+
|| github.actor == 'huanghua1994'
38+
|| github.actor == 'mgoldfarb-nvidia'
3639
)
3740
steps:
3841
- name: Check if comment is issued by authorized person

README.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,15 @@ To install the latest stable version of Transformer Engine,
174174
175175
pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
176176
177-
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch).
177+
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch,paddle).
178+
179+
Alternatively, the package can be directly installed from `Transformer Engine's PyPI <https://pypi.org/project/transformer-engine/>`_, e.g.
180+
181+
.. code-block:: bash
182+
183+
pip install transformer_engine[pytorch]
184+
185+
To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch,paddle]). Transformer Engine ships wheels for the core library as well as the PaddlePaddle extensions. Source distributions are shipped for the JAX and PyTorch extensions.
178186

179187
From source
180188
^^^^^^^^^^^

benchmarks/attention/benchmark_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def parse_results(per_cudnn, per_flash, model):
156156
df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"] = t_cudnn_avg.sum() / 1e6
157157

158158
if per_flash > 0:
159-
t_flash_all = df[df["Name"].str.contains("void flash")]["Duration (ns)"].to_numpy()
159+
t_flash_all = df[df["Name"].str.contains("flash")]["Duration (ns)"].to_numpy()
160160
t_flash_all = t_flash_all.reshape(-1, per_flash)
161161
t_flash_avg = np.average(t_flash_all, axis=0)
162162
df_times.loc[row, "FlashAttention Kernels (fwd)"] = t_flash_avg[0] / 1e6

build_tools/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.10.0
1+
1.11.0

build_tools/build_ext.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,12 @@ def run(self) -> None:
106106
if isinstance(ext, CMakeExtension):
107107
print(f"Building CMake extension {ext.name}")
108108
# Set up incremental builds for CMake extensions
109-
setup_dir = Path(__file__).resolve().parent.parent
110-
build_dir = setup_dir / "build" / "cmake"
109+
build_dir = os.getenv("NVTE_CMAKE_BUILD_DIR")
110+
if build_dir:
111+
build_dir = Path(build_dir).resolve()
112+
else:
113+
root_dir = Path(__file__).resolve().parent.parent
114+
build_dir = root_dir / "build" / "cmake"
111115

112116
# Ensure the directory exists
113117
build_dir.mkdir(parents=True, exist_ok=True)

build_tools/pytorch.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010

1111
from .utils import (
1212
all_files_in_dir,
13-
cuda_version,
13+
cuda_archs,
1414
cuda_path,
15+
cuda_version,
1516
)
1617

1718

@@ -48,8 +49,6 @@ def setup_pytorch_extension(
4849
]
4950
nvcc_flags = [
5051
"-O3",
51-
"-gencode",
52-
"arch=compute_70,code=sm_70",
5352
"-U__CUDA_NO_HALF_OPERATORS__",
5453
"-U__CUDA_NO_HALF_CONVERSIONS__",
5554
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
@@ -61,6 +60,11 @@ def setup_pytorch_extension(
6160
"--use_fast_math",
6261
]
6362

63+
cuda_architectures = cuda_archs()
64+
65+
if "70" in cuda_architectures:
66+
nvcc_flags.extend(["-gencode", "arch=compute_70,code=sm_70"])
67+
6468
# Version-dependent CUDA options
6569
try:
6670
version = cuda_version()
@@ -73,17 +77,18 @@ def setup_pytorch_extension(
7377
(
7478
"--threads",
7579
os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
76-
"-gencode",
77-
"arch=compute_80,code=sm_80",
78-
"-gencode",
79-
"arch=compute_90,code=sm_90",
8080
)
8181
)
8282

83+
if "80" in cuda_architectures:
84+
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
85+
if "90" in cuda_architectures:
86+
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
87+
8388
# Libraries
8489
library_dirs = []
8590
libraries = []
86-
if os.getenv("NVTE_UB_WITH_MPI"):
91+
if bool(int(os.getenv("NVTE_UB_WITH_MPI", 0))):
8792
assert (
8893
os.getenv("MPI_HOME") is not None
8994
), "MPI_HOME must be set when compiling with NVTE_UB_WITH_MPI=1"

build_tools/utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66

77
import functools
88
import glob
9+
import importlib
910
import os
1011
import re
1112
import shutil
1213
import subprocess
1314
import sys
14-
import importlib
1515
from pathlib import Path
1616
from subprocess import CalledProcessError
1717
from typing import List, Optional, Tuple, Union
@@ -188,6 +188,11 @@ def cuda_path() -> Tuple[str, str]:
188188
return cuda_home, nvcc_bin
189189

190190

191+
@functools.lru_cache(maxsize=None)
192+
def cuda_archs() -> str:
193+
return os.getenv("NVTE_CUDA_ARCHS", "70;80;89;90")
194+
195+
191196
def cuda_version() -> Tuple[int, ...]:
192197
"""CUDA Toolkit version as a (major, minor) tuple."""
193198
# Query NVCC for version info

docs/_templates/layout.html

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
{% extends "!layout.html" %}
2+
3+
{% block extrahead %}
4+
5+
<script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js"></script>
6+
7+
{% endblock %}
8+
29
{% block sidebartitle %} {{ super() }}
310

411
<style>
@@ -83,8 +90,10 @@
8390
}
8491
</style>
8592

86-
{%- if nvidia_analytics_id %}
87-
<script type="text/javascript">_satellite.pageBottom();</script>
88-
{%- endif %}
93+
{% endblock %}
94+
95+
{% block footer %}
96+
97+
<script type="text/javascript">if (typeof _satellite !== "undefined"){ _satellite.pageBottom();}</script>
8998

9099
{% endblock %}

0 commit comments

Comments
 (0)