1 file changed
+1
-1
lines changed
Submodule cudnn-frontend updated 67 files
- CMakeLists.txt+47-5
- README.md+5
- benchmark/Llama-3.2-1B-Training/Dockerfile+6
- benchmark/Llama-3.2-1B-Training/README.md+152
- benchmark/Llama-3.2-1B-Training/artifacts/b200_h200_speedup.png
- benchmark/Llama-3.2-1B-Training/artifacts/b200_iteration_time.png
- benchmark/Llama-3.2-1B-Training/artifacts/b200_run_plot.b200.txt+2.0k
- benchmark/Llama-3.2-1B-Training/artifacts/b200_speedup.png
- benchmark/Llama-3.2-1B-Training/artifacts/b200_training_timing.csv+19
- benchmark/Llama-3.2-1B-Training/artifacts/cudnn_attention_time.png
- benchmark/Llama-3.2-1B-Training/artifacts/efficient_attention_time.png
- benchmark/Llama-3.2-1B-Training/artifacts/flash_attention_time.png
- benchmark/Llama-3.2-1B-Training/artifacts/h200_iteration_time.png
- benchmark/Llama-3.2-1B-Training/artifacts/h200_speedup.png
- benchmark/Llama-3.2-1B-Training/artifacts/h200_training_timing.csv+19
- benchmark/Llama-3.2-1B-Training/training_perf.py+129
- benchmark/sdpa_benchmark/Dockerfile
- benchmark/sdpa_benchmark/README.md
- benchmark/sdpa_benchmark/benchmark_flash_attention.py
- benchmark/sdpa_benchmark/benchmark_results.csv
- benchmark/sdpa_benchmark/images/bprop.png
- benchmark/sdpa_benchmark/images/forward.png
- benchmark/sdpa_benchmark/images/fwd_bprop.png
- cudnn_frontend-config.cmake.in+3
- include/cudnn_frontend/backend/kernel_cache.h+1-1
- include/cudnn_frontend/graph_interface.h+2-2
- include/cudnn_frontend/graph_properties.h+2-2
- include/cudnn_frontend/node/scaled_dot_product_flash_attention.h+1-1
- python/CMakeLists.txt+32-11
- python/properties.cpp+1-1
- python/pygraph/norm.cpp+2-2
- requirements.txt+1-1
- samples/cpp/CMakeLists.txt+2
- samples/cpp/convolution/dgrads.cpp+3-3
- samples/cpp/convolution/fp8_fprop.cpp+1-1
- samples/cpp/convolution/fprop.cpp+6-6
- samples/cpp/convolution/int8_fprop.cpp+1-1
- samples/cpp/convolution/wgrads.cpp+2-2
- samples/cpp/matmul/fp8_matmul.cpp+1-1
- samples/cpp/matmul/int8_matmul.cpp+1-1
- samples/cpp/matmul/matmuls.cpp+4-4
- samples/cpp/matmul/mixed_matmul.cpp+1-1
- samples/cpp/matmul/nvfp4_mxfp8_matmul.cpp+1-1
- samples/cpp/misc/pointwise.cpp+3-3
- samples/cpp/misc/resample.cpp+3-3
- samples/cpp/misc/serialization.cpp+2-2
- samples/cpp/misc/slice.cpp+1-1
- samples/cpp/misc/sm_carveout.cpp+1-1
- samples/cpp/norm/batchnorm.cpp+4-4
- samples/cpp/norm/layernorm.cpp+4-4
- samples/cpp/norm/rmsnorm.cpp+3-3
- samples/cpp/sdpa/fp16_bwd.cpp+1-1
- samples/cpp/sdpa/fp16_bwd_with_cudagraphs.cpp+1-1
- samples/cpp/sdpa/fp16_bwd_with_flexible_graphs.cpp+1-1
- samples/cpp/sdpa/fp16_cached.cpp+1-1
- samples/cpp/sdpa/fp16_fwd.cpp+1-1
- samples/cpp/sdpa/fp16_fwd_paged_decode_and_prefill.cpp+1-1
- samples/cpp/sdpa/fp16_fwd_with_cudagraphs.cpp+1-1
- samples/cpp/sdpa/fp16_fwd_with_custom_dropout.cpp+1-1
- samples/cpp/sdpa/fp16_fwd_with_flexible_graphs.cpp+1-1
- samples/cpp/sdpa/fp16_fwd_with_paged_caches.cpp+1-1
- samples/cpp/sdpa/fp8_bwd.cpp+2-2
- samples/cpp/sdpa/fp8_bwd_bottom_right_causal_mask.cpp+1-1
- samples/cpp/sdpa/fp8_fwd.cpp+1-1
- samples/cpp/sdpa/fp8_fwd_bottom_right_causal_mask.cpp+1-1
- samples/legacy_samples/CMakeLists.txt+2
- test/cpp/CMakeLists.txt+2
0 commit comments