diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Driver/Options.h index a7eaf3336339b..f15670e35d4b7 100644 --- a/clang/include/clang/Driver/Options.h +++ b/clang/include/clang/Driver/Options.h @@ -41,6 +41,7 @@ enum ClangVisibility { FlangOption = (1 << 4), FC1Option = (1 << 5), DXCOption = (1 << 6), + SYCLRTCOnlyOption = (1 << 7), }; enum ID { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4158599394cd3..b2688c4e4ec22 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -107,6 +107,8 @@ def FC1Option : OptionVisibility; // are made available when the driver is running in DXC compatibility mode. def DXCOption : OptionVisibility; +def SYCLRTCOnlyOption : OptionVisibility; + ///////// // Docs @@ -195,6 +197,11 @@ def sycl_Group : OptionGroup<"">, Group, DocName<"SYCL options">, Visibility<[ClangOption, CLOption]>; +def sycl_rtc_only_Group : OptionGroup<", + Group, + DocName<"SYCL RTC specific options">, + Visibility<[SYCLRTCOnlyOption]>; + def cuda_Group : OptionGroup<"">, Group, DocName<"CUDA options">, Visibility<[ClangOption, CLOption]>; @@ -7460,6 +7467,13 @@ def fsyclbin : Flag<["-"], "fsyclbin">, Alias, AliasArgs<["executable"]>; } // let Group = sycl_Group +let Visibility = [SYCLRTCOnlyOption] in { + let Group = sycl_rtc_only_Group in { + def auto_pch : Flag<["--"], "auto-pch">, + HelpText<"Enable Auto-PCH for SYCL RTC Compilation">; + } // let Group = sycl_rtc_only_Group +} // let Visibility = [SYCLRTCOnlyOption] + // FIXME: -fsycl-explicit-simd is deprecated. remove it when support is dropped. def : Flag<["-"], "fsycl-explicit-simd">, Flags<[Deprecated]>, Group, diff --git a/clang/include/clang/Frontend/PrecompiledPreamble.h b/clang/include/clang/Frontend/PrecompiledPreamble.h index 624df004bf89e..b920ae1595100 100644 --- a/clang/include/clang/Frontend/PrecompiledPreamble.h +++ b/clang/include/clang/Frontend/PrecompiledPreamble.h @@ -87,8 +87,8 @@ class PrecompiledPreamble { DiagnosticsEngine &Diagnostics, IntrusiveRefCntPtr VFS, std::shared_ptr PCHContainerOps, - bool StoreInMemory, StringRef StoragePath, - PreambleCallbacks &Callbacks); + bool StoreInMemory, StringRef StoragePath, PreambleCallbacks &Callbacks, + bool AllowASTWithErrors = true); PrecompiledPreamble(PrecompiledPreamble &&); PrecompiledPreamble &operator=(PrecompiledPreamble &&); diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp index 3f3fe3c9937e4..d04773fdf1a6e 100644 --- a/clang/lib/Frontend/PrecompiledPreamble.cpp +++ b/clang/lib/Frontend/PrecompiledPreamble.cpp @@ -249,9 +249,10 @@ class TempPCHFile { class PrecompilePreambleAction : public ASTFrontendAction { public: PrecompilePreambleAction(std::shared_ptr Buffer, bool WritePCHFile, - PreambleCallbacks &Callbacks) + PreambleCallbacks &Callbacks, + bool AllowASTWithErrors = true) : Buffer(std::move(Buffer)), WritePCHFile(WritePCHFile), - Callbacks(Callbacks) {} + Callbacks(Callbacks), AllowASTWithErrors(AllowASTWithErrors) {} std::unique_ptr CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override; @@ -287,16 +288,18 @@ class PrecompilePreambleAction : public ASTFrontendAction { bool WritePCHFile; // otherwise the PCH is written into the PCHBuffer only. std::unique_ptr FileOS; // null if in-memory PreambleCallbacks &Callbacks; + bool AllowASTWithErrors; }; class PrecompilePreambleConsumer : public PCHGenerator { public: PrecompilePreambleConsumer(PrecompilePreambleAction &Action, Preprocessor &PP, ModuleCache &ModCache, StringRef isysroot, - std::shared_ptr Buffer) + std::shared_ptr Buffer, + bool AllowASTWithErrors = true) : PCHGenerator(PP, ModCache, "", isysroot, std::move(Buffer), ArrayRef>(), - /*AllowASTWithErrors=*/true), + AllowASTWithErrors), Action(Action) {} bool HandleTopLevelDecl(DeclGroupRef DG) override { @@ -337,7 +340,8 @@ PrecompilePreambleAction::CreateASTConsumer(CompilerInstance &CI, Sysroot.clear(); return std::make_unique( - *this, CI.getPreprocessor(), CI.getModuleCache(), Sysroot, Buffer); + *this, CI.getPreprocessor(), CI.getModuleCache(), Sysroot, Buffer, + AllowASTWithErrors); } template bool moveOnNoError(llvm::ErrorOr Val, T &Output) { @@ -415,7 +419,8 @@ llvm::ErrorOr PrecompiledPreamble::Build( DiagnosticsEngine &Diagnostics, IntrusiveRefCntPtr VFS, std::shared_ptr PCHContainerOps, bool StoreInMemory, - StringRef StoragePath, PreambleCallbacks &Callbacks) { + StringRef StoragePath, PreambleCallbacks &Callbacks, + bool AllowASTWithErrors) { assert(VFS && "VFS is null"); auto PreambleInvocation = std::make_shared(Invocation); @@ -511,7 +516,7 @@ llvm::ErrorOr PrecompiledPreamble::Build( auto Act = std::make_unique( std::move(Buffer), /*WritePCHFile=*/Storage->getKind() == PCHStorage::Kind::TempFile, - Callbacks); + Callbacks, AllowASTWithErrors); if (!Act->BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0])) return BuildPreambleError::BeginSourceFileFailed; diff --git a/clang/test/Driver/sycl-unsupported.cpp b/clang/test/Driver/sycl-unsupported.cpp index 311efbecf8b6b..7caf761e05f91 100644 --- a/clang/test/Driver/sycl-unsupported.cpp +++ b/clang/test/Driver/sycl-unsupported.cpp @@ -64,6 +64,15 @@ // UNSUPPORTED_OPT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "[[OPT_CC1]]{{.*}}" // UNSUPPORTED_OPT: clang{{.*}} "-fsycl-is-host"{{.*}} "[[OPT_CC1]]{{.*}}" +// "--auto-pch" should only be enabled for SYCL RTC compilations, regular driver +// shouldn't know about it: +// +// RUN: not %clangxx -### %s --auto-pch 2>&1 | FileCheck %s --check-prefix AUTO_PCH +// RUN: not %clangxx -fsycl-device-only -### %s --auto-pch 2>&1 | FileCheck %s --check-prefix AUTO_PCH +// RUN: not %clangxx -fsycl -### %s --auto-pch 2>&1 | FileCheck %s --check-prefix AUTO_PCH +// +// AUTO_PCH: error: unknown argument: '--auto-pch' + // FPGA support has been removed, usage of any FPGA specific options and any // options that have FPGA specific arguments should emit a specific error // diagnostic. diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index c2bc9778036ca..3b3094173dda8 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -78,6 +79,12 @@ class SYCLToolchain { } } + struct PrecompiledPreambles { + using key = std::pair; + std::mutex Mutex; + std::map> PreamblesMap; + }; + // Similar to FrontendActionFactory, but we don't take ownership of // `FrontendAction`, nor do we create copies of it as we only perform a single // `ToolInvocation`. @@ -117,16 +124,135 @@ class SYCLToolchain { } }; + std::vector createCommandLine(const InputArgList &UserArgList, + BinaryFormat Format, + std::string_view SourceFilePath) { + DerivedArgList DAL{UserArgList}; + const auto &OptTable = getDriverOptTable(); + DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_fsycl_device_only)); + // User args may contain options not intended for the frontend, but we can't + // claim them here to tell the driver they're used later. Hence, suppress + // the unused argument warning. + DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments)); + + if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { + auto [CPU, Features] = + Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); + (void)Features; + StringRef OT = Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda" + : "amdgcn-amd-amdhsa"; + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), OT); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), OT); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU); + } + + ArgStringList ASL; + for (Arg *A : DAL) + A->render(DAL, ASL); + for (Arg *A : UserArgList) { + Option Group = A->getOption().getGroup(); + if (Group.isValid() && Group.getID() == OPT_sycl_rtc_only_Group) + continue; + + A->render(UserArgList, ASL); + } + + std::vector CommandLine; + CommandLine.reserve(ASL.size() + 2); + CommandLine.emplace_back(ClangXXExe); + transform(ASL, std::back_inserter(CommandLine), + [](const char *AS) { return std::string{AS}; }); + CommandLine.emplace_back(SourceFilePath); + return CommandLine; + } + + class ActionWithPCHPreamble : public Action { + std::string CmdLineOpts; + + public: + ActionWithPCHPreamble(FrontendAction &FEAction, std::string &&CmdLineOpts) + : Action(FEAction), CmdLineOpts(std::move(CmdLineOpts)) {} + + bool runInvocation(std::shared_ptr Invocation, + FileManager *Files, + std::shared_ptr PCHContainerOps, + DiagnosticConsumer *DiagConsumer) override { + auto MainFilePath = Invocation->getFrontendOpts().Inputs[0].getFile(); + auto MainFileBuffer = Files->getBufferForFile(MainFilePath); + assert(MainFileBuffer && "Can't get memory buffer for in-memory source?"); + + PreambleBounds Bounds = ComputePreambleBounds( + Invocation->getLangOpts(), **MainFileBuffer, 100 /* MaxLines */); + + PrecompiledPreambles::key key{ + std::move(CmdLineOpts), + (*MainFileBuffer)->getBuffer().substr(0, Bounds.Size).str()}; + + std::shared_ptr Preamble; + { + PrecompiledPreambles &Preambles = SYCLToolchain::instance().Preambles; + std::lock_guard Lock{Preambles.Mutex}; + auto [It, Inserted] = Preambles.PreamblesMap.try_emplace(key); + + if (Inserted) { + PreambleCallbacks Callbacks; + auto DiagIds = llvm::makeIntrusiveRefCnt(); + auto DiagOpts = Invocation->getDiagnosticOpts(); + auto Diags = llvm::makeIntrusiveRefCnt( + DiagIds, DiagOpts, DiagConsumer, false); + + static std::string StoragePath = + (SYCLToolchain::instance().getPrefix() + "/preambles").str(); + llvm::ErrorOr NewPreamble = + PrecompiledPreamble::Build( + *Invocation, MainFileBuffer->get(), Bounds, *Diags, + Files->getVirtualFileSystemPtr(), PCHContainerOps, + /*StorePreamblesInMemory*/ true, StoragePath, Callbacks, + /*AllowASTWithErrors=*/false); + + if (!NewPreamble) + return false; + + It->second = std::make_shared( + std::move(NewPreamble.get())); + } + + Preamble = It->second; + } // End lock + + assert(Preamble); + assert(Preamble->CanReuse(*Invocation, **MainFileBuffer, Bounds, + Files->getVirtualFileSystem())); + + // FIXME: WHY release???? + auto Buf = llvm::MemoryBuffer::getMemBufferCopy( + (*MainFileBuffer)->getBuffer(), MainFilePath) + .release(); + + auto VFS = Files->getVirtualFileSystemPtr(); + Preamble->AddImplicitPreamble(*Invocation, VFS, Buf); + auto NewFiles = makeIntrusiveRefCnt( + Files->getFileSystemOpts(), std::move(VFS)); + + return Action::runInvocation(std::move(Invocation), NewFiles.get(), + std::move(PCHContainerOps), DiagConsumer); + } + }; + public: static SYCLToolchain &instance() { static SYCLToolchain Instance; return Instance; } - bool run(const std::vector &CommandLine, - FrontendAction &FEAction, + bool run(const InputArgList &UserArgList, BinaryFormat Format, + const char *SourceFilePath, FrontendAction &FEAction, IntrusiveRefCntPtr FSOverlay = nullptr, - DiagnosticConsumer *DiagConsumer = nullptr) { + DiagnosticConsumer *DiagConsumer = nullptr, + bool UseAutoPCH = false) { + std::vector CommandLine = + createCommandLine(UserArgList, Format, SourceFilePath); + auto FS = llvm::makeIntrusiveRefCnt( llvm::vfs::getRealFileSystem()); FS->pushOverlay(ToolchainFS); @@ -136,9 +262,14 @@ class SYCLToolchain { auto Files = llvm::makeIntrusiveRefCnt( clang::FileSystemOptions{"." /* WorkingDir */}, FS); - Action A{FEAction}; - ToolInvocation TI{CommandLine, &A, Files.get(), - std::make_shared()}; + Action Normal{FEAction}; + ActionWithPCHPreamble WithPreamble{FEAction, + join(drop_end(CommandLine, 1), " ")}; + ToolInvocation TI{CommandLine, + UseAutoPCH ? static_cast(&WithPreamble) + : &Normal, + Files.get(), std::make_shared()}; + TI.setDiagnosticConsumer(DiagConsumer ? DiagConsumer : &IgnoreDiag); return TI.run(); @@ -178,6 +309,8 @@ class SYCLToolchain { std::string ClangXXExe = (Prefix + "/bin/clang++").str(); llvm::IntrusiveRefCntPtr ToolchainFS = llvm::makeIntrusiveRefCnt(); + + PrecompiledPreambles Preambles; }; class ClangDiagnosticWrapper { @@ -226,42 +359,6 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler { } // anonymous namespace -static std::vector -createCommandLine(const InputArgList &UserArgList, BinaryFormat Format, - std::string_view SourceFilePath) { - DerivedArgList DAL{UserArgList}; - const auto &OptTable = getDriverOptTable(); - DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_fsycl_device_only)); - // User args may contain options not intended for the frontend, but we can't - // claim them here to tell the driver they're used later. Hence, suppress the - // unused argument warning. - DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments)); - - if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { - auto [CPU, Features] = - Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); - (void)Features; - StringRef OT = Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda" - : "amdgcn-amd-amdhsa"; - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), OT); - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), OT); - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU); - } - - ArgStringList ASL; - for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); }); - for_each(UserArgList, - [&UserArgList, &ASL](Arg *A) { A->render(UserArgList, ASL); }); - - std::vector CommandLine; - CommandLine.reserve(ASL.size() + 2); - CommandLine.emplace_back(SYCLToolchain::instance().getClangXXExe()); - transform(ASL, std::back_inserter(CommandLine), - [](const char *AS) { return std::string{AS}; }); - CommandLine.emplace_back(SourceFilePath); - return CommandLine; -} - static llvm::IntrusiveRefCntPtr getInMemoryFS(InMemoryFile SourceFile, View IncludeFiles) { auto InMemoryFS = llvm::makeIntrusiveRefCnt(); @@ -283,9 +380,6 @@ Expected jit_compiler::calculateHash( const InputArgList &UserArgList, BinaryFormat Format) { TimeTraceScope TTS{"calculateHash"}; - std::vector CommandLine = - createCommandLine(UserArgList, Format, SourceFile.Path); - class HashPreprocessedAction : public PreprocessorFrontendAction { protected: void ExecuteAction() override { @@ -315,7 +409,8 @@ Expected jit_compiler::calculateHash( BLAKE3 Hasher; HashPreprocessedAction HashAction{Hasher}; - if (!SYCLToolchain::instance().run(CommandLine, HashAction, + if (!SYCLToolchain::instance().run(UserArgList, Format, SourceFile.Path, + HashAction, getInMemoryFS(SourceFile, IncludeFiles))) return createStringError("Calculating source hash failed"); @@ -324,10 +419,11 @@ Expected jit_compiler::calculateHash( ArrayRef{reinterpret_cast(&Format), reinterpret_cast(&Format + 1)}); - // Last argument is "rtc_N.cpp" source file name which is never the same, - // ignore it: - for (auto &Opt : drop_end(CommandLine, 1)) - Hasher.update(Opt); + for (Arg *Opt : UserArgList) { + Hasher.update(Opt->getSpelling()); + for (const char *Val : Opt->getValues()) + Hasher.update(Val); + } std::string EncodedHash = encodeBase64(Hasher.result()); @@ -346,9 +442,11 @@ Expected jit_compiler::compileDeviceCode( DiagnosticOptions DiagOpts; ClangDiagnosticWrapper Wrapper(BuildLog, &DiagOpts); - if (SYCLToolchain::instance().run( - createCommandLine(UserArgList, Format, SourceFile.Path), ELOA, - getInMemoryFS(SourceFile, IncludeFiles), Wrapper.consumer())) { + bool AutoPCH = UserArgList.hasArg(OPT_auto_pch); + + if (SYCLToolchain::instance().run(UserArgList, Format, SourceFile.Path, ELOA, + getInMemoryFS(SourceFile, IncludeFiles), + Wrapper.consumer(), AutoPCH)) { return ELOA.takeModule(); } else { return createStringError(BuildLog); diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler.asciidoc index d21c5ff4c4394..bf628a0ce979a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler.asciidoc @@ -1127,6 +1127,58 @@ build_options{{ Relax the requirement that parameter types for free-function kernels must be forward-declarable. +===== `--auto-pch` + +Enable auto-detection of the preamble and use it as a pre-compiled header to +speed up subsequent compilations of TUs matching the preamble/compilation +options. Example of the code that can benefit from this: + +[source,c++] +---- +#include +#include + +// Auto-detected preamble ends before next line: +namespace syclext = sycl::ext::oneapi; +namespace syclexp = sycl::ext::oneapi::experimental; + +extern "C" +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void iota(float start, float *ptr) { + size_t id = syclext::this_work_item::get_nd_item<1>().get_global_linear_id(); + ptr[id] = start + static_cast(id); +} +---- + +Limitations: + +* Preamble detection is done at the Lexer level and can't handle code like + +[source,c++] +---- +#if 1 +#include +#else +// Auto-detected preamble ends in the middle of `#else` and would fail to compile. +void foo() {} +#endif +---- + +* Any changes in either preamble or compilation options (including + `-DSOMETHING`!) result in a creation of a new pre-compiled header/preamble. + +* No support (including not reporting any errors) for `+__DATE__+`/`+__TIME__+` + macros inside auto-detected preamble (transitively in regards to the + includes). + +* Files used inside preamble must not change between different compilations (at + least for the same auto-detected preamble). + +* Auto-generated pre-compiled headers/preambles are stored in memory only. That means: + - No persistency between invocations + - Currently there is no eviction mechanism, so application is expected to use + the option only when number of preambles is limited. + === Known issues and limitations when the language is `sycl` ==== Changing the compiler action or output diff --git a/sycl/test-e2e/PerformanceTests/KernelCompiler/auto-pch.cpp b/sycl/test-e2e/PerformanceTests/KernelCompiler/auto-pch.cpp new file mode 100644 index 0000000000000..7f9b6a60b7d7b --- /dev/null +++ b/sycl/test-e2e/PerformanceTests/KernelCompiler/auto-pch.cpp @@ -0,0 +1,83 @@ +// RUN: %{build} -O3 -o %t.out +// RUN: %{run} %t.out + +// UNSUPPORTED: target-native_cpu + +#include +#include + +#include +#include +#include +#include + +using namespace std::string_view_literals; +namespace syclexp = sycl::ext::oneapi::experimental; + +void run(std::vector ExtraHeaders) { + std::stringstream src; + + // These are necessary: + src << R"""( +#include +#include +)"""; + + for (std::string_view Header : ExtraHeaders) + src << "#include <" << Header << ">\n"; + + src << R"""( +namespace syclext = sycl::ext::oneapi; +namespace syclexp = sycl::ext::oneapi::experimental; + +extern "C" +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void iota(float start, float *ptr) { + size_t id = syclext::this_work_item::get_nd_item<1>().get_global_linear_id(); + ptr[id] = start + static_cast(id); +} +)"""; + + std::string src_str = src.str(); + + sycl::queue q; + + auto Run = [&](auto props) { + for (int i = 0; i < 5; ++i) { + auto t1 = std::chrono::high_resolution_clock::now(); + sycl::kernel_bundle kb_src = + syclexp::create_kernel_bundle_from_source( + q.get_context(), syclexp::source_language::sycl, src_str); + sycl::kernel_bundle kb_exe = + syclexp::build(kb_src, props); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration iter_duration = t2 - t1; + std::cout << static_cast(iter_duration.count()) << "ms" << " "; + } + }; + + if (ExtraHeaders.empty()) + std::cout << " "; + for (std::string_view Header : ExtraHeaders) + std::cout << Header << " "; + std::cout << "| "; + Run(syclexp::properties{}); + std::cout << "| "; + Run(syclexp::properties{ + syclexp::build_options{std::vector{"--auto-pch"}}}); + std::cout << std::endl; +} + +int main(int argc, char **argv) { + // So that output could be copy-pasted into GH comments and rendered as a + // table: + std::cout << "Extra Headers | Without PCH | With auto-PCH" << std::endl; + std::cout << "-|-|-" << std::endl; + run({}); + run({"sycl/half_type.hpp"}); + run({"sycl/ext/oneapi/bfloat16.hpp"}); + run({"sycl/marray.hpp"}); + run({"sycl/vector.hpp"}); + run({"sycl/multi_ptr.hpp"}); + run({"sycl/builtins.hpp"}); +}