[SYCL] Support per-object file compilation (#7595)

sarnex · web-flow · commit f884993dc48d · 2022-12-12T17:19:14.000+01:00
This change adds per-object compilation support for SYCL, also called
non-relocatable device code mode. This is already supported in clang for
HIP and CUDA.

It adds a new option -f[no-]sycl-rdc. The default is -fsycl-rdc, which
compiles code as today. Passing -fno-sycl-rdc activates the new mode.
This is just an alias to the existing flag used by AMD/CUDA,
-f[no-]gpu-rdc.

The main implication is that we no longer link all device code together
into one big module before post link.
Instead, we execute all jobs after device linking on a per-object file
basis.
This means sycl-post-link and the later jobs execute multiple times,
since we no longer have one big module.

This can result in a large improvement in compiler runtime and memory
usage: we see a max memory usage reduction for QUDA with -g from over
250GB to 4GB, and a large compiler runtime improvement as well.

Error cases:
1) Cross-object dependencies. Since we don't link device code together,
each object file must be independent. I added an error in Sema to error
if the user passes this flag and has cross-object dependencies.
2) Invalid architecture in fat object. We currently warn gracefully
about this, but in per-object-file mode llvm-foreach throws an error
customers won't understand, so we error out in that case instead of
warning.

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -352,6 +352,8 @@ def warn_drv_sycl_offload_target_duplicate : Warning<
 def warn_drv_sycl_target_missing : Warning<
   "linked binaries do not contain expected '%0' target; found targets: '%1'">,
   InGroup<SyclTarget>;
+def err_drv_no_rdc_sycl_target_missing : Error<
+  "linked binaries do not contain expected '%0' target; found targets: '%1', this is not supported with '-fno-sycl-rdc'">;
 def err_drv_multiple_target_with_forced_target : Error<
   "multiple target usage with '%0' is not supported with '%1'">;
 def err_drv_failed_to_deduce_target_from_arch : Error<
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11790,6 +11790,9 @@ def err_sycl_restrict : Error<
 def err_sycl_external_global : Error<
   "invalid reference to 'device_global' variable; external 'device_global'"
   " variable must be marked with SYCL_EXTERNAL macro">;
+def err_sycl_external_no_rdc : Error<
+  "invalid %select{declaration|definition}0 of SYCL_EXTERNAL function in non-relocatable "
+   "device code mode">;
 def warn_sycl_kernel_too_big_args : Warning<
   "size of kernel arguments (%0 bytes) may exceed the supported maximum "
   "of %1 bytes on some devices">, InGroup<SyclStrict>, ShowInSystemHeader;
diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
@@ -14,6 +14,7 @@
 #include "clang/Driver/Util.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
@@ -817,7 +818,8 @@ class FileTableTformJobAction : public JobAction {
       REPLACE,
       REPLACE_CELL,
       RENAME,
-      COPY_SINGLE_FILE
+      COPY_SINGLE_FILE,
+      MERGE
     };
 
     Tform() = default;
@@ -855,6 +857,10 @@ class FileTableTformJobAction : public JobAction {
   // output file.
   void addCopySingleFileTform(StringRef ColumnName, int Row);
 
+  // Merges all tables from filename listed at column <ColumnName> into a
+  // single output table.
+  void addMergeTform(StringRef ColumnName);
+
   static bool classof(const Action *A) {
     return A->getKind() == FileTableTformJobClass;
   }
@@ -937,6 +943,14 @@ class ForEachWrappingAction : public Action {
   static bool classof(const Action *A) {
     return A->getKind() == ForEachWrappingClass;
   }
+
+  void addSerialAction(const Action *A) { SerialActions.insert(A); }
+  const llvm::SmallSetVector<const Action *, 2> &getSerialActions() const {
+    return SerialActions;
+  }
+
+private:
+  llvm::SmallSetVector<const Action *, 2> SerialActions;
 };
 
 } // namespace driver
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
@@ -1005,8 +1005,9 @@ def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-
   Alias<fno_gpu_flush_denormals_to_zero>;
 defm gpu_rdc : BoolFOption<"gpu-rdc",
   LangOpts<"GPURelocatableDeviceCode">, DefaultFalse,
-  PosFlag<SetTrue, [CC1Option], "Generate relocatable device code, also known as separate compilation mode">,
-  NegFlag<SetFalse>>;
+  PosFlag<SetTrue, [], "Generate relocatable device code, also known as separate compilation mode">,
+  NegFlag<SetFalse, []>,
+  BothFlags<[CC1Option]>>;
 def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
 def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
 defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
@@ -2997,6 +2998,8 @@ def fsycl_max_parallel_jobs_EQ : Joined<["-"], "fsycl-max-parallel-link-jobs=">,
   HelpText<"Experimental feature: Controls the maximum parallelism of actions performed "
   "on SYCL device code post-link, i.e. the generation of SPIR-V device images "
   "or AOT compilation of each device image.">;
+def : Flag<["-"], "fsycl-rdc">, Alias<fgpu_rdc>;
+def : Flag<["-"], "fno-sycl-rdc">, Alias<fno_gpu_rdc>;
 def fsyntax_only : Flag<["-"], "fsyntax-only">,
   Flags<[NoXarchOption,CoreOption,CC1Option,FC1Option,FlangOption]>, Group<Action_Group>,
   HelpText<"Run the preprocessor, parser and semantic analysis stages">;
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
@@ -567,6 +567,10 @@ void FileTableTformJobAction::addCopySingleFileTform(StringRef ColumnName,
       Tform(Tform::COPY_SINGLE_FILE, {ColumnName, std::to_string(Row)}));
 }
 
+void FileTableTformJobAction::addMergeTform(StringRef ColumnName) {
+  Tforms.emplace_back(Tform(Tform::MERGE, {ColumnName}));
+}
+
 void AppendFooterJobAction::anchor() {}
 
 AppendFooterJobAction::AppendFooterJobAction(Action *Input, types::ID Type)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4782,7 +4782,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                     options::OPT_no_offload_new_driver, false));
 
   bool IsRDCMode =
-      Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+      Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, IsSYCL);
   bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction);
   auto LTOMode = D.getLTOMode(IsDeviceOffloadAction);
   bool IsFPGASYCLOffloadDevice =
@@ -6920,9 +6920,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                       options::OPT_fno_hip_kernel_arg_name);
   }
 
-  if (IsCuda || IsHIP) {
+  if (IsCuda || IsHIP || IsSYCL) {
     if (IsRDCMode)
       CmdArgs.push_back("-fgpu-rdc");
+    else
+      CmdArgs.push_back("-fno-gpu-rdc");
+  }
+  if (IsCuda || IsHIP) {
     if (Args.hasFlag(options::OPT_fgpu_defer_diag,
                      options::OPT_fno_gpu_defer_diag, false))
       CmdArgs.push_back("-fgpu-defer-diag");
@@ -9770,6 +9774,13 @@ void FileTableTform::ConstructJob(Compilation &C, const JobAction &JA,
       addArgs(CmdArgs, TCArgs, {Arg});
       break;
     }
+    case FileTableTformJobAction::Tform::MERGE: {
+      assert(Tf.TheArgs.size() == 1 && "column name expected");
+      SmallString<128> Arg("-merge=");
+      Arg += Tf.TheArgs[0];
+      addArgs(CmdArgs, TCArgs, {Arg});
+      break;
+    }
     }
   }
 
diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
@@ -127,6 +127,11 @@ void SYCL::constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
   C.addCommand(std::move(Cmd));
 }
 
+bool SYCL::shouldDoPerObjectFileLinking(const Compilation &C) {
+  return !C.getArgs().hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                              /*default=*/true);
+}
+
 // The list should match pre-built SYCL device library files located in
 // compiler package. Once we add or remove any SYCL device library files,
 // the list should be updated accordingly.
@@ -163,12 +168,25 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
   // an actual object/archive.  Take that list and pass those to the linker
   // instead of the original object.
   if (JA.isDeviceOffloading(Action::OFK_SYCL)) {
-    auto isSYCLDeviceLib = [&C, this](const InputInfo &II) {
+    bool IsRDC = !shouldDoPerObjectFileLinking(C);
+    auto isNoRDCDeviceCodeLink = [&](const InputInfo &II) {
+      if (IsRDC)
+        return false;
+      if (II.getType() != clang::driver::types::TY_LLVM_BC)
+        return false;
+      if (InputFiles.size() != 2)
+        return false;
+      return &II == &InputFiles[1];
+    };
+    auto isSYCLDeviceLib = [&](const InputInfo &II) {
       const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
       StringRef LibPostfix = ".o";
       if (HostTC->getTriple().isWindowsMSVCEnvironment() &&
           C.getDriver().IsCLMode())
         LibPostfix = ".obj";
+      else if (isNoRDCDeviceCodeLink(II))
+        LibPostfix = ".bc";
+
       std::string FileName = this->getToolChain().getInputFilename(II);
       StringRef InputFilename = llvm::sys::path::filename(FileName);
       if (this->getToolChain().getTriple().isNVPTX()) {
@@ -183,9 +201,21 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
           !InputFilename.endswith(LibPostfix) || (InputFilename.count('-') < 2))
         return false;
       // Skip the prefix "libsycl-"
-      StringRef PureLibName = InputFilename.substr(LibSyclPrefix.size());
+      std::string PureLibName =
+          InputFilename.substr(LibSyclPrefix.size()).str();
+      if (isNoRDCDeviceCodeLink(II)) {
+        // Skip the final - until the . because we linked all device libs into a
+        // single BC in a previous action so we have a temp file name.
+        auto FinalDashPos = PureLibName.find_last_of('-');
+        auto DotPos = PureLibName.find_last_of('.');
+        assert((FinalDashPos != std::string::npos &&
+                DotPos != std::string::npos) &&
+               "Unexpected filename");
+        PureLibName =
+            PureLibName.substr(0, FinalDashPos) + PureLibName.substr(DotPos);
+      }
       for (const auto &L : SYCLDeviceLibList) {
-        if (PureLibName.startswith(L))
+        if (StringRef(PureLibName).startswith(L))
           return true;
       }
       return false;
@@ -203,8 +233,17 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
     for (const auto &II : InputFiles) {
       std::string FileName = getToolChain().getInputFilename(II);
       if (II.getType() == types::TY_Tempfilelist) {
-        // Pass the unbundled list with '@' to be processed.
-        Libs.push_back(C.getArgs().MakeArgString("@" + FileName));
+        if (IsRDC) {
+          // Pass the unbundled list with '@' to be processed.
+          Libs.push_back(C.getArgs().MakeArgString("@" + FileName));
+        } else {
+          assert(InputFiles.size() == 2 &&
+                 "Unexpected inputs for no-RDC with temp file list");
+          // If we're in no-RDC mode and the input is a temp file list,
+          // we want to link multiple object files each against device libs,
+          // so we should consider this input as an object and not pass '@'.
+          Objs.push_back(C.getArgs().MakeArgString(FileName));
+        }
       } else if (II.getType() == types::TY_Archive && !LinkSYCLDeviceLibs) {
         Libs.push_back(C.getArgs().MakeArgString(FileName));
       } else
diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h
@@ -38,7 +38,7 @@ void constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
                                  const InputInfo &Output, const Tool *T,
                                  StringRef Increment, StringRef Ext = "out",
                                  StringRef ParallelJobs = "");
-
+bool shouldDoPerObjectFileLinking(const Compilation &C);
 // Runs llvm-spirv to convert spirv to bc, llvm-link, which links multiple LLVM
 // bitcode. Converts generated bc back to spirv using llvm-spirv, wraps with
 // offloading information. Finally compiles to object using llc
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3613,6 +3613,9 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
 
   if (!Opts.RandstructSeed.empty())
     GenerateArg(Args, OPT_frandomize_layout_seed_EQ, Opts.RandstructSeed, SA);
+
+  if (!Opts.GPURelocatableDeviceCode)
+    GenerateArg(Args, OPT_fno_gpu_rdc, SA);
 }
 
 bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
@@ -4208,6 +4211,11 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
       Diags.Report(diag::err_drv_hlsl_unsupported_target) << T.str();
   }
 
+  // GPURelocatableDeviceCode should be true for SYCL if not specified.
+  if (Args.hasArg(OPT_fsycl_is_device) || Args.hasArg(OPT_fsycl_is_host))
+    Opts.GPURelocatableDeviceCode = Args.hasFlag(
+        options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, /*default=*/true);
+
   return Diags.getNumErrors() == NumErrorsBefore;
 }
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
@@ -10240,6 +10240,15 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
     }
   }
 
+  if (getLangOpts().SYCLIsDevice && !getLangOpts().GPURelocatableDeviceCode &&
+      NewFD->hasAttr<SYCLDeviceAttr>() &&
+      !getSourceManager().isInSystemHeader(NewFD->getLocation())) {
+    Diag(NewFD->getLocation(), diag::err_sycl_external_no_rdc)
+        << (D.getFunctionDefinitionKind() ==
+            clang::FunctionDefinitionKind::Definition);
+    NewFD->setInvalidDecl();
+  }
+
   if (!getLangOpts().CPlusPlus) {
     // Perform semantic checking on the function declaration.
     if (!NewFD->isInvalidDecl() && NewFD->isMain())
diff --git a/clang/test/Driver/sycl-no-rdc-errors.cpp b/clang/test/Driver/sycl-no-rdc-errors.cpp
@@ -0,0 +1,12 @@
+/// Tests driver errors for -fno-sycl-rdc
+
+// RUN: %clang -target %itanium_abi_triple -c %s -o %t.o
+// RUN: %clang -target spir64_gen -emit-llvm -c %s -o %t
+// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,sycl-spir64_gen-unknown-unknown -input=%t -input=%t.o -output=%t.fat.o
+// RUN: not %clang -### -fsycl -fno-sycl-rdc %t.fat.o 2>&1 | FileCheck -check-prefix=CHECK-ARCH %s
+
+// CHECK-ARCH: error: linked binaries do not contain expected 'spir64-unknown-unknown' target; found targets: 'spir64_gen-unknown-unknown', this is not supported with '-fno-sycl-rdc'
+
+// Some code so that we can create a binary out of this file.
+void test_func(void) {
+}
diff --git a/clang/test/Driver/sycl-no-rdc-fat-archive.cpp b/clang/test/Driver/sycl-no-rdc-fat-archive.cpp
@@ -0,0 +1,29 @@
+/// test behaviors of passing a fat static lib with -fno-sycl-rdc
+// Build a fat static lib that will be used for all tests
+// RUN: echo "void foo(void) {}" > %t1.cpp
+// RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl %t1.cpp -c -o %t1_bundle.o
+// RUN: llvm-ar cr %t_lib.a %t1_bundle.o
+// RUN: %clang -### -fsycl -fno-sycl-rdc -fsycl-device-code-split=none --sysroot=%S/Inputs/SYCL %t_lib.a 2>&1 -ccc-print-phases | FileCheck %s
+// RUN: %clang -### -fsycl -fno-sycl-rdc -fsycl-device-code-split=auto --sysroot=%S/Inputs/SYCL %t_lib.a 2>&1 -ccc-print-phases | FileCheck %s
+// RUN: %clang -### -fsycl -fno-sycl-rdc -fsycl-device-code-split=per_kernel --sysroot=%S/Inputs/SYCL %t_lib.a 2>&1 -ccc-print-phases | FileCheck %s
+// RUN: %clang -### -fsycl -fno-sycl-rdc -fsycl-device-code-split=per_source --sysroot=%S/Inputs/SYCL %t_lib.a 2>&1 -ccc-print-phases | FileCheck %s
+// CHECK: 2: input, "{{.*}}_lib.a", archive
+// CHECK: 3: clang-offload-unbundler, {2}, tempfilelist
+// CHECK: 4: spirv-to-ir-wrapper, {3}, tempfilelist, (device-sycl)
+// CHECK: 5: input, "{{.*}}libsycl-crt{{.*}}", object
+// CHECK: 6: clang-offload-unbundler, {5}, object
+// CHECK: 7: offload, " (spir64-unknown-unknown)" {6}, object
+// CHECK: 65: linker, {7, {{.*}}}, ir, (device-sycl)
+// CHECK: 66: linker, {4, 65}, ir, (device-sycl)
+// CHECK: 67: foreach, {4, 66}, ir, (device-sycl)
+// CHECK: 68: file-table-tform, {4, 67}, tempfilelist, (device-sycl)
+// CHECK: 69: sycl-post-link, {68}, tempfiletable, (device-sycl)
+// CHECK: 70: foreach, {68, 69}, tempfiletable, (device-sycl)
+// CHECK: 71: file-table-tform, {70}, tempfilelist, (device-sycl)
+// CHECK: 72: file-table-tform, {70}, tempfilelist, (device-sycl)
+// CHECK: 73: foreach, {68, 72}, tempfilelist, (device-sycl)
+// CHECK: 74: file-table-tform, {73}, tempfilelist, (device-sycl)
+// CHECK: 75: llvm-spirv, {74}, tempfilelist, (device-sycl)
+// CHECK: 76: file-table-tform, {71, 75}, tempfiletable, (device-sycl)
+// CHECK: 77: clang-offload-wrapper, {76}, object, (device-sycl)
+// CHECK: 78: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (spir64-unknown-unknown)" {77}, image
diff --git a/clang/test/Driver/sycl-no-rdc.cpp b/clang/test/Driver/sycl-no-rdc.cpp
@@ -0,0 +1,31 @@
+/// Tests for -fno-sycl-rdc
+// RUN: touch %t1.cpp
+// RUN: touch %t2.cpp
+// RUN: %clang -### -fsycl -fno-sycl-rdc --sysroot=%S/Inputs/SYCL %t1.cpp %t2.cpp 2>&1 -ccc-print-phases | FileCheck %s
+
+// CHECK: 3: input, "{{.*}}1.cpp", c++, (device-sycl)
+// CHECK: 4: preprocessor, {3}, c++-cpp-output, (device-sycl)
+// CHECK: 5: compiler, {4}, ir, (device-sycl)
+// CHECK: 13: input, "{{.*}}2.cpp", c++, (device-sycl)
+// CHECK: 14: preprocessor, {13}, c++-cpp-output, (device-sycl)
+// CHECK: 15: compiler, {14}, ir, (device-sycl)
+
+// CHECK: 21: input, {{.*}}libsycl-crt{{.*}}, object
+// CHECK: 22: clang-offload-unbundler, {21}, object
+// CHECK: 23: offload, " (spir64-unknown-unknown)" {22}, object
+// CHECK: 81: linker, {23, {{.*}}}, ir, (device-sycl)
+// CHECK: 82: linker, {5, 81}, ir, (device-sycl)
+// CHECK: 83: sycl-post-link, {82}, tempfiletable, (device-sycl)
+// CHECK: 84: file-table-tform, {83}, tempfilelist, (device-sycl)
+// CHECK: 85: llvm-spirv, {84}, tempfilelist, (device-sycl)
+// CHECK: 86: file-table-tform, {83, 85}, tempfiletable, (device-sycl)
+// CHECK: 87: clang-offload-wrapper, {86}, object, (device-sycl)
+
+// CHECK: 88: linker, {15, 81}, ir, (device-sycl)
+// CHECK: 89: sycl-post-link, {88}, tempfiletable, (device-sycl)
+// CHECK: 90: file-table-tform, {89}, tempfilelist, (device-sycl)
+// CHECK: 91: llvm-spirv, {90}, tempfilelist, (device-sycl)
+// CHECK: 92: file-table-tform, {89, 91}, tempfiletable, (device-sycl)
+// CHECK: 93: clang-offload-wrapper, {92}, object, (device-sycl)
+
+// CHECK: 94: offload, "host-sycl (x86_64-unknown-linux-gnu)" {{{.*}}}, "device-sycl (spir64-unknown-unknown)" {87}, "device-sycl (spir64-unknown-unknown)" {93}, image
diff --git a/clang/test/SemaSYCL/sycl-no-rdc.cpp b/clang/test/SemaSYCL/sycl-no-rdc.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only -fno-gpu-rdc -internal-isystem %S/Inputs %s
+
+// Check that declarations of SYCL_EXTERNAL functions throw an error if -fno-gpu-rdc is passed
+#include "sycl.hpp"
+
+// expected-error@+1{{invalid declaration of SYCL_EXTERNAL function in non-relocatable device code mode}}
+SYCL_EXTERNAL void syclExternalDecl();
+
+// expected-error@+1{{invalid definition of SYCL_EXTERNAL function in non-relocatable device code mode}}
+SYCL_EXTERNAL void syclExternalDefn() {}
+
+using namespace sycl;
+queue q;
+
+void kernel_wrapper() {
+  q.submit([&](handler &h) {
+    h.single_task([=] {
+    });
+  });
+}
+
+int main() {
+  kernel_wrapper();
+}
diff --git a/llvm/include/llvm/Support/SimpleTable.h b/llvm/include/llvm/Support/SimpleTable.h
@@ -132,6 +132,9 @@ class SimpleTable {
   Row &operator[](int I) { return Rows[I]; }
   const Row &operator[](int I) const { return Rows[I]; }
 
+  // Merge another table into this table
+  Error merge(const SimpleTable &Other);
+
 private:
   Error addColumnName(StringRef ColName);
   void rebuildName2NumMapping();
diff --git a/llvm/lib/Support/SimpleTable.cpp b/llvm/lib/Support/SimpleTable.cpp
@@ -251,5 +251,14 @@ Expected<SimpleTable::UPtrTy> SimpleTable::read(const Twine &FileName,
   return read(MemBuf->get(), ColSep);
 }
 
+Error SimpleTable::merge(const SimpleTable &Other) {
+  if (getNumColumns() != Other.getNumColumns())
+    return makeError("different number of columns");
+  if (ColumnNames != Other.ColumnNames)
+    return makeError("different column names");
+  Rows.insert(Rows.end(), Other.Rows.begin(), Other.Rows.end());
+  return Error::success();
+}
+
 } // namespace util
 } // namespace llvm
diff --git a/llvm/test/tools/file-table-tform/Inputs/a1.txt b/llvm/test/tools/file-table-tform/Inputs/a1.txt
@@ -0,0 +1,3 @@
+[11|12]
+44|55
+66|77
diff --git a/llvm/test/tools/file-table-tform/Inputs/merge-gold.txt b/llvm/test/tools/file-table-tform/Inputs/merge-gold.txt
@@ -0,0 +1,5 @@
+[11|12]
+00|11
+22|33
+44|55
+66|77
diff --git a/llvm/test/tools/file-table-tform/Inputs/merge-input.txt b/llvm/test/tools/file-table-tform/Inputs/merge-input.txt
diff --git a/llvm/test/tools/file-table-tform/file-table-tform-merge.test b/llvm/test/tools/file-table-tform/file-table-tform-merge.test
diff --git a/llvm/tools/file-table-tform/file-table-tform.cpp b/llvm/tools/file-table-tform/file-table-tform.cpp

Original file line number	Diff line number	Diff line change
`@@ -567,6 +567,10 @@ void FileTableTformJobAction::addCopySingleFileTform(StringRef ColumnName,`
`567`	`567`	`Tform(Tform::COPY_SINGLE_FILE, {ColumnName, std::to_string(Row)}));`
`568`	`568`	`}`
`569`	`569`
	`570`	`+void FileTableTformJobAction::addMergeTform(StringRef ColumnName) {`
	`571`	`+ Tforms.emplace_back(Tform(Tform::MERGE, {ColumnName}));`
	`572`	`+}`
	`573`	`+`
`570`	`574`	`void AppendFooterJobAction::anchor() {}`
`571`	`575`
`572`	`576`	`AppendFooterJobAction::AppendFooterJobAction(Action *Input, types::ID Type)`
Original file line number	Diff line number	Diff line change
`@@ -3613,6 +3613,9 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,`
`3613`	`3613`
`3614`	`3614`	`if (!Opts.RandstructSeed.empty())`
`3615`	`3615`	`GenerateArg(Args, OPT_frandomize_layout_seed_EQ, Opts.RandstructSeed, SA);`
	`3616`	`+`
	`3617`	`+ if (!Opts.GPURelocatableDeviceCode)`
	`3618`	`+ GenerateArg(Args, OPT_fno_gpu_rdc, SA);`
`3616`	`3619`	`}`
`3617`	`3620`
`3618`	`3621`	`bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,`
`@@ -4208,6 +4211,11 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,`
`4208`	`4211`	`Diags.Report(diag::err_drv_hlsl_unsupported_target) << T.str();`
`4209`	`4212`	`}`
`4210`	`4213`
	`4214`	`+ // GPURelocatableDeviceCode should be true for SYCL if not specified.`
	`4215`	`+ if (Args.hasArg(OPT_fsycl_is_device) \|\| Args.hasArg(OPT_fsycl_is_host))`
	`4216`	`+ Opts.GPURelocatableDeviceCode = Args.hasFlag(`
	`4217`	`+ options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, /*default=*/true);`
	`4218`	`+`
`4211`	`4219`	`return Diags.getNumErrors() == NumErrorsBefore;`
`4212`	`4220`	`}`
`4213`	`4221`