diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index 859aa96e50903..6141d69546596 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -270,14 +270,18 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
 
     if options.output_markdown:
         markdown_content = generate_markdown(
-            this_name, chart_data, options.output_markdown
+            this_name, chart_data, failures, options.output_markdown
         )
 
-        with open("benchmark_results.md", "w") as file:
+        md_path = options.output_directory
+        if options.output_directory is None:
+            md_path = os.getcwd()
+
+        with open(os.path.join(md_path, "benchmark_results.md"), "w") as file:
             file.write(markdown_content)
 
         print(
-            f"Markdown with benchmark results has been written to {os.getcwd()}/benchmark_results.md"
+            f"Markdown with benchmark results has been written to {md_path}/benchmark_results.md"
         )
 
     saved_name = save_name if save_name is not None else this_name
@@ -381,12 +385,6 @@ def validate_and_parse_env_args(env_args):
         help="Regex pattern to filter benchmarks by name.",
         default=None,
     )
-    parser.add_argument(
-        "--epsilon",
-        type=float,
-        help="Threshold to consider change of performance significant",
-        default=options.epsilon,
-    )
     parser.add_argument(
         "--verbose", help="Print output of all the commands.", action="store_true"
     )
@@ -415,6 +413,12 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument(
         "--output-html", help="Create HTML output", action="store_true", default=False
     )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        help="Location for output files, if --output-html or --output-markdown was specified.",
+        default=None,
+    )
     parser.add_argument(
         "--dry-run",
         help="Do not run any actual benchmarks",
@@ -480,7 +484,6 @@ def validate_and_parse_env_args(env_args):
     options.sycl = args.sycl
     options.iterations = args.iterations
     options.timeout = args.timeout
-    options.epsilon = args.epsilon
     options.ur = args.ur
     options.ur_adapter = args.adapter
     options.exit_on_failure = args.exit_on_failure
@@ -503,6 +506,11 @@ def validate_and_parse_env_args(env_args):
     if args.compute_runtime is not None:
         options.build_compute_runtime = True
         options.compute_runtime_tag = args.compute_runtime
 
+    if args.output_dir is not None:
+        if not os.path.isdir(args.output_dir):
+            parser.error("Specified --output-dir is not a valid path")
+        options.output_directory = os.path.abspath(args.output_dir)
+
     benchmark_filter = re.compile(args.filter) if args.filter else None
 
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index 78eda7ae3c88e..de87cf19f5c5d 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -31,10 +31,9 @@ class Options:
     compare_max: int = 10  # average/median over how many results
     output_markdown: MarkdownSize = MarkdownSize.SHORT
     output_html: bool = False
+    output_directory: str = None
     dry_run: bool = False
-    # these two should probably be merged into one setting
     stddev_threshold: float = 0.02
-    epsilon: float = 0.02
     iterations_stddev: int = 5
     build_compute_runtime: bool = False
     extra_ld_libraries: list[str] = field(default_factory=list)
diff --git a/devops/scripts/benchmarks/output_markdown.py b/devops/scripts/benchmarks/output_markdown.py
index 84af97fc51adb..3295968603d0c 100644
--- a/devops/scripts/benchmarks/output_markdown.py
+++ b/devops/scripts/benchmarks/output_markdown.py
@@ -79,7 +79,7 @@ def get_improved_regressed_summary(is_improved: bool, rows_count: int):
         "\n<details>\n"
         "<summary>\n"
         f"{title} {rows_count} "
-        f"(threshold {options.epsilon*100:.2f}%)\n"
+        f"(threshold {options.stddev_threshold*100:.2f}%)\n"
         "</summary>\n\n"
     )
 
@@ -138,17 +138,6 @@ def generate_markdown_details(
         env_dict = res.env
         command = res.command
 
-        # If data is collected from already saved results,
-        # the content is parsed as strings
-        if isinstance(res.env, str):
-            # Since the scripts would be used solely on data prepared
-            # by our scripts, this should be safe
-            # However, maybe needs an additional blessing
-            # https://docs.python.org/3/library/ast.html#ast.literal_eval
-            env_dict = ast.literal_eval(res.env)
-        if isinstance(res.command, str):
-            command = ast.literal_eval(res.command)
-
         section = (
             "\n<details>\n"
             f"<summary>{res.label}</summary>\n\n"
@@ -179,7 +168,7 @@ def generate_markdown_details(
     return "\nBenchmark details contain too many chars to display\n"
 
 
-def generate_summary_table_and_chart(
+def generate_summary_table(
    chart_data: dict[str, list[Result]], baseline_name: str, markdown_size: MarkdownSize
 ):
     summary_table = get_chart_markdown_header(
@@ -276,7 +265,7 @@ def generate_summary_table_and_chart(
                 delta = oln.diff - 1
                 oln.row += f" {delta*100:.2f}%"
-                if abs(delta) > options.epsilon:
+                if abs(delta) > options.stddev_threshold:
                     if delta > 0:
                         improved_rows.append(oln.row + " | \n")
                     else:
                         regressed_rows.append(oln.row + " | \n")
@@ -374,10 +363,27 @@ def generate_summary_table_and_chart(
     return "\n# Summary\n" "Benchmark output is too large to display\n\n"
 
 
+def generate_failures_section(failures: dict[str, str]) -> str:
+    if not failures:
+        return ""
+
+    section = "\n# Failures\n"
+    section += "| Name | Failure |\n"
+    section += "|---|---|\n"
+
+    for name, failure in failures.items():
+        section += f"| {name} | {failure} |\n"
+
+    return section
+
+
 def generate_markdown(
-    name: str, chart_data: dict[str, list[Result]], markdown_size: MarkdownSize
+    name: str,
+    chart_data: dict[str, list[Result]],
+    failures: dict[str, str],
+    markdown_size: MarkdownSize,
 ):
-    (summary_line, summary_table) = generate_summary_table_and_chart(
+    (summary_line, summary_table) = generate_summary_table(
         chart_data, name, markdown_size
     )
 
@@ -396,4 +402,6 @@ def generate_markdown(
         )
 
     generated_markdown += "\n# Details\n" f"{markdown_details}\n"
-    return generated_markdown
+    failures_section = generate_failures_section(failures)
+
+    return failures_section + generated_markdown