@@ -32,6 +32,7 @@ import { TokenUsageTracker } from '../util/tokenUsage';
 import { accumulateTokenUsage, createEmptyTokenUsage } from '../util/tokenUsageUtils';
 import { filterProviders } from './eval/filterProviders';
 import { filterTests } from './eval/filterTests';
+import { getErrorResultIds, deleteErrorResults, recalculatePromptMetrics } from './retry';
 import { notCloudEnabledShareInstructions } from './share';
 import type { Command } from 'commander';
 
@@ -50,6 +51,7 @@ const EvalCommandSchema = CommandLineOptionsSchema.extend({
   interactiveProviders: z.boolean().optional(),
   remote: z.boolean().optional(),
   noShare: z.boolean().optional(),
+  retryErrors: z.boolean().optional(),
   // Allow --resume or --resume <id>
   // TODO(ian): Temporarily disabled to troubleshoot database corruption issues with SIGINT.
   // resume: z.union([z.string(), z.boolean()]).optional(),
@@ -153,9 +155,20 @@ export async function doEval(
     }
   }
 
+  // Check for conflicting options
+  const resumeRaw = (cmdObj as any).resume as string | boolean | undefined;
+  const retryErrors = cmdObj.retryErrors;
+
+  if (resumeRaw && retryErrors) {
+    logger.error(
+      chalk.red('Cannot use --resume and --retry-errors together. Please use one or the other.'),
+    );
+    process.exitCode = 1;
+    return new Eval({}, { persisted: false });
+  }
+
   // If resuming, load config from existing eval and avoid CLI filters that could change indices
   let resumeEval: Eval | undefined;
-  const resumeRaw = (cmdObj as any).resume as string | boolean | undefined;
   const resumeId =
     resumeRaw === true || resumeRaw === undefined ? 'latest' : (resumeRaw as string);
   if (resumeRaw) {
@@ -194,6 +207,72 @@ export async function doEval(
         }) as any,
       );
     }
+    // Mark resume mode in CLI state so evaluator can skip completed work
+    cliState.resume = true;
+  } else if (retryErrors) {
+    // Check if --no-write is set with --retry-errors
+    if (cmdObj.write === false) {
+      logger.error(
+        chalk.red(
+          'Cannot use --retry-errors with --no-write. Retry functionality requires database persistence.',
+        ),
+      );
+      process.exitCode = 1;
+      return new Eval({}, { persisted: false });
+    }
+
+    logger.info('🔄 Retrying ERROR results from latest evaluation...');
+
+    // Find the latest evaluation
+    const latestEval = await Eval.latest();
+    if (!latestEval) {
+      logger.error('No previous evaluation found to retry errors from');
+      process.exitCode = 1;
+      return new Eval({}, { persisted: false });
+    }
+
+    // Get all ERROR result IDs
+    const errorResultIds = await getErrorResultIds(latestEval.id);
+    if (errorResultIds.length === 0) {
+      logger.info('✅ No ERROR results found in the latest evaluation');
+      return latestEval;
+    }
+
+    logger.info(`Found ${errorResultIds.length} ERROR results to retry`);
+
+    // Delete the ERROR results so they will be re-evaluated when we run with resume
+    await deleteErrorResults(errorResultIds);
+
+    // Recalculate prompt metrics after deleting ERROR results to avoid double-counting
+    await recalculatePromptMetrics(latestEval);
+
+    logger.info(
+      `🔄 Running evaluation with resume mode to retry ${errorResultIds.length} test cases...`,
+    );
+
+    // Set up for resume mode
+    resumeEval = latestEval;
+
+    // Use the saved config as our base to ensure identical test ordering
+    ({
+      config,
+      testSuite,
+      basePath: _basePath,
+      commandLineOptions,
+    } = await resolveConfigs({}, resumeEval.config));
+
+    // Ensure prompts exactly match the previous run to preserve IDs and content
+    if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) {
+      testSuite.prompts = resumeEval.prompts.map(
+        (p) =>
+          ({
+            raw: p.raw,
+            label: p.label,
+            config: p.config,
+          }) as any,
+      );
+    }
+
     // Mark resume mode in CLI state so evaluator can skip completed work
     cliState.resume = true;
   } else {
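(Aside: the three './retry' helpers used above are imported but not defined in this diff. The sketch below is a guess at their shape, not the actual module -- the eval_results table, its eval_id / failure_reason / prompt_idx columns, the numeric ERROR enum value, the database path, and the save() method on the eval record are all assumptions. The functions are declared async only to match the await call sites above, even though better-sqlite3 is synchronous. The mechanism is the one the diff relies on: resume mode skips test cases that already have persisted results, so deleting exactly the ERROR rows makes exactly those cases run again.)

// retry.ts -- hypothetical sketch, not promptfoo's real implementation.
import Database from 'better-sqlite3';

const DB_PATH = '/tmp/promptfoo-example.db'; // assumed; the real path is resolved internally
const ERROR_REASON = 2; // assumed numeric enum value marking ERROR results

const db = new Database(DB_PATH);

// IDs of every result in the given eval that finished in the ERROR state.
export async function getErrorResultIds(evalId: string): Promise<string[]> {
  const rows = db
    .prepare('SELECT id FROM eval_results WHERE eval_id = ? AND failure_reason = ?')
    .all(evalId, ERROR_REASON) as { id: string }[];
  return rows.map((row) => row.id);
}

// Delete those rows so a resume-mode run treats the test cases as not yet done.
export async function deleteErrorResults(resultIds: string[]): Promise<void> {
  if (resultIds.length === 0) {
    return;
  }
  const placeholders = resultIds.map(() => '?').join(', ');
  db.prepare(`DELETE FROM eval_results WHERE id IN (${placeholders})`).run(...resultIds);
}

// Re-derive per-prompt pass/fail/error counts from the rows that survived deletion.
export async function recalculatePromptMetrics(evalRecord: {
  id: string;
  prompts: Array<{ metrics?: Record<string, number> }>;
  save: () => Promise<void>;
}): Promise<void> {
  evalRecord.prompts.forEach((prompt, promptIdx) => {
    const counts = db
      .prepare(
        `SELECT
           COALESCE(SUM(success = 1), 0) AS passes,
           COALESCE(SUM(success = 0 AND failure_reason != ?), 0) AS fails,
           COALESCE(SUM(failure_reason = ?), 0) AS errors
         FROM eval_results
         WHERE eval_id = ? AND prompt_idx = ?`,
      )
      .get(ERROR_REASON, ERROR_REASON, evalRecord.id, promptIdx) as {
      passes: number;
      fails: number;
      errors: number;
    };
    prompt.metrics = {
      ...prompt.metrics,
      testPassCount: counts.passes,
      testFailCount: counts.fails,
      testErrorCount: counts.errors,
    };
  });
  await evalRecord.save();
}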
@@ -290,7 +369,7 @@ export async function doEval(
   }
 
   // Apply filtering only when not resuming, to preserve test indices
-  if (!resumeRaw) {
+  if (!resumeEval) {
     const filterOptions: FilterOptions = {
       failing: cmdObj.filterFailing,
       errorsOnly: cmdObj.filterErrorsOnly,
@@ -313,7 +392,7 @@
     await checkEmailStatusOrExit();
   }
 
-  if (!resumeRaw) {
+  if (!resumeEval) {
     testSuite.providers = filterProviders(
       testSuite.providers,
       cmdObj.filterProviders || cmdObj.filterTargets,
@@ -338,22 +417,22 @@
     cache,
   };
 
-  if (!resumeRaw && cmdObj.grader) {
+  if (!resumeEval && cmdObj.grader) {
     if (typeof testSuite.defaultTest === 'string') {
       testSuite.defaultTest = {};
     }
     testSuite.defaultTest = testSuite.defaultTest || {};
     testSuite.defaultTest.options = testSuite.defaultTest.options || {};
     testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader);
   }
-  if (!resumeRaw && cmdObj.var) {
+  if (!resumeEval && cmdObj.var) {
     if (typeof testSuite.defaultTest === 'string') {
       testSuite.defaultTest = {};
     }
     testSuite.defaultTest = testSuite.defaultTest || {};
     testSuite.defaultTest.vars = { ...testSuite.defaultTest.vars, ...cmdObj.var };
   }
-  if (!resumeRaw && cmdObj.generateSuggestions) {
+  if (!resumeEval && cmdObj.generateSuggestions) {
     options.generateSuggestions = true;
   }
   // load scenarios or tests from an external file
@@ -696,7 +775,7 @@
     isRedteam,
   });
 
-  if (cmdObj.watch && !resumeRaw) {
+  if (cmdObj.watch && !resumeEval) {
     if (initialization) {
       const configPaths = (cmdObj.config || [defaultConfigPath]).filter(Boolean) as string[];
       if (!configPaths.length) {
@@ -922,6 +1001,7 @@ export function evalCommand(
       '--resume [evalId]',
       'Resume a paused/incomplete evaluation. Defaults to latest when omitted',
     )
+    .option('--retry-errors', 'Retry all ERROR results from the latest evaluation')
     .option(
       '--no-write',
       'Do not write results to promptfoo directory',
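Taken together: promptfoo eval --retry-errors looks up the latest evaluation, deletes its ERROR results, recalculates per-prompt metrics so the deleted rows are not double-counted, and re-runs in resume mode so only those test cases execute. The two guard rails above reject --resume (the two modes are mutually exclusive) and --no-write (retry depends on persisted results).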