Commit ab6ae2b

feat: ability to retry errors from cli (#5647)
1 parent c0652e5 commit ab6ae2b

File tree: 9 files changed, +470 -7 lines

examples/retry-testing/README.md

Lines changed: 3 additions & 0 deletions
```md
# Retry Testing Example

This example creates a test harness with a high error rate (~50%) to test the retry functionality using `promptfoo eval --retry-errors`.
```
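
A typical run of this example is a two-step loop: evaluate, then retry the failures. The sketch below assumes promptfoo is installed and that the example's config file uses the default name `promptfooconfig.yaml` (the actual filename is not shown on this page):

```sh
cd examples/retry-testing

# First pass: roughly half of the provider calls throw simulated errors
promptfoo eval

# Second pass: re-run only the test cases that ended in ERROR
promptfoo eval --retry-errors
```
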
examples/retry-testing/errorProvider.js

Lines changed: 42 additions & 0 deletions

```js
const promptfoo = require('../../dist/src/index.js').default;

class ErrorProneProvider {
  constructor(options) {
    this.providerId = options.id || 'error-prone-provider';
    this.config = options.config || {};
  }

  id() {
    return this.providerId;
  }

  async callApi(prompt) {
    console.log(`[ErrorProneProvider] Processing prompt: ${prompt}`);

    // Throw error based on probability (50% chance)
    if (Math.random() < 0.5) {
      throw new Error('API request failed: Simulated error for testing retry logic');
    }

    // Success cases - return a mock response
    const responses = [
      `Here's information about the topic you requested.`,
      `This is a successful response for your query.`,
      `I can help explain this concept clearly.`,
      `This provider successfully processed your request.`,
    ];

    const randomResponse = responses[Math.floor(Math.random() * responses.length)];

    return {
      output: randomResponse,
      tokenUsage: {
        total: 50,
        prompt: 20,
        completion: 30,
      },
    };
  }
}

module.exports = ErrorProneProvider;
```
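
Because the failure path is just `Math.random() < 0.5`, the provider can be exercised on its own to confirm the ~50% error rate. A minimal standalone check (illustrative, not part of this commit; it assumes the repository has been built so that the `dist/src/index.js` the provider requires at load time exists):

```js
// check-error-rate.js -- illustrative harness, not part of this commit.
const ErrorProneProvider = require('./errorProvider.js');

async function main() {
  const provider = new ErrorProneProvider({ id: 'error-prone-provider' });
  const trials = 200;
  let errors = 0;
  for (let i = 0; i < trials; i++) {
    try {
      // callApi also logs each prompt, so expect verbose output
      await provider.callApi(`Tell me about retries (${i})`);
    } catch {
      errors++;
    }
  }
  console.log(`Observed error rate: ${((errors / trials) * 100).toFixed(1)}%`);
}

main();
```
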
Lines changed: 51 additions & 0 deletions

```yaml
# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json

description: 'Retry Testing - High Error Rate Example (Offline Mode)'

prompts:
  - 'Tell me about {{topic}}'

providers:
  - id: file://errorProvider.js
    label: 'Error-prone provider'

tests:
  # Success cases (20% success rate)
  - vars:
      topic: 'success case 1'
    assert:
      - type: not-equals
        value: 'SHOULD_NEVER_MATCH'

  - vars:
      topic: 'success case 2'
    assert:
      - type: not-equals
        value: 'SHOULD_NEVER_MATCH'

  # Error cases (80% error rate)
  - vars:
      topic: 'ERROR_NETWORK_FAILURE'

  - vars:
      topic: 'ERROR_API_LIMIT'

  - vars:
      topic: 'ERROR_CONNECTION_REFUSED'

  - vars:
      topic: 'ERROR_AUTHENTICATION_FAILED'

  - vars:
      topic: 'ERROR_SERVICE_UNAVAILABLE'

  - vars:
      topic: 'ERROR_TIMEOUT_EXCEEDED'

  - vars:
      topic: 'ERROR_PARSE_FAILURE'

  - vars:
      topic: 'ERROR_RATE_LIMITED'

outputPath: retry-test-results-offline.json
```

site/docs/usage/command-line.md

Lines changed: 11 additions & 0 deletions
````diff
@@ -85,6 +85,7 @@ By default the `eval` command will read the `promptfooconfig.yaml` configuration
 | `--no-table` | Do not output table in CLI |
 | `--no-write` | Do not write results to promptfoo directory |
 | `--resume [evalId]` | Resume a paused/incomplete evaluation. If `evalId` is omitted, resumes latest |
+| `--retry-errors` | Retry all ERROR results from the latest evaluation |
 | `-o, --output <paths...>` | Path(s) to output file (csv, txt, json, jsonl, yaml, yml, html, xml) |
 | `-p, --prompts <paths...>` | Paths to prompt files (.txt) |
 | `--prompt-prefix <path>` | Prefix prepended to every prompt |
@@ -113,6 +114,16 @@ promptfoo eval --resume <evalId> # resumes a specific evaluation
 
 - On resume, promptfoo reuses the original run's effective runtime options (e.g., `--delay`, `--no-cache`, `--max-concurrency`, `--repeat`), skips completed test/prompt pairs, ignores CLI flags that change test ordering to keep indices aligned, and disables watch mode.
 
+### Retry Errors
+
+```sh
+promptfoo eval --retry-errors # retries all ERROR results from the latest evaluation
+```
+
+- The retry errors feature automatically finds ERROR results from the latest evaluation, removes them from the database, and re-runs only those test cases. This is useful when evaluations fail due to temporary network issues, rate limits, or API errors.
+- Cannot be used together with the `--resume` or `--no-write` flags.
+- Uses the original evaluation's configuration and runtime options to ensure consistency.
+
 ## `promptfoo init [directory]`
 
 Initialize a new project with dummy files.
````
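
In practice the documented flags compose into a simple recover-and-verify loop. A sketch (the sequence is illustrative; the flags are the ones documented above):

```sh
promptfoo eval                 # full run; some cases may end in ERROR
promptfoo eval --retry-errors  # re-run only the ERROR cases from that run
promptfoo eval --retry-errors  # repeat until no ERROR results remain
```
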

site/static/config-schema.json

Lines changed: 3 additions & 0 deletions
```diff
@@ -1853,6 +1853,9 @@
         "promptSuffix": {
           "type": "string"
         },
+        "retryErrors": {
+          "type": "boolean"
+        },
         "envPath": {
           "type": "string"
         }
```

src/commands/eval.ts

Lines changed: 87 additions & 7 deletions
```diff
@@ -32,6 +32,7 @@ import { TokenUsageTracker } from '../util/tokenUsage';
 import { accumulateTokenUsage, createEmptyTokenUsage } from '../util/tokenUsageUtils';
 import { filterProviders } from './eval/filterProviders';
 import { filterTests } from './eval/filterTests';
+import { getErrorResultIds, deleteErrorResults, recalculatePromptMetrics } from './retry';
 import { notCloudEnabledShareInstructions } from './share';
 import type { Command } from 'commander';
 
@@ -50,6 +51,7 @@ const EvalCommandSchema = CommandLineOptionsSchema.extend({
   interactiveProviders: z.boolean().optional(),
   remote: z.boolean().optional(),
   noShare: z.boolean().optional(),
+  retryErrors: z.boolean().optional(),
   // Allow --resume or --resume <id>
   // TODO(ian): Temporarily disabled to troubleshoot database corruption issues with SIGINT.
   // resume: z.union([z.string(), z.boolean()]).optional(),
@@ -153,9 +155,20 @@
     }
   }
 
+  // Check for conflicting options
+  const resumeRaw = (cmdObj as any).resume as string | boolean | undefined;
+  const retryErrors = cmdObj.retryErrors;
+
+  if (resumeRaw && retryErrors) {
+    logger.error(
+      chalk.red('Cannot use --resume and --retry-errors together. Please use one or the other.'),
+    );
+    process.exitCode = 1;
+    return new Eval({}, { persisted: false });
+  }
+
   // If resuming, load config from existing eval and avoid CLI filters that could change indices
   let resumeEval: Eval | undefined;
-  const resumeRaw = (cmdObj as any).resume as string | boolean | undefined;
   const resumeId =
     resumeRaw === true || resumeRaw === undefined ? 'latest' : (resumeRaw as string);
   if (resumeRaw) {
@@ -194,6 +207,72 @@
           }) as any,
       );
     }
+    // Mark resume mode in CLI state so evaluator can skip completed work
+    cliState.resume = true;
+  } else if (retryErrors) {
+    // Check if --no-write is set with --retry-errors
+    if (cmdObj.write === false) {
+      logger.error(
+        chalk.red(
+          'Cannot use --retry-errors with --no-write. Retry functionality requires database persistence.',
+        ),
+      );
+      process.exitCode = 1;
+      return new Eval({}, { persisted: false });
+    }
+
+    logger.info('🔄 Retrying ERROR results from latest evaluation...');
+
+    // Find the latest evaluation
+    const latestEval = await Eval.latest();
+    if (!latestEval) {
+      logger.error('No previous evaluation found to retry errors from');
+      process.exitCode = 1;
+      return new Eval({}, { persisted: false });
+    }
+
+    // Get all ERROR result IDs
+    const errorResultIds = await getErrorResultIds(latestEval.id);
+    if (errorResultIds.length === 0) {
+      logger.info('✅ No ERROR results found in the latest evaluation');
+      return latestEval;
+    }
+
+    logger.info(`Found ${errorResultIds.length} ERROR results to retry`);
+
+    // Delete the ERROR results so they will be re-evaluated when we run with resume
+    await deleteErrorResults(errorResultIds);
+
+    // Recalculate prompt metrics after deleting ERROR results to avoid double-counting
+    await recalculatePromptMetrics(latestEval);
+
+    logger.info(
+      `🔄 Running evaluation with resume mode to retry ${errorResultIds.length} test cases...`,
+    );
+
+    // Set up for resume mode
+    resumeEval = latestEval;
+
+    // Use the saved config as our base to ensure identical test ordering
+    ({
+      config,
+      testSuite,
+      basePath: _basePath,
+      commandLineOptions,
+    } = await resolveConfigs({}, resumeEval.config));
+
+    // Ensure prompts exactly match the previous run to preserve IDs and content
+    if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) {
+      testSuite.prompts = resumeEval.prompts.map(
+        (p) =>
+          ({
+            raw: p.raw,
+            label: p.label,
+            config: p.config,
+          }) as any,
+      );
+    }
+
     // Mark resume mode in CLI state so evaluator can skip completed work
     cliState.resume = true;
   } else {
@@ -290,7 +369,7 @@
   }
 
   // Apply filtering only when not resuming, to preserve test indices
-  if (!resumeRaw) {
+  if (!resumeEval) {
     const filterOptions: FilterOptions = {
       failing: cmdObj.filterFailing,
       errorsOnly: cmdObj.filterErrorsOnly,
@@ -313,7 +392,7 @@
     await checkEmailStatusOrExit();
   }
 
-  if (!resumeRaw) {
+  if (!resumeEval) {
     testSuite.providers = filterProviders(
       testSuite.providers,
       cmdObj.filterProviders || cmdObj.filterTargets,
@@ -338,22 +417,22 @@
     cache,
   };
 
-  if (!resumeRaw && cmdObj.grader) {
+  if (!resumeEval && cmdObj.grader) {
     if (typeof testSuite.defaultTest === 'string') {
       testSuite.defaultTest = {};
     }
     testSuite.defaultTest = testSuite.defaultTest || {};
     testSuite.defaultTest.options = testSuite.defaultTest.options || {};
     testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader);
   }
-  if (!resumeRaw && cmdObj.var) {
+  if (!resumeEval && cmdObj.var) {
     if (typeof testSuite.defaultTest === 'string') {
       testSuite.defaultTest = {};
     }
     testSuite.defaultTest = testSuite.defaultTest || {};
     testSuite.defaultTest.vars = { ...testSuite.defaultTest.vars, ...cmdObj.var };
   }
-  if (!resumeRaw && cmdObj.generateSuggestions) {
+  if (!resumeEval && cmdObj.generateSuggestions) {
     options.generateSuggestions = true;
   }
   // load scenarios or tests from an external file
@@ -696,7 +775,7 @@
     isRedteam,
   });
 
-  if (cmdObj.watch && !resumeRaw) {
+  if (cmdObj.watch && !resumeEval) {
     if (initialization) {
       const configPaths = (cmdObj.config || [defaultConfigPath]).filter(Boolean) as string[];
       if (!configPaths.length) {
@@ -922,6 +1001,7 @@
       '--resume [evalId]',
       'Resume a paused/incomplete evaluation. Defaults to latest when omitted',
     )
+    .option('--retry-errors', 'Retry all ERROR results from the latest evaluation')
    .option(
       '--no-write',
       'Do not write results to promptfoo directory',
```
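
The `./retry` module imported at the top of this diff is among the nine changed files, but its body is not reproduced on this page. Purely to illustrate the shape of those helpers, here is a hypothetical sketch; the better-sqlite3 store, table, and column names below are stand-ins, not promptfoo's actual schema or data layer:

```ts
// Hypothetical sketch only. The real helpers live in the commit's retry module
// and use promptfoo's own database layer; this schema is invented.
import Database from 'better-sqlite3';

const db = new Database('/tmp/promptfoo-sketch.db'); // stand-in store

// Collect the IDs of all results in an eval whose outcome was ERROR.
export async function getErrorResultIds(evalId: string): Promise<string[]> {
  const rows = db
    .prepare(`SELECT id FROM eval_results WHERE eval_id = ? AND status = 'ERROR'`)
    .all(evalId) as Array<{ id: string }>;
  return rows.map((row) => row.id);
}

// Delete those rows so resume mode treats the test cases as never run.
export async function deleteErrorResults(resultIds: string[]): Promise<void> {
  const del = db.prepare(`DELETE FROM eval_results WHERE id = ?`);
  const apply = db.transaction((ids: string[]) => {
    for (const id of ids) del.run(id);
  });
  apply(resultIds);
}

// recalculatePromptMetrics would then rebuild per-prompt pass/fail/error counts
// from the rows that remain, so deleted ERROR rows are not double-counted.
```

The helpers are awaited in eval.ts, so the sketch keeps async signatures even though better-sqlite3 itself is synchronous.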
