@@ -32,6 +32,7 @@ import { TokenUsageTracker } from '../util/tokenUsage';
 import { accumulateTokenUsage, createEmptyTokenUsage } from '../util/tokenUsageUtils';
 import { filterProviders } from './eval/filterProviders';
 import { filterTests } from './eval/filterTests';
+import { getErrorResultIds, deleteErrorResults, recalculatePromptMetrics } from './retry';
 import { notCloudEnabledShareInstructions } from './share';
 import type { Command } from 'commander';
 
@@ -50,6 +51,7 @@ const EvalCommandSchema = CommandLineOptionsSchema.extend({
   interactiveProviders: z.boolean().optional(),
   remote: z.boolean().optional(),
   noShare: z.boolean().optional(),
+  retryErrors: z.boolean().optional(),
   // Allow --resume or --resume <id>
   // TODO(ian): Temporarily disabled to troubleshoot database corruption issues with SIGINT.
   // resume: z.union([z.string(), z.boolean()]).optional(),
@@ -153,9 +155,20 @@ export async function doEval(
     }
   }
 
+  // Check for conflicting options
+  const resumeRaw = (cmdObj as any).resume as string | boolean | undefined;
+  const retryErrors = cmdObj.retryErrors;
+
+  if (resumeRaw && retryErrors) {
+    logger.error(
+      chalk.red('Cannot use --resume and --retry-errors together. Please use one or the other.'),
+    );
+    process.exitCode = 1;
+    return new Eval({}, { persisted: false });
+  }
+
   // If resuming, load config from existing eval and avoid CLI filters that could change indices
   let resumeEval: Eval | undefined;
-  const resumeRaw = (cmdObj as any).resume as string | boolean | undefined;
   const resumeId =
     resumeRaw === true || resumeRaw === undefined ? 'latest' : (resumeRaw as string);
   if (resumeRaw) {
@@ -194,6 +207,72 @@ export async function doEval(
         }) as any,
       );
     }
+    // Mark resume mode in CLI state so evaluator can skip completed work
+    cliState.resume = true;
+  } else if (retryErrors) {
+    // Check if --no-write is set with --retry-errors
+    if (cmdObj.write === false) {
+      logger.error(
+        chalk.red(
+          'Cannot use --retry-errors with --no-write. Retry functionality requires database persistence.',
+        ),
+      );
+      process.exitCode = 1;
+      return new Eval({}, { persisted: false });
+    }
+
+    logger.info('🔄 Retrying ERROR results from latest evaluation...');
+
+    // Find the latest evaluation
+    const latestEval = await Eval.latest();
+    if (!latestEval) {
+      logger.error('No previous evaluation found to retry errors from');
+      process.exitCode = 1;
+      return new Eval({}, { persisted: false });
+    }
+
+    // Get all ERROR result IDs
+    const errorResultIds = await getErrorResultIds(latestEval.id);
+    if (errorResultIds.length === 0) {
+      logger.info('✅ No ERROR results found in the latest evaluation');
+      return latestEval;
+    }
+
+    logger.info(`Found ${errorResultIds.length} ERROR results to retry`);
+
+    // Delete the ERROR results so they will be re-evaluated when we run with resume
+    await deleteErrorResults(errorResultIds);
+
+    // Recalculate prompt metrics after deleting ERROR results to avoid double-counting
+    await recalculatePromptMetrics(latestEval);
+
+    logger.info(
+      `🔄 Running evaluation with resume mode to retry ${errorResultIds.length} test cases...`,
+    );
+
+    // Set up for resume mode
+    resumeEval = latestEval;
+
+    // Use the saved config as our base to ensure identical test ordering
+    ({
+      config,
+      testSuite,
+      basePath: _basePath,
+      commandLineOptions,
+    } = await resolveConfigs({}, resumeEval.config));
+
+    // Ensure prompts exactly match the previous run to preserve IDs and content
+    if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) {
+      testSuite.prompts = resumeEval.prompts.map(
+        (p) =>
+          ({
+            raw: p.raw,
+            label: p.label,
+            config: p.config,
+          }) as any,
+      );
+    }
+
     // Mark resume mode in CLI state so evaluator can skip completed work
     cliState.resume = true;
   } else {
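(Aside: the three './retry' helpers used above are imported but not defined in this diff. The sketch below is a guess at their shape, not the actual module -- the eval_results table, its eval_id / failure_reason / prompt_idx columns, the numeric ERROR enum value, the database path, and the save() method on the eval record are all assumptions. The functions are declared async only to match the await call sites above, even though better-sqlite3 is synchronous. The mechanism is the one the diff relies on: resume mode skips test cases that already have persisted results, so deleting exactly the ERROR rows makes exactly those cases run again.)

// retry.ts -- hypothetical sketch, not promptfoo's real implementation.
import Database from 'better-sqlite3';

const DB_PATH = '/tmp/promptfoo-example.db'; // assumed; the real path is resolved internally
const ERROR_REASON = 2; // assumed numeric enum value marking ERROR results

const db = new Database(DB_PATH);

// IDs of every result in the given eval that finished in the ERROR state.
export async function getErrorResultIds(evalId: string): Promise<string[]> {
  const rows = db
    .prepare('SELECT id FROM eval_results WHERE eval_id = ? AND failure_reason = ?')
    .all(evalId, ERROR_REASON) as { id: string }[];
  return rows.map((row) => row.id);
}

// Delete those rows so a resume-mode run treats the test cases as not yet done.
export async function deleteErrorResults(resultIds: string[]): Promise<void> {
  if (resultIds.length === 0) {
    return;
  }
  const placeholders = resultIds.map(() => '?').join(', ');
  db.prepare(`DELETE FROM eval_results WHERE id IN (${placeholders})`).run(...resultIds);
}

// Re-derive per-prompt pass/fail/error counts from the rows that survived deletion.
export async function recalculatePromptMetrics(evalRecord: {
  id: string;
  prompts: Array<{ metrics?: Record<string, number> }>;
  save: () => Promise<void>;
}): Promise<void> {
  evalRecord.prompts.forEach((prompt, promptIdx) => {
    const counts = db
      .prepare(
        `SELECT
           COALESCE(SUM(success = 1), 0) AS passes,
           COALESCE(SUM(success = 0 AND failure_reason != ?), 0) AS fails,
           COALESCE(SUM(failure_reason = ?), 0) AS errors
         FROM eval_results
         WHERE eval_id = ? AND prompt_idx = ?`,
      )
      .get(ERROR_REASON, ERROR_REASON, evalRecord.id, promptIdx) as {
      passes: number;
      fails: number;
      errors: number;
    };
    prompt.metrics = {
      ...prompt.metrics,
      testPassCount: counts.passes,
      testFailCount: counts.fails,
      testErrorCount: counts.errors,
    };
  });
  await evalRecord.save();
}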
@@ -290,7 +369,7 @@ export async function doEval(
   }
 
   // Apply filtering only when not resuming, to preserve test indices
-  if (!resumeRaw) {
+  if (!resumeEval) {
     const filterOptions: FilterOptions = {
       failing: cmdObj.filterFailing,
       errorsOnly: cmdObj.filterErrorsOnly,
@@ -313,7 +392,7 @@
     await checkEmailStatusOrExit();
   }
 
-  if (!resumeRaw) {
+  if (!resumeEval) {
     testSuite.providers = filterProviders(
       testSuite.providers,
       cmdObj.filterProviders || cmdObj.filterTargets,
@@ -338,22 +417,22 @@
     cache,
   };
 
-  if (!resumeRaw && cmdObj.grader) {
+  if (!resumeEval && cmdObj.grader) {
     if (typeof testSuite.defaultTest === 'string') {
       testSuite.defaultTest = {};
     }
     testSuite.defaultTest = testSuite.defaultTest || {};
     testSuite.defaultTest.options = testSuite.defaultTest.options || {};
     testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader);
   }
-  if (!resumeRaw && cmdObj.var) {
+  if (!resumeEval && cmdObj.var) {
     if (typeof testSuite.defaultTest === 'string') {
       testSuite.defaultTest = {};
     }
     testSuite.defaultTest = testSuite.defaultTest || {};
     testSuite.defaultTest.vars = { ...testSuite.defaultTest.vars, ...cmdObj.var };
   }
-  if (!resumeRaw && cmdObj.generateSuggestions) {
+  if (!resumeEval && cmdObj.generateSuggestions) {
     options.generateSuggestions = true;
   }
   // load scenarios or tests from an external file
@@ -696,7 +775,7 @@
     isRedteam,
   });
 
-  if (cmdObj.watch && !resumeRaw) {
+  if (cmdObj.watch && !resumeEval) {
     if (initialization) {
       const configPaths = (cmdObj.config || [defaultConfigPath]).filter(Boolean) as string[];
       if (!configPaths.length) {
@@ -922,6 +1001,7 @@ export function evalCommand(
       '--resume [evalId]',
       'Resume a paused/incomplete evaluation. Defaults to latest when omitted',
     )
+    .option('--retry-errors', 'Retry all ERROR results from the latest evaluation')
     .option(
       '--no-write',
       'Do not write results to promptfoo directory',
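Taken together: promptfoo eval --retry-errors looks up the latest evaluation, deletes its ERROR results, recalculates per-prompt metrics so the deleted rows are not double-counted, and re-runs in resume mode so only those test cases execute. The two guard rails above reject --resume (the two modes are mutually exclusive) and --no-write (retry depends on persisted results).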