@@ -25,6 +25,46 @@ function getLineCountCommand(): { command: string; tool: string } {
2525 }
2626}
2727
/**
 * Returns a command string with deliberately malformed syntax for the
 * current `shell`, used to exercise the failure path of run_shell_command.
 *
 * PowerShell and cmd get a doubled pipe; every other shell (bash included)
 * gets a split redirection operator.
 */
function getInvalidCommand(): string {
  if (shell === 'powershell') {
    return `Get-ChildItem | | Select-Object`;
  }
  if (shell === 'cmd') {
    return `dir | | findstr foo`;
  }
  // 'bash' and any unrecognised shell share the same fallback.
  return `echo "hello" > > file`;
}
39+
/**
 * Returns the directory-listing command for the current `shell`. The tests
 * place this command on the `--allowed-tools` allowlist.
 *
 * Any shell other than PowerShell or cmd (bash included) maps to `ls`.
 */
function getAllowedListCommand(): string {
  const listCommand =
    shell === 'powershell' ? 'Get-ChildItem' : shell === 'cmd' ? 'dir' : 'ls';
  return listCommand;
}
51+
/**
 * Builds a file-read command for the current `shell` that is NOT the
 * command returned by getAllowedListCommand, so it can be used to probe
 * allowlist enforcement.
 *
 * @param testFile Path of the file the command should read. It is wrapped
 *   in double quotes with no padding so the quoted path is exactly the
 *   path that was passed in.
 * @returns `command`, the full command line (`<tool> "<testFile>"`), and
 *   `tool`, the bare executable/cmdlet name used in that command.
 */
function getDisallowedFileReadCommand(testFile: string): {
  command: string;
  tool: string;
} {
  // Pick the tool once and derive the command from it so the two fields
  // can never disagree.
  let tool: string;
  switch (shell) {
    case 'powershell':
      tool = 'Get-Content';
      break;
    case 'cmd':
      tool = 'type';
      break;
    case 'bash':
    default:
      tool = 'cat';
      break;
  }

  // No whitespace inside the quotes: a padded path would name a different
  // (non-existent) file.
  const quotedPath = `"${testFile}"`;
  return { command: `${tool} ${quotedPath}`, tool };
}
67+
2868describe ( 'run_shell_command' , ( ) => {
2969 it ( 'should be able to run a shell command' , async ( ) => {
3070 const rig = new TestRig ( ) ;
@@ -106,8 +146,17 @@ describe('run_shell_command', () => {
106146 const foundToolCall = await rig . waitForToolCall ( 'run_shell_command' , 15000 ) ;
107147
108148 if ( ! foundToolCall ) {
149+ const toolLogs = rig . readToolLogs ( ) . map ( ( { toolRequest } ) => ( {
150+ name : toolRequest . name ,
151+ success : toolRequest . success ,
152+ args : toolRequest . args ,
153+ } ) ) ;
109154 printDebugInfo ( rig , result , {
110155 'Found tool call' : foundToolCall ,
156+ 'Allowed tools flag' : `run_shell_command(${ tool } )` ,
157+ Prompt : prompt ,
158+ 'Tool logs' : toolLogs ,
159+ Result : result ,
111160 } ) ;
112161 }
113162
@@ -214,8 +263,17 @@ describe('run_shell_command', () => {
214263 const foundToolCall = await rig . waitForToolCall ( 'run_shell_command' , 15000 ) ;
215264
216265 if ( ! foundToolCall ) {
266+ const toolLogs = rig . readToolLogs ( ) . map ( ( { toolRequest } ) => ( {
267+ name : toolRequest . name ,
268+ success : toolRequest . success ,
269+ args : toolRequest . args ,
270+ } ) ) ;
217271 printDebugInfo ( rig , result , {
218272 'Found tool call' : foundToolCall ,
273+ 'Allowed tools flag' : `ShellTool(${ tool } )` ,
274+ Prompt : prompt ,
275+ 'Tool logs' : toolLogs ,
276+ Result : result ,
219277 } ) ;
220278 }
221279
@@ -284,6 +342,73 @@ describe('run_shell_command', () => {
284342 }
285343 } ) ;
286344
// Allowlists only the platform's listing command, asks the model to run a
// file-read command instead, and then checks that (a) the model reports
// FAIL and (b) the tool logs contain a run_shell_command call for the
// disallowed tool whose `success` flag is false.
it('should reject commands not on the allowlist', async () => {
  const rig = new TestRig();
  await rig.setup('should reject commands not on the allowlist');

  const testFile = rig.createFile('test.txt', 'Disallowed command check\n');
  const allowedCommand = getAllowedListCommand();
  const disallowed = getDisallowedFileReadCommand(testFile);

  // Tool-call args are matched case-insensitively against the tool name;
  // lowercase the needle once and share the predicate.
  const disallowedToolNeedle = disallowed.tool.toLowerCase();
  const mentionsDisallowedTool = (args: string): boolean =>
    args.toLowerCase().includes(disallowedToolNeedle);

  const prompt =
    `I am testing the allowed tools configuration. ` +
    `Attempt to run "${disallowed.command}" to read the contents of ${testFile}. ` +
    `If the command fails because it is not permitted, respond with the single word FAIL. ` +
    `If it succeeds, respond with SUCCESS.`;

  const result = await rig.run(
    {
      stdin: prompt,
      yolo: false,
    },
    // No padding inside the parentheses: the allowlist entry must be the
    // bare command name.
    `--allowed-tools=run_shell_command(${allowedCommand})`,
  );

  // Use the same case-sensitive check as the assertion below so that any
  // failing run also prints its diagnostics.
  if (!result.includes('FAIL')) {
    printDebugInfo(rig, result, {
      Result: result,
      AllowedCommand: allowedCommand,
      DisallowedCommand: disallowed.command,
    });
  }
  expect(result).toContain('FAIL');

  const foundToolCall = await rig.waitForToolCall(
    'run_shell_command',
    15000,
    (args) => mentionsDisallowedTool(args),
  );

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      ToolLogs: rig.readToolLogs(),
    });
  }
  expect(foundToolCall).toBe(true);

  const toolLogs = rig
    .readToolLogs()
    .filter((toolLog) => toolLog.toolRequest.name === 'run_shell_command');
  const failureLog = toolLogs.find((toolLog) =>
    mentionsDisallowedTool(toolLog.toolRequest.args),
  );

  if (!failureLog || failureLog.toolRequest.success) {
    printDebugInfo(rig, result, {
      ToolLogs: toolLogs,
      DisallowedTool: disallowed.tool,
    });
  }

  expect(
    failureLog,
    'Expected failing run_shell_command invocation',
  ).toBeTruthy();
  expect(failureLog!.toolRequest.success).toBe(false);
});
411+
287412 it ( 'should allow all with "ShellTool" and other specific tools' , async ( ) => {
288413 const rig = new TestRig ( ) ;
289414 await rig . setup (
@@ -390,4 +515,53 @@ describe('run_shell_command', () => {
390515 validateModelOutput ( result , fileName , 'Platform-specific listing test' ) ;
391516 expect ( result ) . toContain ( fileName ) ;
392517 } ) ;
518+
// Asks the model to run a syntactically invalid command for the current
// shell and checks that (a) the model reports FAIL and (b) the tool logs
// contain a run_shell_command call carrying that command whose `success`
// flag is false.
it('rejects invalid shell expressions', async () => {
  const rig = new TestRig();
  await rig.setup('rejects invalid shell expressions');
  const invalidCommand = getInvalidCommand();
  const result = await rig.run(
    `I am testing the error handling of the run_shell_command tool. Please attempt to run the following command, which I know has invalid syntax: \`${invalidCommand}\`. If the command fails as expected, please return the word FAIL, otherwise return the word SUCCESS.`,
  );

  // Print diagnostics before asserting, mirroring the allowlist test, so a
  // wrong model answer is debuggable from the test output.
  if (!result.includes('FAIL')) {
    printDebugInfo(rig, result, {
      Result: result,
      InvalidCommand: invalidCommand,
    });
  }
  expect(result).toContain('FAIL');

  // The logged args are searched as text; JSON-escape the command (and drop
  // the surrounding quotes) so embedded double quotes match their escaped
  // form.
  const escapedInvalidCommand = JSON.stringify(invalidCommand).slice(1, -1);
  const escapedNeedle = escapedInvalidCommand.toLowerCase();
  const mentionsInvalidCommand = (args: string): boolean =>
    args.toLowerCase().includes(escapedNeedle);

  const foundToolCall = await rig.waitForToolCall(
    'run_shell_command',
    15000,
    (args) => mentionsInvalidCommand(args),
  );

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      EscapedCommand: escapedInvalidCommand,
      ToolLogs: rig.readToolLogs(),
    });
  }
  expect(foundToolCall).toBe(true);

  const toolLogs = rig
    .readToolLogs()
    .filter((toolLog) => toolLog.toolRequest.name === 'run_shell_command');
  const failureLog = toolLogs.find((toolLog) =>
    mentionsInvalidCommand(toolLog.toolRequest.args),
  );

  if (!failureLog || failureLog.toolRequest.success) {
    printDebugInfo(rig, result, {
      ToolLogs: toolLogs,
      EscapedCommand: escapedInvalidCommand,
    });
  }

  expect(
    failureLog,
    'Expected failing run_shell_command invocation for invalid syntax',
  ).toBeTruthy();
  expect(failureLog!.toolRequest.success).toBe(false);
});
393567} ) ;
0 commit comments