@@ -21,6 +21,46 @@ function getLineCountCommand(): { command: string; tool: string } {
2121 }
2222}
2323
/**
 * Returns a command string that is intentionally malformed for the active
 * shell, for use by tests that expect the shell tool invocation to fail.
 *
 * Reads the `shell` value from the enclosing scope. Any value other than
 * 'powershell' or 'cmd' (including 'bash') yields the POSIX-style form.
 */
function getInvalidCommand(): string {
  if (shell === 'powershell') {
    // Two consecutive pipe tokens with nothing between them.
    return `Get-ChildItem | | Select-Object`;
  }
  if (shell === 'cmd') {
    return `dir | | findstr foo`;
  }
  // 'bash' and every remaining shell value: split redirection operator.
  return `echo "hello" > > file`;
}
35+
/**
 * Returns the directory-listing command name for the active shell.
 *
 * Reads the `shell` value from the enclosing scope. 'powershell' maps to
 * 'Get-ChildItem', 'cmd' maps to 'dir', and every other value (including
 * 'bash') maps to 'ls'.
 */
function getAllowedListCommand(): string {
  return shell === 'powershell'
    ? 'Get-ChildItem'
    : shell === 'cmd'
      ? 'dir'
      : 'ls';
}
47+
/**
 * Builds a file-reading command for the active shell.
 *
 * Reads the `shell` value from the enclosing scope. Any value other than
 * 'powershell' or 'cmd' (including 'bash') yields the `cat` form.
 *
 * @param testFile Path of the file the command should read.
 * @returns `command` — the full command line with the path double-quoted;
 *          `tool` — the bare executable/cmdlet name used in that command.
 */
function getDisallowedFileReadCommand(testFile: string): {
  command: string;
  tool: string;
} {
  // Wrap the path in double quotes with no padding: whitespace inside the
  // quotes would become part of the file name handed to the shell.
  const quotedPath = `"${testFile}"`;
  switch (shell) {
    case 'powershell':
      return { command: `Get-Content ${quotedPath}`, tool: 'Get-Content' };
    case 'cmd':
      return { command: `type ${quotedPath}`, tool: 'type' };
    case 'bash':
    default:
      return { command: `cat ${quotedPath}`, tool: 'cat' };
  }
}
63+
2464describe ( 'run_shell_command' , ( ) => {
2565 it ( 'should be able to run a shell command' , async ( ) => {
2666 const rig = new TestRig ( ) ;
@@ -102,8 +142,17 @@ describe('run_shell_command', () => {
102142 const foundToolCall = await rig . waitForToolCall ( 'run_shell_command' , 15000 ) ;
103143
104144 if ( ! foundToolCall ) {
145+ const toolLogs = rig . readToolLogs ( ) . map ( ( { toolRequest } ) => ( {
146+ name : toolRequest . name ,
147+ success : toolRequest . success ,
148+ args : toolRequest . args ,
149+ } ) ) ;
105150 printDebugInfo ( rig , result , {
106151 'Found tool call' : foundToolCall ,
152+ 'Allowed tools flag' : `run_shell_command(${ tool } )` ,
153+ Prompt : prompt ,
154+ 'Tool logs' : toolLogs ,
155+ Result : result ,
107156 } ) ;
108157 }
109158
@@ -210,8 +259,17 @@ describe('run_shell_command', () => {
210259 const foundToolCall = await rig . waitForToolCall ( 'run_shell_command' , 15000 ) ;
211260
212261 if ( ! foundToolCall ) {
262+ const toolLogs = rig . readToolLogs ( ) . map ( ( { toolRequest } ) => ( {
263+ name : toolRequest . name ,
264+ success : toolRequest . success ,
265+ args : toolRequest . args ,
266+ } ) ) ;
213267 printDebugInfo ( rig , result , {
214268 'Found tool call' : foundToolCall ,
269+ 'Allowed tools flag' : `ShellTool(${ tool } )` ,
270+ Prompt : prompt ,
271+ 'Tool logs' : toolLogs ,
272+ Result : result ,
215273 } ) ;
216274 }
217275
@@ -280,6 +338,73 @@ describe('run_shell_command', () => {
280338 }
281339 } ) ;
282340
// Runs the CLI with an allow-list that contains only the listing command and
// asks the model to run a file-read command instead. The test passes when the
// model reports FAIL and the tool logs contain a matching run_shell_command
// request whose `success` flag is false.
it('should reject commands not on the allowlist', async () => {
  const rig = new TestRig();
  await rig.setup('should reject commands not on the allowlist');

  const testFile = rig.createFile('test.txt', 'Disallowed command check\n');
  const allowedCommand = getAllowedListCommand();
  const disallowed = getDisallowedFileReadCommand(testFile);
  const prompt =
    `I am testing the allowed tools configuration. ` +
    `Attempt to run "${disallowed.command}" to read the contents of ${testFile}. ` +
    `If the command fails because it is not permitted, respond with the single word FAIL. ` +
    `If it succeeds, respond with SUCCESS.`;

  const result = await rig.run(
    {
      stdin: prompt,
      yolo: false,
    },
    // No padding inside the parentheses: the allow-list entry is the bare
    // command name.
    `--allowed-tools=run_shell_command(${allowedCommand})`,
  );

  // Use the same case-sensitive check as the assertion below so that debug
  // output is printed every time the assertion is about to fail.
  if (!result.includes('FAIL')) {
    printDebugInfo(rig, result, {
      Result: result,
      AllowedCommand: allowedCommand,
      DisallowedCommand: disallowed.command,
    });
  }
  expect(result).toContain('FAIL');

  // Tool-call arguments are matched case-insensitively against the tool name.
  const disallowedToolLower = disallowed.tool.toLowerCase();
  const mentionsDisallowedTool = (args: string): boolean =>
    args.toLowerCase().includes(disallowedToolLower);

  const foundToolCall = await rig.waitForToolCall(
    'run_shell_command',
    15000,
    mentionsDisallowedTool,
  );

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      ToolLogs: rig.readToolLogs(),
    });
  }
  expect(foundToolCall).toBe(true);

  const toolLogs = rig
    .readToolLogs()
    .filter((toolLog) => toolLog.toolRequest.name === 'run_shell_command');
  const failureLog = toolLogs.find((toolLog) =>
    mentionsDisallowedTool(toolLog.toolRequest.args),
  );

  if (!failureLog || failureLog.toolRequest.success) {
    printDebugInfo(rig, result, {
      ToolLogs: toolLogs,
      DisallowedTool: disallowed.tool,
    });
  }

  expect(
    failureLog,
    'Expected failing run_shell_command invocation',
  ).toBeTruthy();
  expect(failureLog!.toolRequest.success).toBe(false);
});
407+
283408 it ( 'should allow all with "ShellTool" and other specific tools' , async ( ) => {
284409 const rig = new TestRig ( ) ;
285410 await rig . setup (
@@ -386,4 +511,53 @@ describe('run_shell_command', () => {
386511 validateModelOutput ( result , fileName , 'Platform-specific listing test' ) ;
387512 expect ( result ) . toContain ( fileName ) ;
388513 } ) ;
514+
// Asks the model to run a deliberately malformed command and checks that it
// reports FAIL and that the tool logs contain a matching run_shell_command
// request whose `success` flag is false.
it('rejects invalid shell expressions', async () => {
  const rig = new TestRig();
  await rig.setup('rejects invalid shell expressions');
  const invalidCommand = getInvalidCommand();
  const result = await rig.run(
    `I am testing the error handling of the run_shell_command tool. Please attempt to run the following command, which I know has invalid syntax: \`${invalidCommand}\`. If the command fails as expected, please return the word FAIL, otherwise return the word SUCCESS.`,
  );

  // Print diagnostics before the assertion, as the allow-list test does, so
  // a wrong model reply is not reported without context.
  if (!result.includes('FAIL')) {
    printDebugInfo(rig, result, {
      Result: result,
      InvalidCommand: invalidCommand,
    });
  }
  expect(result).toContain('FAIL');

  // JSON.stringify escapes the quotes in the command; slicing off the outer
  // quotes leaves the escaped form to search for inside the args string.
  const escapedInvalidCommand = JSON.stringify(invalidCommand).slice(1, -1);
  const escapedLower = escapedInvalidCommand.toLowerCase();
  const mentionsInvalidCommand = (args: string): boolean =>
    args.toLowerCase().includes(escapedLower);

  const foundToolCall = await rig.waitForToolCall(
    'run_shell_command',
    15000,
    mentionsInvalidCommand,
  );

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      EscapedCommand: escapedInvalidCommand,
      ToolLogs: rig.readToolLogs(),
    });
  }
  expect(foundToolCall).toBe(true);

  const toolLogs = rig
    .readToolLogs()
    .filter((toolLog) => toolLog.toolRequest.name === 'run_shell_command');
  const failureLog = toolLogs.find((toolLog) =>
    mentionsInvalidCommand(toolLog.toolRequest.args),
  );

  if (!failureLog || failureLog.toolRequest.success) {
    printDebugInfo(rig, result, {
      ToolLogs: toolLogs,
      EscapedCommand: escapedInvalidCommand,
    });
  }

  expect(
    failureLog,
    'Expected failing run_shell_command invocation for invalid syntax',
  ).toBeTruthy();
  expect(failureLog!.toolRequest.success).toBe(false);
});
389563} ) ;
0 commit comments