|
| 1 | +""" |
| 2 | +
|
| 3 | +A Bit-Shooter Game played on a 1-dimensional binary screen. |
| 4 | +
|
| 5 | +Give an LLM Agent access to a 1-dimensional "screen" represented |
| 6 | +as a string of bits (0s and 1s), e.g. "101010", |
| 7 | +and equip it with a "Click tool" (like a mouse click) that allows it to |
| 8 | +click on a bit -- clicking the bit causes it to flip. |
| 9 | +
|
| 10 | +The Agent plays a "Bit Shooter" game where the goal is to get rid of all |
| 11 | +1s in the "screen". |
| 12 | +
|
| 13 | +To use the Click tool, the Agent must specify the position (zero-based) |
| 14 | +where it wants to click. This causes the bit to flip. |
| 15 | +The LLM is then presented with the new state of the screen, |
| 16 | +and the process repeats until all 1s are gone. |
| 17 | +
|
| 18 | +Clearly the Agent (LLM) needs to be able to accurately count the bit positions, |
| 19 | +to be able to correctly click on the 1s. |
| 20 | +
|
| 21 | +Run like this (--model is optional, defaults to GPT4o): |
| 22 | +
|
| 23 | +python3 examples/basic/1d-screen-click.py --model litellm/anthropic/claude-3-5-sonnet-20241022 |
| 24 | +
|
| 25 | +At the beginning you get to specify the initial state of the screen: |
| 26 | +- size of the screen (how many bits) |
| 27 | +- the (0-based) locations of the 1s (SPACE-separated) in the screen. |
| 28 | +
|
| 29 | +E.g. try this: |
| 30 | +- size = 50, |
| 31 | +- 1-indices: 0 20 30 40 |
| 32 | +
|
| 33 | +The loop is set to run in interactive mode (to prevent runaway loops), |
| 34 | +so you have to keep hitting enter to see the LLM's next move. |
| 35 | +
|
| 36 | +The main observation is that when you run it with claude-3.5-sonnet, |
| 37 | +the accuracy of the Agent's clicks is far superior to other LLMs like GPT-4o |
| 38 | +and even GPT-4. |
| 39 | +
|
| 40 | +To try with other LLMs, you can set the --model param to, for example: |
| 41 | +- gpt-4 (set OPENAI_API_KEY in your env or .env file) |
| 42 | +- gpt-4o (ditto, set OPENAI_API_KEY) |
| 43 | +- groq/llama-3.1-70b-versatile (set GROQ_API_KEY in your env or .env file) |
| 44 | +- cerebras/llama3.1-70b (set CEREBRAS_API_KEY in your env or .env file) |
| 45 | +- ollama/qwen2.5-coder:latest |
| 46 | +
|
| 47 | +See here for a full guide on local/open LLM setup with Langroid: |
| 48 | +https://langroid.github.io/langroid/tutorials/local-llm-setup/ |
| 49 | +And here for how to use with other non-OpenAI LLMs: |
| 50 | +https://langroid.github.io/langroid/tutorials/non-openai-llms/ |
| 51 | +""" |
| 52 | + |
| 53 | +from typing import List, Tuple |
| 54 | + |
| 55 | +from mypy.dmypy.client import show_stats |
| 56 | + |
| 57 | +import langroid as lr |
| 58 | +import langroid.language_models as lm |
| 59 | +from langroid.agent.tools.orchestration import AgentDoneTool |
| 60 | +from langroid.pydantic_v1 import BaseModel |
| 61 | +from langroid.utils.globals import GlobalState |
| 62 | +from rich.prompt import Prompt |
| 63 | +import fire |
| 64 | + |
| 65 | + |
class ScreenState(BaseModel):
    """
    State of the 1-dimensional binary screen.

    The screen is held as a string of "0"/"1" characters, e.g. "101010".
    """

    screen: str | None = None  # binary string, e.g. "101010"

    def __init__(
        self,
        one_indices: List[int] | None = None,
        size: int = 1,
    ):
        """
        Build a screen of `size` bits with 1s at the given positions.

        Args:
            one_indices: zero-based positions to set to 1. Positions outside
                `[0, size)` are silently skipped. When omitted, `[1]` is used
                (same as the historical default), which with the default
                `size=1` is out of range and so yields the screen "0".
            size: number of bits in the screen.
        """
        super().__init__()
        # None sentinel instead of a shared mutable list default.
        if one_indices is None:
            one_indices = [1]

        # Start from all zeros, then switch on the requested in-range bits.
        bits = ["0"] * size
        for idx in one_indices:
            if 0 <= idx < size:
                bits[idx] = "1"

        self.screen = "".join(bits)

    @classmethod
    def set_state(
        cls,
        one_indices: List[int],
        size: int,
    ) -> "ScreenState":
        """
        Factory method: create the initial state and publish it globally.

        The new instance is stored as the `state` value of `GlobalScreenState`
        and also returned, as the return annotation promises.

        Args:
            one_indices: zero-based positions of the 1 bits.
            size: number of bits in the screen.

        Returns:
            The newly created ScreenState.
        """
        initial_state = cls(
            one_indices=one_indices,
            size=size,
        )
        GlobalScreenState.set_values(state=initial_state)
        return initial_state

    def flip(self, i: int) -> None:
        """
        Flip the i-th bit (zero-based) in place.

        Does nothing when the screen is unset or `i` is out of range.
        """
        if self.screen is None or i < 0 or i >= len(self.screen):
            return

        flipped = "1" if self.screen[i] == "0" else "0"
        self.screen = self.screen[:i] + flipped + self.screen[i + 1 :]
| 116 | + |
class GlobalScreenState(GlobalState):
    """
    Global holder for the current screen.

    `ScreenState.set_state` writes the `state` value via `set_values`, and
    `get_state` reads it back via `get_value`.
    """

    # Placeholder until set_state() is called: a default-constructed
    # ScreenState (one bit, with the default index 1 out of range => "0").
    state: ScreenState = ScreenState()
| 119 | + |
| 120 | + |
def get_state() -> ScreenState:
    """Return the ScreenState stored under "state" in GlobalScreenState."""
    current: ScreenState = GlobalScreenState.get_value("state")
    return current
| 123 | + |
| 124 | + |
class ClickTool(lr.ToolMessage):
    """
    Tool the LLM uses to click on (i.e. flip) one bit of the screen.
    """

    request: str = "click_tool"
    purpose: str = """
        To click at <position> on the 1-dimensional binary screen,
        which causes the bit at that position to FLIP.
        IMPORTANT: the position numbering starts from 0!!!
    """

    # zero-based index of the bit to click
    position: int

    @classmethod
    def examples(cls) -> List[lr.ToolMessage | Tuple[str, lr.ToolMessage]]:
        """
        Sample usages of the tool: one bare instance, and one paired with
        an accompanying natural-language statement.
        """
        bare_example = cls(position=3)
        explained_example = (
            "I want to click at position 5",
            cls(position=5),
        )
        return [bare_example, explained_example]

    def handle(self) -> str | AgentDoneTool:
        """
        Apply the click to the global screen state and report the outcome.

        Returns:
            The updated bit-string while at least one 1 remains, otherwise
            an `AgentDoneTool` instance (no 1s are left).
        """
        screen_state = get_state()
        screen_state.flip(self.position)
        print("SCREEN STATE = ", screen_state.screen)
        if "1" in screen_state.screen:
            return screen_state.screen
        return AgentDoneTool()
| 152 | + |
| 153 | + |
def main(model: str = ""):
    """
    Run the Bit-Shooter game with an LLM agent that has `ClickTool` enabled.

    The initial screen must already be stored in the global state (the
    script entry point does this before calling `main`).

    Args:
        model: chat model name; when empty, `lm.OpenAIChatModel.GPT4o` is used.
    """
    llm_config = lm.OpenAIGPTConfig(
        chat_model=model or lm.OpenAIChatModel.GPT4o,
    )
    click_tool_name = ClickTool.default_value("request")
    agent = lr.ChatAgent(
        lr.ChatAgentConfig(
            name="Clicker",
            llm=llm_config,
            use_functions_api=False,
            use_tools=True,
            show_stats=False,
            system_message=f"""
            You are an expert at COMPUTER USE.
            In this task you only have to be able to understand a 1-dimensional
            screen presented to you as a string of bits (0s and 1s).
            You will play a 1-dimensional BIT-shooter game!

            Your task is to CLICK ON THE LEFTMOST 1 in the bit-string,
            to flip it to a 0.

            Always try to click on the LEFTMOST 1 in the bit-sequence.

            To CLICK on the screen you
            must use the TOOL `{click_tool_name}` where the
            `position` field specifies the position (zero-based) to click.
            If you CORRECTLY click on a 1, the bit at that position will be
            turned to 0.
            But if you click on a 0, it will turn into a 1,
            taking you further from your goal.

            So you MUST ACCURATELY specify the position of the LEFTMOST 1 to click,
            making SURE there is a 1 at that position.
            In other words, it is critical that you are able to ACCURATELY COUNT
            the bit positions so that you are able to correctly identify the position
            of the LEFTMOST 1 bit in the "screen" given to you as a string of bits.
            """,
        )
    )

    agent.enable_message(ClickTool)

    # interactive=True: the user confirms each step, preventing runaway loops
    task = lr.Task(agent, interactive=True, only_user_quits_root=False)

    # Kick it off with the initial screen (set by the user before main runs).
    # Pass the bit-string itself rather than the ScreenState object, so the
    # opening message has the same form as what ClickTool.handle returns on
    # every later turn.
    task.run(get_state().screen)
| 200 | + |
| 201 | + |
if __name__ == "__main__":
    # The initial screen is collected interactively before the command line
    # is handed to fire, so main() finds it already stored in global state.
    size = int(Prompt.ask("Size of screen (how many bits)"))
    # split() with no separator tolerates repeated/leading/trailing spaces
    # and yields [] for empty input; split(" ") would produce "" tokens
    # that int() rejects with ValueError.
    raw_indices = Prompt.ask("Indices of 1s (SPACE-separated)").split()
    ones = [int(token) for token in raw_indices]
    ScreenState.set_state(ones, size)
    print("SCREEN STATE = ", get_state().screen)
    fire.Fire(main)
0 commit comments