From 9513a169ebe6a4c5d1ad72623c82ae0535216535 Mon Sep 17 00:00:00 2001 From: Kristian Larsson Date: Tue, 5 Mar 2024 23:52:39 +0100 Subject: [PATCH] Revamp stdin_install & add env actions There are now new methods on env to interact with environment variables: - getenv - setenv - unsetenv - getenvb - setenvb - unsetenvb The first three work with str input and output, presuming data is encoded with UTF-8. getenvb and friends instead use bytes as input and output for raw access to environment variables. Some time ago stdin_install was changed from working with str to working with bytes instead, which is arguably more correct. However, a bug crept in so while the returned data was indeed bytes, the type signature said str. This has been fixed by revamping stdin_install. stdin_install now supports working with strings in a convenient manner just like previously, but now handles chunking of data as well as reporting errors (which were previously unhandled) as well as providing a raw bytes interface. When stdin is attached to a terminal, it uses line buffering. The buffer is however limited in size so if the line is really long, it will be chopped into chunks. If a Unicode character happens to be split so that we only get the first byte(s) of a multi-byte character, then the decode would fail. The StringDecoder actor will attempt to handle this and store the remaining bytes in a buffer to be decoded upon receipt of the next chunk. Just like with a raw interface to stdin, it attempts to be line buffered but there is no guarantee when the line exceeds the buffer size. We attempt to automatically detect the encoding of the terminal by reading the LANG environment variable. If this is not possible, we assume UTF-8 encoding. Since we do not currently support anything besides UTF-8, any other encoding will lead to an exception (which might go unhandled). Acton-by-Example has been updated with some examples around this. 
--- base/builtin/env.c | 10 +-- base/src/__builtin__.act | 88 ++++++++++++++++++- base/src/__builtin__.ext.c | 51 +++++++++++ docs/acton-by-example/src/SUMMARY.md | 3 + docs/acton-by-example/src/environment.md | 4 + .../acton-by-example/src/environment/stdin.md | 39 ++++++++ .../src/environment/variables.md | 23 +++++ 7 files changed, 211 insertions(+), 7 deletions(-) create mode 100644 docs/acton-by-example/src/environment.md create mode 100644 docs/acton-by-example/src/environment/stdin.md create mode 100644 docs/acton-by-example/src/environment/variables.md diff --git a/base/builtin/env.c b/base/builtin/env.c index 770aea3fb..bda3e3858 100644 --- a/base/builtin/env.c +++ b/base/builtin/env.c @@ -37,9 +37,8 @@ extern int return_val; printf("%s", s->str); return $R_CONT(c$cont, B_None); } -void read_stdin(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { - log_info("read_stdin: %p", stream->data); +void read_stdin(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { if (nread < 0){ if (nread == UV_EOF) { uv_close((uv_handle_t *)stream, NULL); @@ -50,11 +49,9 @@ void read_stdin(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { cb->$class->__asyn__(cb, to$bytesD_len(buf->base, nread)); } } - - if (buf->base) - acton_free(buf->base); } -$R B_EnvD_stdin_installG_local (B_Env self, $Cont c$cont, $action cb) { + +$R B_EnvD__on_stdin_bytesG_local (B_Env self, $Cont c$cont, $action cb) { // This should be the only call in env that does IO stuff, so it is safe to // pin affinity here (and not earlier).. 
pin_actor_affinity(); @@ -64,6 +61,7 @@ void read_stdin(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { uv_read_start((uv_stream_t*)tty, alloc_buffer, read_stdin); return $R_CONT(c$cont, B_None); } + $R B_EnvD_exitG_local (B_Env self, $Cont c$cont, B_int n) { return_val = from$int(n); rts_shutdown(); diff --git a/base/src/__builtin__.act b/base/src/__builtin__.act index 0a0d37fa2..b59e1185d 100644 --- a/base/src/__builtin__.act +++ b/base/src/__builtin__.act @@ -927,6 +927,38 @@ def type(a: value) -> str: """ NotImplemented +actor StringDecoder(cb_out: action(str) -> None, encoding: ?str="utf-8", on_error: ?action(str, bytes) -> None): + """Bytes to string decoder + + Decodes bytes to string using the provided encoding. If no encoding is given + UTF-8 is used. The decoder is stateful in order to buffer incomplete multi- + byte characters. + """ + MAX_UNICODE_CHAR_SIZE = 4 + var buf: bytes = b"" + + if encoding is not None: + if encoding.lower() != "utf-8": + raise ValueError("Only utf-8 encoding is supported") + + def decode(input: bytes) -> None: + buf += input + # Attempt to decode all of buf (i = 0). If it fails we are likely in the middle + # of a multi-byte character so we retry while holding back the last i bytes + # for the next chunk. UTF-8 has up to 4 bytes per character. 
+ for i in range(0, MAX_UNICODE_CHAR_SIZE+1): + try: + s = buf[:len(buf)-i].decode() + buf = buf[len(buf)-i:] + cb_out(s) + return + except ValueError: + pass + if on_error is not None: + on_error("Invalid UTF-8", buf) + else: + raise ValueError("Invalid UTF-8: %s" % str(buf)) + ## Environment ################################################ class WorldCap(): @@ -968,10 +1000,64 @@ actor Env (wc: WorldCap, sc: SysCap, args: list[str]): argv = args nr_wthreads: int = 0 + action def getenv(name: str) -> ?str: + """Get the value of an environment variable""" + res = getenvb(name.encode()) + if res is not None: + return res.decode() + return None + + action def getenvb(name: bytes) -> ?bytes: + """Get the value of an environment variable""" + NotImplemented + + action def setenv(n: str, v: str) -> None: + """Set the value of an environment variable""" + setenvb(n.encode(), v.encode()) + + action def setenvb(n: bytes, v: bytes) -> None: + """Set the value of an environment variable""" + NotImplemented + + action def unsetenv(n: str) -> None: + """Unset an environment variable""" + unsetenvb(n.encode()) + + action def unsetenvb(n: bytes) -> None: + """Unset an environment variable""" + NotImplemented + action def stdout_write(s: str) -> None: NotImplemented - action def stdin_install(cb: action(str) -> None) -> None: + action def stdin_install(on_stdin: ?action(str) -> None, encoding: ?str=None, on_error: ?action(str, bytes) -> None, on_stdin_bytes: ?action(bytes) -> None) -> None: + if on_stdin is None and on_stdin_bytes is None: + raise ValueError("At least one of on_stdin or on_stdin_bytes must be set") + elif on_stdin_bytes is not None: + if encoding is not None: + raise ValueError("encoding must not be set when on_stdin_bytes is set, it is only used for decoding stdin bytes to string") + if on_error is not None: + raise ValueError("on_error must not be set when on_stdin_bytes is set, it is only used for decoding error when decoding stdin bytes to string") 
+ _on_stdin_bytes(on_stdin_bytes) + elif on_stdin is not None: + if encoding is None: + # If no encoding is given, attempt to discover the encoding used + # Default to utf-8 if we're unable to discover the encoding + encoding = "utf-8" + # Read encoding from the LANG environment variable + lang_env = getenv("LANG") + if lang_env is not None: + try: + encoding = lang_env.split(".")[1].lower() + except: + pass + # If stdin is attached to a terminal, attempt to discover the + # encoding used by the terminal by inspecting the LANG environment + # variable. + sd = StringDecoder(on_stdin, encoding, on_error) + _on_stdin_bytes(sd.decode) + + action def _on_stdin_bytes(cb: action(bytes) -> None) -> None: NotImplemented action def exit(n: int): diff --git a/base/src/__builtin__.ext.c b/base/src/__builtin__.ext.c index 6c266d52b..1d8aed437 100644 --- a/base/src/__builtin__.ext.c +++ b/base/src/__builtin__.ext.c @@ -25,3 +25,54 @@ B_str B_BaseExceptionD__name (B_BaseException self) { B_str B_type(B_value a) { return to$str(unmangle_name(a->$class->$GCINFO)); } + +$R B_EnvD_getenvbG_local (B_Env self, $Cont C_cont, B_bytes name) { + // uv_os_getenv is not threadsafe but our Env actor forces serial execution + + // Try to use a small fixed size buffer + size_t len = 256; + char smallval[256]; + char *value = smallval; + + const char* env_var = fromB_bytes(name); + + // First attempt with the stack buffer; on UV_ENOBUFS len holds the required size + int r = uv_os_getenv(env_var, value, &len); + if (r == UV_ENOENT) { + // The environment variable does not exist + return $R_CONT(C_cont, B_None); + } else if (r == UV_ENOBUFS) { + // Value did not fit: allocate a buffer of the reported size and read it again + value = (char*)acton_malloc(len); + r = uv_os_getenv(env_var, value, &len); + } + if (r < 0) { + char *s; + asprintf(&s, "Failed to read the environment variable %s: %s", env_var, uv_strerror(r)); + $RAISE((B_BaseException)B_RuntimeErrorG_new(to$str(s))); + } + return $R_CONT(C_cont, 
to$bytes(value)); +} + +$R B_EnvD_setenvbG_local (B_Env self, $Cont C_cont, B_bytes name, B_bytes value) { + const char* env_var = fromB_bytes(name); + const char* env_val = fromB_bytes(value); + int r = uv_os_setenv(env_var, env_val); + if (r < 0) { + char *s; + asprintf(&s, "Failed to set the environment variable %s: %s", env_var, uv_strerror(r)); + $RAISE((B_BaseException)B_RuntimeErrorG_new(to$str(s))); + } + return $R_CONT(C_cont, B_None); +} + +$R B_EnvD_unsetenvbG_local (B_Env self, $Cont C_cont, B_bytes name) { + const char* env_var = fromB_bytes(name); + int r = uv_os_unsetenv(env_var); + if (r < 0) { + char *s; + asprintf(&s, "Failed to unset the environment variable %s: %s", env_var, uv_strerror(r)); + $RAISE((B_BaseException)B_RuntimeErrorG_new(to$str(s))); + } + return $R_CONT(C_cont, B_None); +} diff --git a/docs/acton-by-example/src/SUMMARY.md b/docs/acton-by-example/src/SUMMARY.md index 79c11555b..ed12b061e 100644 --- a/docs/acton-by-example/src/SUMMARY.md +++ b/docs/acton-by-example/src/SUMMARY.md @@ -34,6 +34,9 @@ - [Explicit types](types/explicit.md) - [Security](security.md) - [Capabilities](security/capabilities.md) +- [Environment](environment.md) + - [Environment variables](environment/variables.md) + - [Reading stdin input](environment/stdin.md) - [Standard library](stdlib.md) - [Regular Expression](stdlib/re.md) diff --git a/docs/acton-by-example/src/environment.md b/docs/acton-by-example/src/environment.md new file mode 100644 index 000000000..6edb7e12e --- /dev/null +++ b/docs/acton-by-example/src/environment.md @@ -0,0 +1,4 @@ +# Environment + +The environment of an Acton application is the outside world. Any useful application typically needs to interact with the environment in some way, like reading arguments or taking input from stdin and printing output. 
+ diff --git a/docs/acton-by-example/src/environment/stdin.md b/docs/acton-by-example/src/environment/stdin.md new file mode 100644 index 000000000..b5634b5c8 --- /dev/null +++ b/docs/acton-by-example/src/environment/stdin.md @@ -0,0 +1,39 @@ +# Reading stdin input + +Read input from stdin by installing a handler for stdin data. The data passed to the handler is `str`. +```python +actor main(env): + def interact(input): + print("Got some input:", input) + + env.stdin_install(interact) +``` + + +It is possible to specify the encoding and an on_error() callback which is invoked if there is a problem decoding the data. When encoding is not specified (default `None`), an attempt is made to discover the encoding by reading the `LANG` environment variable. If no encoding is discovered, the default is to use `utf-8`. + +```python +actor main(env): + def interact(input): + print("Got some input:", input) + + def on_stdin_error(err, data): + print("Some error with decoding the input data:", err) + print("Raw bytes data:", data) + + env.stdin_install(on_stdin=interact, encoding="utf-8", on_error=on_stdin_error) +``` + +You can read the raw data in `bytes` form by installing a bytes handler instead: + +```python +actor main(env): + def interact(bytes_input): + # Note how the input might contain parts (some bytes) of a multi-byte + # Unicode character in which case decoding will fail + print("Got some input:", bytes_input.decode()) + + env.stdin_install(on_stdin_bytes=interact) +``` + +This allows reading binary data and gives more explicit control over how to decode the data. diff --git a/docs/acton-by-example/src/environment/variables.md b/docs/acton-by-example/src/environment/variables.md new file mode 100644 index 000000000..798e350e1 --- /dev/null +++ b/docs/acton-by-example/src/environment/variables.md @@ -0,0 +1,23 @@ +# Environment variables + +It is possible to read, set and unset environment variables. 
The standard functions `env.getenv`, `env.setenv` and `env.unsetenv` all assume `str` input and output, which is a convenience based on the assumption that all data is encoded using UTF-8. POSIX systems actually treat both environment variable names and values as raw bytes. To access the environment as bytes and handle decoding explicitly, use `env.getenvb`, `env.setenvb` and `env.unsetenvb`. + +Source: +```python +actor main(env): + env_user = env.getenv("USER") + if env_user is not None: + print("User:", env_user) + env.setenv("FOO", "bar") + env.unsetenv("LANG") + foo_env = env.getenv("FOO") + if foo_env is not None: + print("FOO:", foo_env) + env.exit(0) +``` + +Output: +```sh +User: myuser +FOO: bar +```