Skip to content

Commit

Permalink
Add Unicode chars mode: "goawk -c" or Config.Chars=true (#243)
Browse files Browse the repository at this point in the history
* Introduce Config.Chars and "goawk -c"; use first for printf %c

* Use chars mode in index(), length(), match(), and substr()

This is based on the work done in
#83
but the default is the other way around (default = bytes mode).
  • Loading branch information
benhoyt authored Sep 18, 2024
1 parent e957268 commit 0df77ff
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 36 deletions.
5 changes: 5 additions & 0 deletions goawk.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ const (
-v var=value variable assignment (multiple allowed)
Additional GoAWK features:
-c use Unicode chars for index, length, match, substr, and %c
-E progfile load program, treat as last option, disable var=value args
-H parse header row and enable @"field" in CSV input mode
-h, --help show this help message
Expand Down Expand Up @@ -97,6 +98,7 @@ func main() {
coverMode := cover.ModeUnspecified
coverProfile := ""
coverAppend := false
useChars := false

var i int
argsLoop:
Expand Down Expand Up @@ -161,6 +163,8 @@ argsLoop:
cpuProfile = os.Args[i]
case "-csv", "--csv":
inputMode = "csv"
case "-c":
useChars = true
case "-d":
debug = true
case "-da":
Expand Down Expand Up @@ -332,6 +336,7 @@ argsLoop:
config := &interp.Config{
Argv0: filepath.Base(os.Args[0]),
Args: expandWildcardsOnWindows(args),
Chars: useChars,
NoArgVars: noArgVars,
Output: stdout,
Vars: []string{
Expand Down
6 changes: 6 additions & 0 deletions goawk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,12 @@ func TestGoAWKSpecificOptions(t *testing.T) {
{[]string{"-oxyz", `{}`}, "", "", "invalid output mode \"xyz\"\n"},
{[]string{"-H", `{}`}, "", "", "-H only allowed together with -i\n"},

// Chars mode (vs bytes)
{[]string{"-c", `BEGIN { printf "%c", 4660 }`}, "", "\u1234", ""},
{[]string{`BEGIN { printf "%c", 4660 }`}, "", "4", ""},
{[]string{"-c", `{ print length }`}, "絵\n", "1\n", ""},
{[]string{`{ print length }`}, "絵\n", "3\n", ""},

// Debug options
{[]string{"-dt", `
BEGIN { x=42; a[1]=x; print f(a, 1) }
Expand Down
66 changes: 60 additions & 6 deletions interp/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -426,20 +426,74 @@ func (p *interp) sprintf(format string, args []value) (string, error) {
n, isStr := a.isTrueStr()
if isStr {
s := p.toString(a)
_, size := utf8.DecodeRuneInString(s)
if size > 0 {
if len(s) == 0 {
c = []byte{0}
} else if p.chars {
_, size := utf8.DecodeRuneInString(s)
c = []byte(s[:size])
} else {
c = []byte{0}
c = []byte{s[0]}
}
} else {
c = make([]byte, utf8.UTFMax)
size := utf8.EncodeRune(c, rune(n))
c = c[:size]
if p.chars {
buf := make([]byte, utf8.UTFMax)
size := utf8.EncodeRune(buf, rune(n))
c = buf[:size]
} else {
c = []byte{byte(n)}
}
}
v = c
}
converted = append(converted, v)
}
return fmt.Sprintf(format, converted...), nil
}

// substrChars returns the tail of s beginning at the 1-based character
// position pos, where a "character" is one step of Go's range-over-string
// iteration (a UTF-8 encoded rune, or a single byte of invalid UTF-8).
//
// A pos of 1 or less yields the whole string; a pos beyond the last
// character yields the empty string.
func substrChars(s string, pos int) string {
	// seen is the 1-based position of the character starting at offset.
	seen := 0
	for offset := range s {
		seen++
		if seen >= pos {
			return s[offset:]
		}
	}
	// Either s is empty or pos lies past the final character.
	return ""
}

// substrLengthChars returns at most length characters of s, starting at the
// 1-based character position pos. A "character" is one step of Go's
// range-over-string iteration (a UTF-8 encoded rune, or a single byte of
// invalid UTF-8).
//
// A pos of 1 or less starts at the beginning of s without shortening length;
// a pos beyond the last character, or a length of zero or less, yields the
// empty string. A length larger than what remains is clamped to the end of s.
func substrLengthChars(s string, pos, length int) string {
	// Find the byte offset of the pos-th character; default to the end of
	// the string for the case where pos is past the final character.
	begin := len(s)
	seen := 0
	for offset := range s {
		seen++
		if seen >= pos {
			begin = offset
			break
		}
	}
	tail := s[begin:]

	// Cut the tail at the start of its (length+1)-th character, if it has
	// one. A non-positive length cuts before the first character.
	taken := 0
	for offset := range tail {
		if taken >= length {
			return tail[:offset]
		}
		taken++
	}
	// Fewer than length+1 characters remain, so the whole tail is the result.
	return tail
}
6 changes: 6 additions & 0 deletions interp/interp.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ type interp struct {
regexCache map[string]*regexp.Regexp
formatCache map[string]cachedFormat
csvJoinFieldsBuf bytes.Buffer
chars bool
}

// Various const configuration. Could make these part of Config if
Expand Down Expand Up @@ -291,6 +292,10 @@ type Config struct {
//
// BEGIN { OUTPUTMODE="csv separator=|" }
CSVOutput CSVOutputConfig

// Set to true to count using Unicode chars instead of bytes for
// index(), length(), match(), substr(), and printf %c.
Chars bool
}

// IOMode specifies the input parsing or print output mode.
Expand Down Expand Up @@ -458,6 +463,7 @@ func (p *interp) setExecuteConfig(config *Config) error {
return err
}
}
p.chars = config.Chars

// After Vars has been handled, validate CSV configuration.
err := validateCSVInputConfig(p.inputMode, p.csvInputConfig)
Expand Down
60 changes: 55 additions & 5 deletions interp/interp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,14 @@ NR==3, NR==5 { print NR }
{`BEGIN { printf "%.1g", 42 } # !windows-gawk`, "", "4e+01", "", ""}, // for some reason gawk gives "4e+001" on Windows
{`BEGIN { printf "%d", 12, 34 }`, "", "12", "", ""},
{`BEGIN { printf "%d" }`, "", "", "format error: got 0 args, expected 1", "not enough arg"},
// Our %c handling is mostly like awk's, except for multiples of
// 256, where awk is weird, and we're like mawk
{`BEGIN { printf "%c", 0 }`, "", "\x00", "", ""},
{`BEGIN { printf "%c", 127 }`, "", "\x7f", "", ""},
{`BEGIN { printf "%c", 128 } # !windows-gawk`, "", "\u0080", "", ""},
{`BEGIN { printf "%c", 255 } # !windows-gawk`, "", "ÿ", "", ""},
{`BEGIN { printf "%c", 256 } # !windows-gawk`, "", "Ā", "", ""},
{`BEGIN { printf "%c", 4660 } # !windows-gawk`, "", "\u1234", "", ""},
{`BEGIN { printf "%c", 128 } # !gawk`, "", "\x80", "", ""},
{`BEGIN { printf "%c", 255 } # !gawk`, "", "\xff", "", ""},
{`BEGIN { printf "%c", 256 } # !gawk`, "", "\x00", "", ""},
{`BEGIN { printf "%c", "xyz" }`, "", "x", "", ""},
{`BEGIN { printf "%c %c %c", "Ā", "ĀĀĀ", "Āx" } # !windows-gawk`, "", "Ā Ā Ā", "", ""},
{`BEGIN { printf "%c", "" } # !awk`, "", "\x00", "", ""},
{`BEGIN { printf } # !awk !posix - doesn't error on this`, "", "", "parse error at 1:16: expected printf args, got none", "printf: no arguments"},
{`BEGIN { printf("%%%dd", 4) }`, "", "%4d", "", ""},
Expand Down Expand Up @@ -1532,6 +1532,56 @@ func TestConfigVarsCorrect(t *testing.T) {
}
}

func TestCharsMode(t *testing.T) {
tests := []struct {
src string
in string
out string
}{
// printf %c
{`BEGIN { printf "%c", 128 }`, "", "\u0080"},
{`BEGIN { printf "%c", 255 }`, "", "ÿ"},
{`BEGIN { printf "%c", 256 }`, "", "Ā"},
{`BEGIN { printf "%c", 4660 }`, "", "\u1234"},
{`BEGIN { printf "%c %c %c", "Ā", "ĀĀĀ", "Āx" }`, "", "Ā Ā Ā"},

// index()
{`BEGIN { print index("föö", "f"), index("föö0", 0), index("föö", "ö"), index("föö", "x") }`, "", "1 4 2 0\n"},

// length()
{`BEGIN { print length("a"), length("絵") }`, "", "1 1\n"},
{`BEGIN { $0="a"; print length(); $0 = "絵"; print length() }`, "", "1\n1\n"},

// match()
{`BEGIN { print match("絵 fööd y", /[föd]+/), RSTART, RLENGTH }`, "", "3 3 4\n"},

// substr()
{`BEGIN { print substr("food", 1), substr("fööd", 1) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", 1, 2), substr("fööd", 1, 2) }`, "", "fo fö\n"},
{`BEGIN { print substr("food", 1, 4), substr("fööd", 1, 4) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", 1, 8), substr("fööd", 1, 8) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", 2), substr("fööd", 2) }`, "", "ood ööd\n"},
{`BEGIN { print substr("food", 2, 2), substr("fööd", 2, 2) }`, "", "oo öö\n"},
{`BEGIN { print substr("food", 2, 3), substr("fööd", 2, 3) }`, "", "ood ööd\n"},
{`BEGIN { print substr("food", 2, 8), substr("fööd", 2, 8) }`, "", "ood ööd\n"},
{`BEGIN { print substr("food", 0, 8), substr("fööd", 0, 8) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", -1, 8), substr("fööd", -1, 8) }`, "", "food fööd\n"},
{`BEGIN { print substr("food", 5, 8), substr("fööd", 5, 8) }`, "", " \n"},
{`BEGIN { print substr("food", 2, -3), substr("fööd", 2, -3) }`, "", " \n"},
}
for _, test := range tests {
testName := test.src
if len(testName) > 70 {
testName = testName[:70]
}
t.Run(testName, func(t *testing.T) {
testGoAWK(t, test.src, test.in, test.out, "", nil, func(config *interp.Config) {
config.Chars = true
})
})
}
}

func TestShellCommand(t *testing.T) {
testGoAWK(t, `BEGIN { system("echo hello world") }`, "", "hello world\n", "", nil, nil)

Expand Down
85 changes: 60 additions & 25 deletions interp/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"strings"
"time"
"unicode/utf8"

"github.com/benhoyt/goawk/internal/compiler"
"github.com/benhoyt/goawk/internal/resolver"
Expand Down Expand Up @@ -977,17 +978,37 @@ func (p *interp) callBuiltin(builtinOp compiler.BuiltinOp) error {
sValue, substr := p.peekPop()
s := p.toString(sValue)
index := strings.Index(s, p.toString(substr))
p.replaceTop(num(float64(index + 1)))
var awkIndex int
if index < 0 {
awkIndex = 0
} else if p.chars {
awkIndex = utf8.RuneCountInString(s[:index]) + 1
} else {
awkIndex = index + 1
}
p.replaceTop(num(float64(awkIndex)))

case compiler.BuiltinInt:
p.replaceTop(num(float64(int64(p.peekTop().num()))))

case compiler.BuiltinLength:
p.push(num(float64(len(p.line))))
var length int
if p.chars {
length = utf8.RuneCountInString(p.line)
} else {
length = len(p.line)
}
p.push(num(float64(length)))

case compiler.BuiltinLengthArg:
s := p.toString(p.peekTop())
p.replaceTop(num(float64(len(s))))
var length int
if p.chars {
length = utf8.RuneCountInString(s)
} else {
length = len(s)
}
p.replaceTop(num(float64(length)))

case compiler.BuiltinLog:
p.replaceTop(num(math.Log(p.peekTop().num())))
Expand All @@ -1003,12 +1024,14 @@ func (p *interp) callBuiltin(builtinOp compiler.BuiltinOp) error {
if loc == nil {
p.matchStart = 0
p.matchLength = -1
p.replaceTop(num(0))
} else if p.chars {
p.matchStart = utf8.RuneCountInString(s[:loc[0]]) + 1
p.matchLength = utf8.RuneCountInString(s[loc[0]:loc[1]])
} else {
p.matchStart = loc[0] + 1
p.matchLength = loc[1] - loc[0]
p.replaceTop(num(float64(p.matchStart)))
}
p.replaceTop(num(float64(p.matchStart)))

case compiler.BuiltinRand:
p.push(num(p.random.Float64()))
Expand Down Expand Up @@ -1043,34 +1066,46 @@ func (p *interp) callBuiltin(builtinOp compiler.BuiltinOp) error {
sValue, posValue := p.peekPop()
pos := int(posValue.num())
s := p.toString(sValue)
if pos > len(s) {
pos = len(s) + 1
}
if pos < 1 {
pos = 1
var substr string
if p.chars {
substr = substrChars(s, pos)
} else {
if pos > len(s) {
pos = len(s) + 1
}
if pos < 1 {
pos = 1
}
length := len(s) - pos + 1
substr = s[pos-1 : pos-1+length]
}
length := len(s) - pos + 1
p.replaceTop(str(s[pos-1 : pos-1+length]))
p.replaceTop(str(substr))

case compiler.BuiltinSubstrLength:
posValue, lengthValue := p.popTwo()
length := int(lengthValue.num())
pos := int(posValue.num())
s := p.toString(p.peekTop())
if pos > len(s) {
pos = len(s) + 1
}
if pos < 1 {
pos = 1
}
maxLength := len(s) - pos + 1
if length < 0 {
length = 0
}
if length > maxLength {
length = maxLength
var substr string
if p.chars {
substr = substrLengthChars(s, pos, length)
} else {
if pos > len(s) {
pos = len(s) + 1
}
if pos < 1 {
pos = 1
}
maxLength := len(s) - pos + 1
if length < 0 {
length = 0
}
if length > maxLength {
length = maxLength
}
substr = s[pos-1 : pos-1+length]
}
p.replaceTop(str(s[pos-1 : pos-1+length]))
p.replaceTop(str(substr))

case compiler.BuiltinSystem:
if p.noExec {
Expand Down

0 comments on commit 0df77ff

Please sign in to comment.