-
Notifications
You must be signed in to change notification settings - Fork 19
Reimagined: Correctly handle lengths of serialized strings #43
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 17 commits
622e13c
8d40ea3
6c08440
075f58f
331f65d
53b2087
a960aaa
6c3f03c
77695d9
17a5a0a
9a52030
1652adf
dff5ac8
03c2e62
3e90a3b
eeeaa7a
cbb2f61
c8f0432
a0f60b5
1e23e66
399f397
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,9 @@ test: | |
| go test -v ./... | ||
| go test -bench . | ||
|
|
||
| bench: | ||
| go test -bench . | ||
|
|
||
| clean: | ||
| rm -rf ${BUILDDIR} | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| module github.com/Automattic/go-search-replace | ||
|
|
||
| go 1.16 | ||
| go 1.23.2 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,12 @@ type Replacement struct { | |
| To []byte | ||
| } | ||
|
|
||
// SerializedReplaceResult is the outcome of processing one serialized string
// (or the serialized-string-free tail) of a line.
type SerializedReplaceResult struct {
	// Pre is the text that preceded the serialized string, with the
	// replacements already applied.
	Pre []byte

	// SerializedPortion is the rebuilt `s:<len>:\"...\";` segment, or an empty
	// slice when no serialized string was found.
	SerializedPortion []byte

	// Post is the untouched remainder of the input that follows the
	// serialized string and still has to be processed.
	Post []byte
}
|
|
||
| func main() { | ||
| versionFlag := flag.Bool("version", false, "Show version information") | ||
| flag.Parse() | ||
|
|
@@ -113,7 +119,7 @@ func main() { | |
|
|
||
| go func(line *[]byte) { | ||
| defer wg.Done() | ||
| line = replaceAndFix(line, replacements) | ||
| line = fixLine(line, replacements) | ||
| ch <- *line | ||
| }(&line) | ||
| } | ||
|
|
@@ -129,6 +135,223 @@ func main() { | |
| } | ||
| } | ||
|
|
||
// debugMode switches the diagnostic output of Debugf on; it is off by default.
var debugMode bool

// Debugf prints a formatted diagnostic message to standard output, but only
// while debugMode is enabled. It accepts the same arguments as fmt.Printf.
func Debugf(format string, args ...interface{}) {
	if !debugMode {
		return
	}
	fmt.Printf(format, args...)
}
|
|
||
| func fixLine(line *[]byte, replacements []*Replacement) *[]byte { | ||
|
|
||
| Debugf("Doing global replacements: %s\n", string(*line)) | ||
|
|
||
| linePart := *line | ||
|
|
||
| var rebuiltLine []byte | ||
|
|
||
| for len(linePart) > 0 { | ||
| result, err := fixLineWithSerializedData(linePart, replacements) | ||
| if err != nil { | ||
| Debugf("Error when trying to fix line : %s\n", err.Error()) | ||
| rebuiltLine = append(rebuiltLine, linePart...) | ||
| break | ||
| } | ||
| rebuiltLine = append(rebuiltLine, result.Pre...) | ||
| rebuiltLine = append(rebuiltLine, result.SerializedPortion...) | ||
| linePart = result.Post | ||
| } | ||
|
|
||
| *line = rebuiltLine | ||
|
|
||
| Debugf("All done: %s\n", string(*line)) | ||
|
|
||
| return line | ||
| } | ||
|
|
||
| func replaceByPart(part []byte, replacements []*Replacement) []byte { | ||
| for _, replacement := range replacements { | ||
| part = bytes.ReplaceAll(part, replacement.From, replacement.To) | ||
| } | ||
| return part | ||
| } | ||
|
|
||
| var serializedStringPrefixRegexp = regexp.MustCompile(`s:(\d+):\\"`) | ||
|
|
||
| func fixLineWithSerializedData(linePart []byte, replacements []*Replacement) (*SerializedReplaceResult, error) { | ||
|
|
||
| // find starting point in the line | ||
| //TODO: We should first check if we found the string when inside a quote or not. | ||
| // but currently skipping that scenario because it seems unlikely to find it outside. | ||
|
||
| match := serializedStringPrefixRegexp.FindSubmatchIndex(linePart) | ||
| if match == nil { | ||
| return &SerializedReplaceResult{ | ||
| Pre: replaceByPart(linePart, replacements), | ||
| SerializedPortion: []byte{}, | ||
| Post: []byte{}, | ||
| }, nil | ||
| } | ||
|
|
||
| pre := append([]byte{}, linePart[:match[0]]...) | ||
|
|
||
| pre = replaceByPart(pre, replacements) | ||
|
|
||
| if pre == nil { | ||
| pre = []byte{} | ||
| } | ||
|
|
||
| originalBytes := linePart[match[2]:match[3]] | ||
|
|
||
| originalByteSize, _ := strconv.Atoi(string(originalBytes)) | ||
|
|
||
| // the following assumes escaped double quotes | ||
| //TODO: MySQL can optionally not escape the double quote, | ||
| // but generally sqldumps always include the quotes. | ||
| contentStartIndex := match[3] + 3 | ||
|
|
||
| currentContentIndex := contentStartIndex | ||
|
|
||
| contentByteCount := 0 | ||
|
|
||
| contentEndIndex := 0 | ||
|
|
||
| var nextSliceIndex int | ||
|
|
||
| backslash := byte('\\') | ||
| semicolon := byte(';') | ||
| quote := byte('"') | ||
| nextSliceFound := false | ||
|
|
||
| maxIndex := len(linePart) - 1 | ||
|
|
||
| // let's find where the content actually ends. | ||
| // it should end when the unescaped value is `";` | ||
| for currentContentIndex < len(linePart) { | ||
| if currentContentIndex+2 > maxIndex { | ||
|
|
||
| // this algorithm SHOULD work, but in cases where the original byte count does not match | ||
| // the actual byte count, it'll error out. We'll add this safeguard here. | ||
| return nil, fmt.Errorf("faulty data, byte count does not match data size") | ||
| } | ||
| char := linePart[currentContentIndex] | ||
| secondChar := linePart[currentContentIndex+1] | ||
| thirdChar := linePart[currentContentIndex+2] | ||
| if char == backslash && contentByteCount < originalByteSize { | ||
| unescapedBytePair := getUnescapedBytesIfEscaped(linePart[currentContentIndex : currentContentIndex+2]) | ||
| // if we get the byte pair without the backslash, it corresponds to a byte | ||
| contentByteCount += len(unescapedBytePair) | ||
|
|
||
| // content index count remains the same. | ||
| currentContentIndex += 2 | ||
| continue | ||
| } | ||
|
|
||
| if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= originalByteSize { | ||
|
|
||
| // we're at backslash | ||
|
|
||
| // index of the beginning of the next slice | ||
| nextSliceIndex = currentContentIndex + 3 | ||
| // we're at backslash, so we need to minus 1 to get the index where the content finishes | ||
| contentEndIndex = currentContentIndex - 1 | ||
| nextSliceFound = true | ||
| break | ||
| } | ||
|
|
||
| if contentByteCount > originalByteSize { | ||
| return nil, fmt.Errorf("faulty data, byte count does not match data size") | ||
abdullah-kasim marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| contentByteCount++ | ||
| currentContentIndex++ | ||
| } | ||
|
|
||
| content := append([]byte{}, linePart[contentStartIndex:contentEndIndex+1]...) | ||
|
|
||
| content = replaceByPart(content, replacements) | ||
|
|
||
| contentLength := len(unescapeContent(content)) | ||
|
|
||
| // and we rebuild the string | ||
| rebuiltSerializedString := "s:" + strconv.Itoa(contentLength) + ":\\\"" + string(content) + "\\\";" | ||
|
|
||
| if nextSliceFound == false { | ||
| return nil, fmt.Errorf("end of serialized string not found") | ||
| } | ||
|
|
||
| result := SerializedReplaceResult{ | ||
| Pre: pre, | ||
| SerializedPortion: []byte(rebuiltSerializedString), | ||
| Post: linePart[nextSliceIndex:], | ||
| } | ||
|
|
||
| return &result, nil | ||
| } | ||
|
|
||
// getUnescapedBytesIfEscaped decodes one MySQL backslash escape sequence.
//
// charPair is normally two bytes long. If it is a backslash followed by a
// recognised escape character, the single byte the sequence stands for is
// returned. In every other case (first byte is not a backslash, the escape
// character is not recognised, or fewer than two bytes were supplied)
// charPair is returned unchanged.
//
// Callers use the length of the result to tell how many real bytes the pair
// represents.
func getUnescapedBytesIfEscaped(charPair []byte) []byte {
	// Nothing to decode without both a backslash and a following character.
	if len(charPair) < 2 || charPair[0] != '\\' {
		return charPair
	}

	switch charPair[1] {
	case '\\', '\'', '"':
		return []byte{charPair[1]}
	case 'n':
		return []byte{'\n'}
	case 'r':
		return []byte{'\r'}
	case 't':
		return []byte{'\t'}
	case 'b':
		return []byte{'\b'}
	case 'f':
		return []byte{'\f'}
	case '0':
		// NUL is a valid decoded byte; it must not be mistaken for
		// "no escape found".
		return []byte{0x00}
	case 'Z':
		// Control+Z (ASCII 26), which MySQL escapes as \Z.
		return []byte{0x1a}
	}

	// Not a valid escape: leave the pair as it is.
	return charPair
}
|
|
||
| func unescapeContent(escaped []byte) []byte { | ||
| unescapedBytes := make([]byte, 0, len(escaped)) | ||
| index := 0 | ||
|
|
||
| // only applies to content of a string - do not apply to raw mysql query | ||
| // tested with php -i, mysql client, and mysqldump and mydumper. | ||
| // 1. mysql translates certain bytes to `\<char>` i.e. `\n`. So these needs unescaping to get the correct byte length. See `getUnescapedBytesIfEscaped` | ||
| // 2. PHP serialize does not convert raw bytes into `\<char>` - they're as-is, so we don't need to take into account of escaped value in byte length calculation. | ||
|
|
||
| backslash := byte('\\') | ||
|
|
||
| for index < len(escaped) { | ||
|
|
||
| if escaped[index] == backslash { | ||
| unescapedBytePair := getUnescapedBytesIfEscaped(escaped[index : index+2]) | ||
| byteLength := len(unescapedBytePair) | ||
|
|
||
| if byteLength == 1 { | ||
| unescapedBytes = append(unescapedBytes, unescapedBytePair...) | ||
| index = index + 2 | ||
| continue | ||
| } | ||
| } | ||
|
|
||
| unescapedBytes = append(unescapedBytes, escaped[index]) | ||
| index++ | ||
| } | ||
|
|
||
| return unescapedBytes | ||
| } | ||
|
|
||
| func replaceAndFix(line *[]byte, replacements []*Replacement) *[]byte { | ||
| for _, replacement := range replacements { | ||
| if !bytes.Contains(*line, replacement.From) { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.