-
Notifications
You must be signed in to change notification settings - Fork 19
Reimagined: Correctly handle lengths of serialized strings #43
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
622e13c
8d40ea3
6c08440
075f58f
331f65d
53b2087
a960aaa
6c3f03c
77695d9
17a5a0a
9a52030
1652adf
dff5ac8
03c2e62
3e90a3b
eeeaa7a
cbb2f61
c8f0432
a0f60b5
1e23e66
399f397
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,9 @@ test: | |
| go test -v ./... | ||
| go test -bench . | ||
|
|
||
| bench: | ||
| go test -bench . | ||
|
|
||
| clean: | ||
| rm -rf ${BUILDDIR} | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| module github.com/Automattic/go-search-replace | ||
|
|
||
| go 1.16 | ||
| go 1.23.2 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,14 @@ type Replacement struct { | |
| To []byte | ||
| } | ||
|
|
||
// EscapedDataDetails describes where one escaped serialized string
// (`s:<len>:\"<content>\";`) sits inside the line fragment that was scanned.
// All indexes are byte offsets into that fragment, not into the whole line.
type EscapedDataDetails struct {
	// ContentStartIndex is the offset of the first content byte, right after
	// the opening `\"`. ContentEndIndex is the offset of the last content
	// byte (inclusive), i.e. the byte before the closing `\";`.
	ContentStartIndex, ContentEndIndex int

	// NextPartIndex is the offset of the first byte after the closing `\";`,
	// where scanning for the next serialized string resumes.
	NextPartIndex int

	// CurrentPartIndex is the offset at which the `s:<len>:` prefix starts;
	// everything before it does not belong to the serialized string.
	CurrentPartIndex int

	// OriginalByteSize is the length declared in the `s:<len>:` prefix.
	OriginalByteSize int
}
|
|
||
| func main() { | ||
| versionFlag := flag.Bool("version", false, "Show version information") | ||
| flag.Parse() | ||
|
|
@@ -113,7 +121,7 @@ func main() { | |
|
|
||
| go func(line *[]byte) { | ||
| defer wg.Done() | ||
| line = replaceAndFix(line, replacements) | ||
| line = fixLine(line, replacements) | ||
| ch <- *line | ||
| }(&line) | ||
| } | ||
|
|
@@ -129,6 +137,256 @@ func main() { | |
| } | ||
| } | ||
|
|
||
// debugMode turns the diagnostic output of Debugf on. It is off by default.
var debugMode bool

// Debugf prints a formatted diagnostic message, but only while debugMode is
// set; otherwise it does nothing. format and args follow fmt.Printf.
//
// NOTE(review): the message is written with fmt.Printf, i.e. to standard
// output - confirm it cannot get mixed into the program's regular output.
func Debugf(format string, args ...interface{}) {
	if !debugMode {
		return
	}
	fmt.Printf(format, args...)
}
|
|
||
| func fixLine(line *[]byte, replacements []*Replacement) *[]byte { | ||
| if bytes.Contains(*line, []byte("s:")) { | ||
| line = fixSerializedContent(line, replacements) | ||
| } | ||
|
|
||
| Debugf("Doing global replacements: %s\n", string(*line)) | ||
| // Catch anything left | ||
| for _, replacement := range replacements { | ||
| *line = bytes.ReplaceAll(*line, replacement.From, replacement.To) | ||
| Debugf("After global replacement (from: %s | to: %s): %s\n", replacement.From, replacement.To, string(*line)) | ||
| } | ||
|
|
||
| Debugf("All done: %s\n", string(*line)) | ||
|
|
||
| return line | ||
| } | ||
|
|
||
| func fixSerializedContent(line *[]byte, replacements []*Replacement) *[]byte { | ||
| index := 0 | ||
|
|
||
| var rebuiltLine []byte | ||
|
|
||
| for index < len(*line) { | ||
| Debugf("Start of loop, index: %d\n", index) | ||
| linePart := (*line)[index:] | ||
abdullah-kasim marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| details, err := parseEscapedData(linePart) | ||
abdullah-kasim marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| if err != nil { | ||
| if err.Error() == "could not find serialized string prefix" && index == 0 { | ||
| return line | ||
| } | ||
| // we've run out of things to parse, so just break out and append the rest | ||
| rebuiltLine = append(rebuiltLine, linePart...) | ||
| break | ||
| } | ||
|
|
||
| rebuiltLine = append(rebuiltLine, (*line)[index:index+details.CurrentPartIndex]...) | ||
|
|
||
| index = index + details.NextPartIndex | ||
|
|
||
| content := linePart[details.ContentStartIndex : details.ContentEndIndex+1] | ||
|
|
||
| updatedContent := replaceInSerializedBytes(content, replacements) | ||
|
|
||
| // php needs the unescaped length, so let's unescape it and measure the length | ||
| contentLength := len(unescapeContent(updatedContent)) | ||
|
|
||
| // but if the content never changed, we'll let the error be for safety. | ||
| if bytes.Equal(content, updatedContent) { | ||
| contentLength = details.OriginalByteSize | ||
| } | ||
|
|
||
| // and we rebuild the string | ||
| rebuilt := "s:" + strconv.Itoa(contentLength) + ":\\\"" + string(updatedContent) + "\\\";" | ||
|
|
||
| rebuiltLine = append(rebuiltLine, []byte(rebuilt)...) | ||
| } | ||
|
|
||
| return &rebuiltLine | ||
| } | ||
|
|
||
| func replaceInSerializedBytes(serialized []byte, replacements []*Replacement) []byte { | ||
| for _, replacement := range replacements { | ||
| serialized = bytes.ReplaceAll(serialized, replacement.From, replacement.To) | ||
| } | ||
| return serialized | ||
| } | ||
|
|
||
| var serializedStringPrefixRegexp = regexp.MustCompile(`s:(\d+):`) | ||
abdullah-kasim marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| // Parses escaped data, returning the location details for further parsing | ||
| func parseEscapedData(linePart []byte) (*EscapedDataDetails, error) { | ||
|
|
||
| details := EscapedDataDetails{ | ||
| ContentStartIndex: 0, | ||
| ContentEndIndex: 0, | ||
| NextPartIndex: 0, | ||
| CurrentPartIndex: 0, | ||
| OriginalByteSize: 0, | ||
| } | ||
|
|
||
| // find starting point in the line | ||
| //TODO: We should first check if we found the string when inside a quote or not. | ||
| // but currently skipping that scenario because it seems unlikely to find it outside. | ||
|
||
| match := serializedStringPrefixRegexp.FindSubmatchIndex(linePart) | ||
| if match == nil { | ||
| return nil, fmt.Errorf("could not find serialized string prefix") | ||
| } | ||
|
|
||
| matchedAt := match[0] | ||
| originalBytes := linePart[match[2]:match[3]] | ||
|
|
||
| details.OriginalByteSize, _ = strconv.Atoi(string(originalBytes)) | ||
|
|
||
| details.CurrentPartIndex = matchedAt | ||
|
|
||
| // the following assumes escaped double quotes | ||
| //TODO: MySQL can optionally not escape the double quote, | ||
| // but generally sqldumps always include the quotes. | ||
| initialContentIndex := match[3] + 3 | ||
|
|
||
| details.ContentStartIndex = initialContentIndex | ||
|
|
||
| currentContentIndex := initialContentIndex | ||
|
|
||
| contentByteCount := 0 | ||
|
|
||
| var nextPartIndex int | ||
|
|
||
| backslash := byte('\\') | ||
| semicolon := byte(';') | ||
| quote := byte('"') | ||
| nextPartFound := false | ||
|
|
||
| secondMatch := serializedStringPrefixRegexp.FindSubmatchIndex(linePart[matchedAt+1:]) | ||
|
|
||
| maxIndex := len(linePart) - 1 | ||
|
|
||
| if secondMatch != nil { | ||
| maxIndex = secondMatch[0] + matchedAt | ||
| } | ||
|
|
||
| // let's find where the content actually ends. | ||
| // it should end when the unescaped value is `";` | ||
| for currentContentIndex < len(linePart) { | ||
| if currentContentIndex+2 > maxIndex { | ||
|
|
||
| // this algorithm SHOULD work, but in cases where the original byte count does not match | ||
| // the actual byte count, it'll error out. We'll add this safeguard here. | ||
| return nil, fmt.Errorf("faulty data, byte count does not match data size") | ||
| } | ||
| char := linePart[currentContentIndex] | ||
| secondChar := linePart[currentContentIndex+1] | ||
| thirdChar := linePart[currentContentIndex+2] | ||
| if char == backslash && contentByteCount < details.OriginalByteSize { | ||
| unescapedBytePair := getUnescapedBytesIfEscaped(linePart[currentContentIndex : currentContentIndex+2]) | ||
| // if we get the byte pair without the backslash, it corresponds to a byte | ||
| contentByteCount += len(unescapedBytePair) | ||
|
|
||
| // content index count remains the same. | ||
| currentContentIndex += 2 | ||
| continue | ||
| } | ||
|
|
||
| if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= details.OriginalByteSize { | ||
|
|
||
| // since we've filtered out all the escaped value already, this should be the actual end | ||
| nextPartIndex = currentContentIndex + 3 | ||
| details.NextPartIndex = nextPartIndex | ||
| // we're at backslash, so we need to minus 1 to get the index where the content finishes | ||
| details.ContentEndIndex = currentContentIndex - 1 | ||
| nextPartFound = true | ||
| break | ||
| } | ||
|
|
||
| contentByteCount++ | ||
| currentContentIndex++ | ||
| } | ||
|
|
||
| if nextPartFound == false { | ||
| return nil, fmt.Errorf("end of serialized string not found") | ||
| } | ||
|
|
||
| return &details, nil | ||
| } | ||
|
|
||
// getUnescapedBytesIfEscaped decodes one escape pair.
//
// charPair is expected to hold two bytes. When it starts with a backslash and
// the second byte names a known escape, the single byte the pair stands for
// is returned in a new one-byte slice. In every other case - no leading
// backslash, an unknown escape, or fewer than two bytes - charPair is
// returned unchanged, so callers can tell the cases apart by the length of
// the result.
func getUnescapedBytesIfEscaped(charPair []byte) []byte {
	// If the first byte is not a backslash there is nothing to unescape. A
	// slice that is too short to be a pair is handed back as well.
	if len(charPair) < 2 || charPair[0] != '\\' {
		return charPair
	}

	// A switch reports "unknown escape" through its default branch, so an
	// escape that decodes to the zero byte (\0) is not confused with a miss.
	var unescaped byte
	switch charPair[1] {
	case '\\', '\'', '"':
		unescaped = charPair[1]
	case 'n':
		unescaped = '\n'
	case 'r':
		unescaped = '\r'
	case 't':
		unescaped = '\t'
	case 'b':
		unescaped = '\b'
	case 'f':
		unescaped = '\f'
	case '0':
		unescaped = 0x00
	case 'Z':
		// MySQL writes ASCII 26 (Control+Z) as \Z.
		unescaped = 0x1A
	default:
		// Not a valid escape: leave the pair alone, it is taken literally.
		return charPair
	}

	return []byte{unescaped}
}
|
|
||
| func unescapeContent(escaped []byte) []byte { | ||
| unescapedBytes := make([]byte, 0, len(escaped)) | ||
| index := 0 | ||
|
|
||
| // only applies to content - do not apply to raw mysql query | ||
| // tested with php -i, mysql client, and mysqldump and mydumper. | ||
| // 1. \" in dump becomes " when inserting a mysql row. | ||
| // 2. \\ in dump becomes \ when inserting a mysql row. | ||
| // 3. \' in dump becomes ' when inserting a mysql row. | ||
| // 4. mysql translates newline into \n when creating a mysqldump. Same applies to carriage return. | ||
| // 5. PHP serialize does not convert the bytes \r or \n into something else - they're as-is. | ||
| // 6. If using single quotes in php, \r and \n does not get converted into bytes - they become literal backslash and letter. | ||
| // Generally, to unescape, we need to do the following: | ||
| // 1. Convert \\ to \ | ||
| // 2. Convert \' to ' | ||
| // 3. Convert \" to " | ||
|
|
||
abdullah-kasim marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| backslash := byte('\\') | ||
|
|
||
| for index < len(escaped) { | ||
|
|
||
| if escaped[index] == backslash { | ||
| unescapedBytePair := getUnescapedBytesIfEscaped(escaped[index : index+2]) | ||
| byteLength := len(unescapedBytePair) | ||
|
|
||
| if byteLength == 1 { | ||
| unescapedBytes = append(unescapedBytes, unescapedBytePair...) | ||
| index = index + 2 | ||
| continue | ||
| } | ||
| } | ||
|
|
||
| unescapedBytes = append(unescapedBytes, escaped[index]) | ||
| index++ | ||
| } | ||
|
|
||
| return unescapedBytes | ||
| } | ||
|
|
||
| func replaceAndFix(line *[]byte, replacements []*Replacement) *[]byte { | ||
| for _, replacement := range replacements { | ||
| if !bytes.Contains(*line, replacement.From) { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.