Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
622e13c
Add new tests to catch issues with serialized markers
mjangda Feb 7, 2024
8d40ea3
Add additional tests for line fix function
mjangda Feb 7, 2024
6c08440
Properly handle serialized strings
mjangda Feb 7, 2024
075f58f
Make it faster to run when there are no changes
mjangda Feb 8, 2024
331f65d
Merge branch 'trunk' into update/serialized-lengths
mjangda Mar 21, 2024
53b2087
Rewrote the serialization fix to employ a mini-parser that consists o…
abdullah-kasim Oct 7, 2024
a960aaa
Fix wrong last-index offset for complex serialized object, and fix te…
abdullah-kasim Oct 7, 2024
6c3f03c
Clarify on how `unescapeContent` works based on the latest understanding
abdullah-kasim Oct 8, 2024
77695d9
Reword how getUnescapedBytesIfEscaped works
abdullah-kasim Oct 8, 2024
17a5a0a
Use a stricter regex for detecting a serialized value
abdullah-kasim Oct 8, 2024
9a52030
Fix case where the non-serial replacement is applying itself to the s…
abdullah-kasim Oct 8, 2024
1652adf
Fix case where we have both serial and non-serial replacement
abdullah-kasim Oct 8, 2024
dff5ac8
Massively simplify the solution to decrease reliance on indexes and f…
abdullah-kasim Oct 8, 2024
03c2e62
Remove unused structs
abdullah-kasim Oct 8, 2024
3e90a3b
Add tests for escaped delimiters
abdullah-kasim Oct 9, 2024
eeeaa7a
Switch TestMultiReplace to run for the new, replacement function instead
abdullah-kasim Oct 9, 2024
cbb2f61
Merge branch 'trunk' into update/seralized-lengths-v2
mjangda Oct 9, 2024
c8f0432
Remove TODO and make the not-checking-for-quotes behavior something t…
abdullah-kasim Oct 14, 2024
a0f60b5
Removed another TODO and more comments on why we're shifting by 3
abdullah-kasim Oct 14, 2024
1e23e66
Better error messages, and separate out error messages from different…
abdullah-kasim Oct 14, 2024
399f397
Remove Debugf
abdullah-kasim Oct 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ test:
go test -v ./...
go test -bench .

bench:
go test -bench .

clean:
rm -rf ${BUILDDIR}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module github.com/Automattic/go-search-replace

go 1.16
go 1.23.2
22 changes: 22 additions & 0 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,25 @@ func TestMultipleReplaceWithoutNewlineAtEOF(t *testing.T) {
expected := "Space, the final frontier!\nCheck out: warp://ncc-1701-d.space/decks/10/areas/forward"
doMainTest(t, input, expected, mainArgs)
}

// TestSerializedReplaceWithCss runs the tool end-to-end on a serialized array
// whose "css" entry embeds the URL being replaced. The replacement URL is two
// bytes shorter than the original, so the declared string length is expected
// to drop from 208 to 206.
func TestSerializedReplaceWithCss(t *testing.T) {
	const (
		from = "https://uss-enterprise.com"
		to   = "https://ncc-1701-d.space"
	)

	serialized := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:208:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
	want := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:206:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`

	doMainTest(t, serialized, want, []string{from, to})
}

// TestSerializedReplaceWithCssAndUnrelatedSerializationMarker is the same
// scenario as the plain CSS test, except the CSS itself contains an escaped
// quote followed by a semicolon (`\"▼\";`), which looks like the end of a
// serialized string. The declared length is expected to go from 239 to 237
// and the embedded marker must be left alone.
func TestSerializedReplaceWithCssAndUnrelatedSerializationMarker(t *testing.T) {
	const (
		from = "https://uss-enterprise.com"
		to   = "https://ncc-1701-d.space"
	)

	serialized := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:239:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
	want := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:237:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`

	doMainTest(t, serialized, want, []string{from, to})
}
260 changes: 259 additions & 1 deletion search-replace.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ type Replacement struct {
To []byte
}

// EscapedDataDetails records where one serialized string sits inside the
// slice handed to parseEscapedData. Every index is relative to that slice.
type EscapedDataDetails struct {
	// ContentStartIndex is the index of the first content byte (just past the
	// opening escaped quote); ContentEndIndex is the index of the last content
	// byte, inclusive.
	ContentStartIndex, ContentEndIndex int

	// NextPartIndex is the index just past the closing `\";`, i.e. where
	// parsing of the remainder resumes. CurrentPartIndex is the index at which
	// the `s:<n>:` prefix was matched.
	NextPartIndex, CurrentPartIndex int

	// OriginalByteSize is the length declared in the `s:<n>:` prefix.
	OriginalByteSize int
}

func main() {
versionFlag := flag.Bool("version", false, "Show version information")
flag.Parse()
Expand Down Expand Up @@ -113,7 +121,7 @@ func main() {

go func(line *[]byte) {
defer wg.Done()
line = replaceAndFix(line, replacements)
line = fixLine(line, replacements)
ch <- *line
}(&line)
}
Expand All @@ -129,6 +137,256 @@ func main() {
}
}

// debugMode switches the diagnostic output of Debugf on or off. It is off by
// default.
var debugMode = false

// Debugf forwards format and args to fmt.Printf when debugMode is enabled and
// is a no-op otherwise.
func Debugf(format string, args ...interface{}) {
	if !debugMode {
		return
	}
	fmt.Printf(format, args...)
}

// fixLine applies every replacement to a single line of input and returns a
// pointer to the resulting bytes.
//
// Lines containing "s:" are first passed through fixSerializedContent so that
// the declared lengths of serialized strings are kept in step with their
// replaced content. After that, each replacement is applied to the whole line
// to catch occurrences outside serialized strings. The slice behind the
// returned pointer is overwritten by that second pass.
//
// NOTE(review): the whole-line pass also re-scans content that
// fixSerializedContent already rewrote. If a replacement's To contains its
// From, it would be applied a second time there and the stored length would
// no longer match - confirm whether such replacements need to be supported.
func fixLine(line *[]byte, replacements []*Replacement) *[]byte {
	if bytes.Contains(*line, []byte("s:")) {
		line = fixSerializedContent(line, replacements)
	}

	// The line is handed to Debugf as a []byte: %s prints a byte slice as
	// text, and this avoids copying the entire line into a string on every
	// call when debugging is switched off.
	Debugf("Doing global replacements: %s\n", *line)

	// Catch anything left
	for _, replacement := range replacements {
		*line = bytes.ReplaceAll(*line, replacement.From, replacement.To)
		Debugf("After global replacement (from: %s | to: %s): %s\n", replacement.From, replacement.To, *line)
	}

	Debugf("All done: %s\n", *line)

	return line
}

// fixSerializedContent rebuilds line so that every serialized string found by
// parseEscapedData has the replacements applied to its content and its
// declared length updated to match.
//
// The line is consumed left to right. Text in front of each serialized string
// is copied through untouched, and the string itself is re-emitted as
// `s:<length>:\"<content>\";`. When parsing fails, the unparsed remainder is
// copied through as-is and processing stops. If no serialized prefix exists
// at all, the original line pointer is returned unchanged.
func fixSerializedContent(line *[]byte, replacements []*Replacement) *[]byte {
	source := *line

	// The rebuilt line is normally about as long as the input, so reserve
	// that much up front instead of growing the buffer piecemeal.
	rebuiltLine := make([]byte, 0, len(source))

	index := 0
	for index < len(source) {
		Debugf("Start of loop, index: %d\n", index)
		linePart := source[index:]

		details, err := parseEscapedData(linePart)
		if err != nil {
			// The comparison is tied to the exact message produced by
			// parseEscapedData when no prefix is present.
			if err.Error() == "could not find serialized string prefix" && index == 0 {
				return line
			}
			// we've run out of things to parse, so just append the rest
			rebuiltLine = append(rebuiltLine, linePart...)
			break
		}

		// Everything in front of the `s:` prefix is passed through verbatim.
		rebuiltLine = append(rebuiltLine, linePart[:details.CurrentPartIndex]...)
		index += details.NextPartIndex

		content := linePart[details.ContentStartIndex : details.ContentEndIndex+1]
		updatedContent := replaceInSerializedBytes(content, replacements)

		// If the content never changed, keep the declared length exactly as
		// it was, even if it looks wrong. Otherwise PHP needs the unescaped
		// length, so unescape the new content and measure it.
		contentLength := details.OriginalByteSize
		if !bytes.Equal(content, updatedContent) {
			contentLength = len(unescapeContent(updatedContent))
		}

		// Rebuild the serialized string directly in the output buffer.
		rebuiltLine = append(rebuiltLine, "s:"...)
		rebuiltLine = strconv.AppendInt(rebuiltLine, int64(contentLength), 10)
		rebuiltLine = append(rebuiltLine, `:\"`...)
		rebuiltLine = append(rebuiltLine, updatedContent...)
		rebuiltLine = append(rebuiltLine, `\";`...)
	}

	return &rebuiltLine
}

// replaceInSerializedBytes applies the replacements, in the order given, to
// the content of a serialized string and returns the result. Each replacement
// operates on the output of the previous one.
func replaceInSerializedBytes(serialized []byte, replacements []*Replacement) []byte {
	result := serialized
	for i := range replacements {
		rule := replacements[i]
		result = bytes.ReplaceAll(result, rule.From, rule.To)
	}
	return result
}

var serializedStringPrefixRegexp = regexp.MustCompile(`s:(\d+):`)

// Parses escaped data, returning the location details for further parsing
func parseEscapedData(linePart []byte) (*EscapedDataDetails, error) {

details := EscapedDataDetails{
ContentStartIndex: 0,
ContentEndIndex: 0,
NextPartIndex: 0,
CurrentPartIndex: 0,
OriginalByteSize: 0,
}

// find starting point in the line
//TODO: We should first check if we found the string when inside a quote or not.
// but currently skipping that scenario because it seems unlikely to find it outside.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather hard to do, but it's possible that the regex matches inside say, a JSON, and it'd have some sort of multi-layer escaping. Someone might also accidentally use the same combination of string i.e. s:123:", as maybe he's writing a programming tutorial. So we'd accidentally match these, and we should match as little of these as possible, because it'll affect the parser's stability.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we just remove the TODO and leave the comment as a note about the edge case?

Copy link
Contributor Author

@abdullah-kasim abdullah-kasim Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, that makes sense.

match := serializedStringPrefixRegexp.FindSubmatchIndex(linePart)
if match == nil {
return nil, fmt.Errorf("could not find serialized string prefix")
}

matchedAt := match[0]
originalBytes := linePart[match[2]:match[3]]

details.OriginalByteSize, _ = strconv.Atoi(string(originalBytes))

details.CurrentPartIndex = matchedAt

// the following assumes escaped double quotes
//TODO: MySQL can optionally not escape the double quote,
// but generally sqldumps always include the quotes.
initialContentIndex := match[3] + 3

details.ContentStartIndex = initialContentIndex

currentContentIndex := initialContentIndex

contentByteCount := 0

var nextPartIndex int

backslash := byte('\\')
semicolon := byte(';')
quote := byte('"')
nextPartFound := false

secondMatch := serializedStringPrefixRegexp.FindSubmatchIndex(linePart[matchedAt+1:])

maxIndex := len(linePart) - 1

if secondMatch != nil {
maxIndex = secondMatch[0] + matchedAt
}

// let's find where the content actually ends.
// it should end when the unescaped value is `";`
for currentContentIndex < len(linePart) {
if currentContentIndex+2 > maxIndex {

// this algorithm SHOULD work, but in cases where the original byte count does not match
// the actual byte count, it'll error out. We'll add this safeguard here.
return nil, fmt.Errorf("faulty data, byte count does not match data size")
}
char := linePart[currentContentIndex]
secondChar := linePart[currentContentIndex+1]
thirdChar := linePart[currentContentIndex+2]
if char == backslash && contentByteCount < details.OriginalByteSize {
unescapedBytePair := getUnescapedBytesIfEscaped(linePart[currentContentIndex : currentContentIndex+2])
// if we get the byte pair without the backslash, it corresponds to a byte
contentByteCount += len(unescapedBytePair)

// content index count remains the same.
currentContentIndex += 2
continue
}

if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= details.OriginalByteSize {

// since we've filtered out all the escaped value already, this should be the actual end
nextPartIndex = currentContentIndex + 3
details.NextPartIndex = nextPartIndex
// we're at backslash, so we need to minus 1 to get the index where the content finishes
details.ContentEndIndex = currentContentIndex - 1
nextPartFound = true
break
}

contentByteCount++
currentContentIndex++
}

if nextPartFound == false {
return nil, fmt.Errorf("end of serialized string not found")
}

return &details, nil
}

// getUnescapedBytesIfEscaped decodes a two-byte escape sequence.
//
// charPair is expected to hold a backslash followed by one more byte. When
// the second byte is one of the recognised escapes (\\, \', \", \n, \r, \t,
// \b, \f, \0) a new one-byte slice holding the byte it stands for is
// returned. In every other case - no leading backslash, an unrecognised
// escape, or fewer than two bytes - charPair is returned unchanged, so callers
// can tell the two outcomes apart by the length of the result.
func getUnescapedBytesIfEscaped(charPair []byte) []byte {
	// Nothing to decode without a backslash and a byte following it.
	if len(charPair) < 2 || charPair[0] != '\\' {
		return charPair
	}

	var actualByte byte

	switch charPair[1] {
	case '\\', '\'', '"':
		// These escapes stand for the escaped character itself.
		actualByte = charPair[1]
	case 'n':
		actualByte = '\n'
	case 'r':
		actualByte = '\r'
	case 't':
		actualByte = '\t'
	case 'b':
		actualByte = '\b'
	case 'f':
		actualByte = '\f'
	case '0':
		// NUL is a legitimate result, so "recognised" must not be decided by
		// comparing the decoded byte against zero.
		actualByte = '\x00'
	default:
		// what if it's not a valid escape? Do nothing - the pair is kept
		// as-is and counts as two bytes.
		return charPair
	}

	return []byte{actualByte}
}

// unescapeContent returns a copy of escaped in which every escape pair that
// getUnescapedBytesIfEscaped recognises is replaced by the single byte it
// stands for. All other bytes - including a backslash that does not start a
// recognised escape - are copied through unchanged. The length of the result
// is the length PHP expects in the `s:<length>:` prefix.
//
// This only applies to content - do not apply it to a raw mysql query.
// Background, as tested with php -i, the mysql client, mysqldump and mydumper:
//  1. \" in a dump becomes " when inserting a mysql row.
//  2. \\ in a dump becomes \ when inserting a mysql row.
//  3. \' in a dump becomes ' when inserting a mysql row.
//  4. mysql translates a newline into \n when creating a mysqldump. The same
//     applies to a carriage return.
//  5. PHP serialize does not convert the bytes \r or \n into something else -
//     they're kept as-is.
//  6. With single quotes in PHP, \r and \n are not converted into bytes - they
//     stay a literal backslash and letter.
func unescapeContent(escaped []byte) []byte {
	const backslash = byte('\\')

	unescapedBytes := make([]byte, 0, len(escaped))

	for index := 0; index < len(escaped); {
		// A backslash can only start an escape pair when another byte follows
		// it; a trailing backslash is copied through like any other byte
		// rather than slicing past the end of the input.
		if escaped[index] == backslash && index+1 < len(escaped) {
			unescapedBytePair := getUnescapedBytesIfEscaped(escaped[index : index+2])

			// A one-byte result means the pair was a recognised escape.
			if len(unescapedBytePair) == 1 {
				unescapedBytes = append(unescapedBytes, unescapedBytePair...)
				index += 2
				continue
			}
		}

		unescapedBytes = append(unescapedBytes, escaped[index])
		index++
	}

	return unescapedBytes
}

func replaceAndFix(line *[]byte, replacements []*Replacement) *[]byte {
for _, replacement := range replacements {
if !bytes.Contains(*line, replacement.From) {
Expand Down
Loading