Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
622e13c
Add new tests to catch issues with serialized markers
mjangda Feb 7, 2024
8d40ea3
Add additional tests for line fix function
mjangda Feb 7, 2024
6c08440
Properly handle serialized strings
mjangda Feb 7, 2024
075f58f
Make it faster to run when there are no changes
mjangda Feb 8, 2024
331f65d
Merge branch 'trunk' into update/serialized-lengths
mjangda Mar 21, 2024
53b2087
Rewrote the serialization fix to employ a mini-parser that consists o…
abdullah-kasim Oct 7, 2024
a960aaa
Fix wrong last-index offset for complex serialized object, and fix te…
abdullah-kasim Oct 7, 2024
6c3f03c
Clarify on how `unescapeContent` works based on the latest understanding
abdullah-kasim Oct 8, 2024
77695d9
Reword how getUnescapedBytesIfEscaped works
abdullah-kasim Oct 8, 2024
17a5a0a
Use a stricter regex for detecting a serialized value
abdullah-kasim Oct 8, 2024
9a52030
Fix case where the non-serial replacement is applying itself to the s…
abdullah-kasim Oct 8, 2024
1652adf
Fix case where we have both serial and non-serial replacement
abdullah-kasim Oct 8, 2024
dff5ac8
Massively simplify the solution to decrease reliance on indexes and f…
abdullah-kasim Oct 8, 2024
03c2e62
Remove unused structs
abdullah-kasim Oct 8, 2024
3e90a3b
Add tests for escaped delimiters
abdullah-kasim Oct 9, 2024
eeeaa7a
Switch TestMultiReplace to run for the new, replacement function instead
abdullah-kasim Oct 9, 2024
cbb2f61
Merge branch 'trunk' into update/seralized-lengths-v2
mjangda Oct 9, 2024
c8f0432
Remove TODO and make the not-checking-for-quotes behavior something t…
abdullah-kasim Oct 14, 2024
a0f60b5
Removed another TODO and more comments on why we're shifting by 3
abdullah-kasim Oct 14, 2024
1e23e66
Better error messages, and separate out error messages from different…
abdullah-kasim Oct 14, 2024
399f397
Remove Debugf
abdullah-kasim Oct 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ test:
go test -v ./...
go test -bench .

bench:
go test -bench .

clean:
rm -rf ${BUILDDIR}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module github.com/Automattic/go-search-replace

go 1.16
go 1.23.2
22 changes: 22 additions & 0 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,25 @@ func TestMultipleReplaceWithoutNewlineAtEOF(t *testing.T) {
expected := "Space, the final frontier!\nCheck out: warp://ncc-1701-d.space/decks/10/areas/forward"
doMainTest(t, input, expected, mainArgs)
}

// TestSerializedReplaceWithCss checks a replacement that lands inside a
// PHP-serialized string holding a CSS snippet. The replacement URL is two bytes
// shorter than the original, so the expected output carries the new URL and a
// length prefix lowered from 208 to 206; the other serialized strings are
// expected to stay as they are.
// NOTE(review): doMainTest is defined outside this view; presumably it runs the
// program with mainArgs against input and compares the output to expected.
func TestSerializedReplaceWithCss(t *testing.T) {
mainArgs := []string{
"https://uss-enterprise.com",
"https://ncc-1701-d.space",
}

input := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:208:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
expected := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:206:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
doMainTest(t, input, expected, mainArgs)
}

// TestSerializedReplaceWithCssAndUnrelatedSerializationMarker is the same
// scenario as the plain CSS test, except that the CSS itself contains
// `content: \"▼\"; }`. That fragment includes the byte sequence `\";`, which
// looks like the end of a serialized string but sits in the middle of the
// content, and it also includes a multi-byte character. The expected output
// still has the URL replaced and the length prefix lowered from 239 to 237.
// NOTE(review): doMainTest is defined outside this view; presumably it runs the
// program with mainArgs against input and compares the output to expected.
func TestSerializedReplaceWithCssAndUnrelatedSerializationMarker(t *testing.T) {
mainArgs := []string{
"https://uss-enterprise.com",
"https://ncc-1701-d.space",
}

input := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:239:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
expected := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:237:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
doMainTest(t, input, expected, mainArgs)
}
225 changes: 224 additions & 1 deletion search-replace.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ type Replacement struct {
To []byte
}

// SerializedReplaceResult is what fixLineWithSerializedData returns for one
// pass over a line fragment: the fragment split around the first serialized
// string found in it.
type SerializedReplaceResult struct {
// Pre holds the bytes that preceded the serialized string, with the
// replacements already applied.
Pre []byte
// SerializedPortion holds the rebuilt serialized string (length prefix,
// replaced content and terminator); it is empty when none was found.
SerializedPortion []byte
// Post holds the unprocessed bytes that followed the serialized string.
Post []byte
}

func main() {
versionFlag := flag.Bool("version", false, "Show version information")
flag.Parse()
Expand Down Expand Up @@ -113,7 +119,7 @@ func main() {

go func(line *[]byte) {
defer wg.Done()
line = replaceAndFix(line, replacements)
line = fixLine(line, replacements)
ch <- *line
}(&line)
}
Expand All @@ -129,6 +135,223 @@ func main() {
}
}

// debugMode switches the diagnostic output of Debugf on or off. It is off by
// default.
var debugMode bool

// Debugf writes a formatted diagnostic message with fmt.Printf, but only while
// debugMode is enabled; otherwise the call is a no-op.
func Debugf(format string, args ...interface{}) {
	if !debugMode {
		return
	}
	fmt.Printf(format, args...)
}

// fixLine applies replacements to the line that line points at, keeping the
// length prefixes of any serialized strings in step with their new content.
//
// The line is consumed piece by piece: each call to fixLineWithSerializedData
// handles everything up to and including the next serialized string and hands
// back the remainder, which is fed in again until nothing is left. If a piece
// cannot be parsed, the unprocessed remainder is appended as-is (no
// replacements are applied to it) and processing stops.
//
// The rebuilt bytes are stored back through line, and line itself is returned.
func fixLine(line *[]byte, replacements []*Replacement) *[]byte {
	// Converting the whole line to a string copies it, so only do that when the
	// message is actually going to be printed.
	if debugMode {
		Debugf("Doing global replacements: %s\n", string(*line))
	}

	linePart := *line

	// The result is usually close to the input size; start with that capacity.
	rebuiltLine := make([]byte, 0, len(linePart))

	for len(linePart) > 0 {
		result, err := fixLineWithSerializedData(linePart, replacements)
		if err != nil {
			Debugf("Error when trying to fix line : %s\n", err.Error())
			rebuiltLine = append(rebuiltLine, linePart...)
			break
		}
		rebuiltLine = append(rebuiltLine, result.Pre...)
		rebuiltLine = append(rebuiltLine, result.SerializedPortion...)
		linePart = result.Post
	}

	*line = rebuiltLine

	if debugMode {
		Debugf("All done: %s\n", string(*line))
	}

	return line
}

// replaceByPart runs every replacement over part, in the order given, and
// returns the final bytes. Each step replaces all occurrences of From with To.
func replaceByPart(part []byte, replacements []*Replacement) []byte {
	current := part
	for i := range replacements {
		r := replacements[i]
		current = bytes.Replace(current, r.From, r.To, -1)
	}
	return current
}

// serializedStringPrefixRegexp matches the opening of a serialized string whose
// double quote is backslash-escaped: `s:`, a decimal length (captured as group
// 1), `:` and then `\"`.
var serializedStringPrefixRegexp = regexp.MustCompile(`s:(\d+):\\"`)

func fixLineWithSerializedData(linePart []byte, replacements []*Replacement) (*SerializedReplaceResult, error) {

// find starting point in the line
//TODO: We should first check if we found the string when inside a quote or not.
// but currently skipping that scenario because it seems unlikely to find it outside.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather hard to do, but it's possible that the regex matches inside say, a JSON, and it'd have some sort of multi-layer escaping. Someone might also accidentally use the same combination of string i.e. s:123:", as maybe he's writing a programming tutorial. So we'd accidentally match these, and we should match as little of these as possible, because it'll affect the parser's stability.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we just remove the TODO and leave the comment as a note about the edge case?

Copy link
Contributor Author

@abdullah-kasim abdullah-kasim Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, that makes sense.

match := serializedStringPrefixRegexp.FindSubmatchIndex(linePart)
if match == nil {
return &SerializedReplaceResult{
Pre: replaceByPart(linePart, replacements),
SerializedPortion: []byte{},
Post: []byte{},
}, nil
}

pre := append([]byte{}, linePart[:match[0]]...)

pre = replaceByPart(pre, replacements)

if pre == nil {
pre = []byte{}
}

originalBytes := linePart[match[2]:match[3]]

originalByteSize, _ := strconv.Atoi(string(originalBytes))

// the following assumes escaped double quotes
//TODO: MySQL can optionally not escape the double quote,
// but generally sqldumps always include the quotes.
contentStartIndex := match[3] + 3

currentContentIndex := contentStartIndex

contentByteCount := 0

contentEndIndex := 0

var nextSliceIndex int

backslash := byte('\\')
semicolon := byte(';')
quote := byte('"')
nextSliceFound := false

maxIndex := len(linePart) - 1

// let's find where the content actually ends.
// it should end when the unescaped value is `";`
for currentContentIndex < len(linePart) {
if currentContentIndex+2 > maxIndex {

// this algorithm SHOULD work, but in cases where the original byte count does not match
// the actual byte count, it'll error out. We'll add this safeguard here.
return nil, fmt.Errorf("faulty data, byte count does not match data size")
}
char := linePart[currentContentIndex]
secondChar := linePart[currentContentIndex+1]
thirdChar := linePart[currentContentIndex+2]
if char == backslash && contentByteCount < originalByteSize {
unescapedBytePair := getUnescapedBytesIfEscaped(linePart[currentContentIndex : currentContentIndex+2])
// if we get the byte pair without the backslash, it corresponds to a byte
contentByteCount += len(unescapedBytePair)

// content index count remains the same.
currentContentIndex += 2
continue
}

if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= originalByteSize {

// we're at backslash

// index of the beginning of the next slice
nextSliceIndex = currentContentIndex + 3
// we're at backslash, so we need to minus 1 to get the index where the content finishes
contentEndIndex = currentContentIndex - 1
nextSliceFound = true
break
}

if contentByteCount > originalByteSize {
return nil, fmt.Errorf("faulty data, byte count does not match data size")
}

contentByteCount++
currentContentIndex++
}

content := append([]byte{}, linePart[contentStartIndex:contentEndIndex+1]...)

content = replaceByPart(content, replacements)

contentLength := len(unescapeContent(content))

// and we rebuild the string
rebuiltSerializedString := "s:" + strconv.Itoa(contentLength) + ":\\\"" + string(content) + "\\\";"

if nextSliceFound == false {
return nil, fmt.Errorf("end of serialized string not found")
}

result := SerializedReplaceResult{
Pre: pre,
SerializedPortion: []byte(rebuiltSerializedString),
Post: linePart[nextSliceIndex:],
}

return &result, nil
}

// getUnescapedBytesIfEscaped looks at a two-byte window and reports the bytes
// it stands for.
//
// If the window is a backslash followed by a recognised escape character, the
// single byte the escape represents is returned (for example `\n` yields a
// newline and `\0` yields a NUL byte). In every other case, including a window
// that does not start with a backslash, an unrecognised escape, or a window
// shorter than two bytes, charPair is returned unchanged.
func getUnescapedBytesIfEscaped(charPair []byte) []byte {
	// Without a leading backslash and a byte after it there is nothing to
	// unescape; the length check also keeps short input from panicking.
	if len(charPair) < 2 || charPair[0] != '\\' {
		return charPair
	}

	// A switch is used rather than a map lookup compared against zero, because
	// `\0` legitimately unescapes to the zero byte.
	switch charPair[1] {
	case '\\', '\'', '"':
		return []byte{charPair[1]}
	case 'n':
		return []byte{'\n'}
	case 'r':
		return []byte{'\r'}
	case 't':
		return []byte{'\t'}
	case 'b':
		return []byte{'\b'}
	case 'f':
		return []byte{'\f'}
	case '0':
		return []byte{0x00}
	case 'Z':
		// MySQL's escape for ASCII 26 (Control+Z).
		return []byte{0x1a}
	}

	// What if it's not a valid escape? Do nothing - both bytes are kept as they are.
	return charPair
}

// unescapeContent returns a copy of escaped in which every recognised
// backslash escape pair is collapsed to the single byte it represents. All
// other bytes, including unrecognised escape pairs and a backslash that is the
// very last byte, are copied through unchanged.
//
// Only applies to the content of a string - do not apply it to a raw MySQL
// query. Tested with php -i, mysql client, mysqldump and mydumper:
//  1. MySQL translates certain bytes to `\<char>`, e.g. `\n`, so these need
//     unescaping to get the correct byte length. See getUnescapedBytesIfEscaped.
//  2. PHP serialize does not convert raw bytes into `\<char>` - they are stored
//     as-is, so the unescaped form is what the length prefix counts.
func unescapeContent(escaped []byte) []byte {
	unescapedBytes := make([]byte, 0, len(escaped))

	for index := 0; index < len(escaped); {
		// Only inspect a pair when a byte really follows the backslash;
		// slicing two bytes at the end of the input would be out of range.
		if escaped[index] == '\\' && index+1 < len(escaped) {
			unescapedBytePair := getUnescapedBytesIfEscaped(escaped[index : index+2])

			// A single byte back means the pair was a recognised escape.
			if len(unescapedBytePair) == 1 {
				unescapedBytes = append(unescapedBytes, unescapedBytePair...)
				index += 2
				continue
			}
		}

		unescapedBytes = append(unescapedBytes, escaped[index])
		index++
	}

	return unescapedBytes
}

func replaceAndFix(line *[]byte, replacements []*Replacement) *[]byte {
for _, replacement := range replacements {
if !bytes.Contains(*line, replacement.From) {
Expand Down
Loading
Loading