Automattic · abdullah-kasim · Oct 14, 2024 · Feb 7, 2024 · Feb 7, 2024 · Feb 7, 2024
diff --git a/Makefile b/Makefile
@@ -20,6 +20,9 @@ test:
 	go test -v ./...
 	go test -bench .
 
+# bench invokes "go test -bench ." on its own; the test target above also
+# runs the same command after "go test -v ./...".
+bench:
+	go test -bench .
+
 clean:
 	rm -rf ${BUILDDIR}
 

diff --git a/go.mod b/go.mod
@@ -1,3 +1,3 @@
 module github.com/Automattic/go-search-replace
 
-go 1.16
+go 1.23.2
diff --git a/main_test.go b/main_test.go
@@ -84,3 +84,25 @@ func TestMultipleReplaceWithoutNewlineAtEOF(t *testing.T) {
 	expected := "Space, the final frontier!\nCheck out: warp://ncc-1701-d.space/decks/10/areas/forward"
 	doMainTest(t, input, expected, mainArgs)
 }
+
+// TestSerializedReplaceWithCss replaces a URL that sits inside an escaped,
+// serialized CSS string and expects the declared `s:<len>` to follow the
+// content: the new host is two bytes shorter, so 208 becomes 206.
+func TestSerializedReplaceWithCss(t *testing.T) {
+	// The strings are raw literals, so every \r, \n and \" below is a
+	// two-character escape in the input; each one counts as a single byte
+	// towards the declared length.
+	input := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:208:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n  background-position: left center;\r\n    background-repeat: no-repeat; }\";}`
+	expected := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:206:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n  background-position: left center;\r\n    background-repeat: no-repeat; }\";}`
+
+	mainArgs := []string{"https://uss-enterprise.com", "https://ncc-1701-d.space"}
+	doMainTest(t, input, expected, mainArgs)
+}
+
+// TestSerializedReplaceWithCssAndUnrelatedSerializationMarker is the CSS case
+// again, but the CSS value itself contains the sequence `\";` (right after the
+// ▼ character). That sequence is part of the content, not the end of the
+// serialized string, so the declared length must still go from 239 to 237.
+func TestSerializedReplaceWithCssAndUnrelatedSerializationMarker(t *testing.T) {
+	// ▼ occupies three bytes in UTF-8; each backslash escape counts as one.
+	input := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:239:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n  background-position: left center;\r\n    background-repeat: no-repeat; }\";}`
+	expected := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:237:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n  background-position: left center;\r\n    background-repeat: no-repeat; }\";}`
+
+	mainArgs := []string{"https://uss-enterprise.com", "https://ncc-1701-d.space"}
+	doMainTest(t, input, expected, mainArgs)
+}
diff --git a/search-replace.go b/search-replace.go
@@ -38,6 +38,12 @@ type Replacement struct {
 	To   []byte
 }
 
+// SerializedReplaceResult is what fixLineWithSerializedData returns for one
+// scan step over a line.
+//
+//   - Pre is the text in front of the first serialized-string prefix, with the
+//     replacements already applied (the whole input when no prefix is found).
+//   - SerializedPortion is the rebuilt `s:<len>:\"...\";` string, or empty when
+//     no prefix is found.
+//   - Post is the input that follows the serialized string; the caller feeds
+//     it back in until nothing is left.
+type SerializedReplaceResult struct {
+	Pre, SerializedPortion, Post []byte
+}
+
 func main() {
 	versionFlag := flag.Bool("version", false, "Show version information")
 	flag.Parse()
@@ -113,7 +119,7 @@ func main() {
 
 			go func(line *[]byte) {
 				defer wg.Done()
-				line = replaceAndFix(line, replacements)
+				line = fixLine(line, replacements)
 				ch <- *line
 			}(&line)
 		}
@@ -129,6 +135,223 @@ func main() {
 	}
 }
 
+// debugMode turns on the diagnostics emitted by Debugf. It is off by default.
+var debugMode bool
+
+// Debugf formats its arguments like fmt.Printf and prints them to standard
+// output, but only while debugMode is true; otherwise it does nothing.
+func Debugf(format string, args ...interface{}) {
+	if !debugMode {
+		return
+	}
+	fmt.Printf(format, args...)
+}
+
+// fixLine applies replacements to *line and stores the result back in *line.
+//
+// The line is consumed one serialized string at a time: each call to
+// fixLineWithSerializedData yields the replaced text before the next
+// serialized string, the rebuilt serialized string itself, and the remainder
+// still to be processed. The loop ends when nothing remains.
+//
+// If fixLineWithSerializedData reports an error, the part of the line that has
+// not been processed yet is appended exactly as it was (no replacements are
+// applied to it) and processing stops.
+//
+// The same pointer that was passed in is returned.
+func fixLine(line *[]byte, replacements []*Replacement) *[]byte {
+	// Pass the byte slice itself: %s prints a []byte as text, so no
+	// string(*line) copy of the whole line is made just to build a message
+	// that Debugf discards whenever debug mode is off.
+	Debugf("Doing global replacements: %s\n", *line)
+
+	remaining := *line
+	var rebuilt []byte
+
+	for len(remaining) > 0 {
+		result, err := fixLineWithSerializedData(remaining, replacements)
+		if err != nil {
+			Debugf("Error when trying to fix line : %s\n", err.Error())
+			// Keep the unprocessed tail untouched rather than guessing.
+			rebuilt = append(rebuilt, remaining...)
+			break
+		}
+		rebuilt = append(rebuilt, result.Pre...)
+		rebuilt = append(rebuilt, result.SerializedPortion...)
+		remaining = result.Post
+	}
+
+	*line = rebuilt
+
+	Debugf("All done: %s\n", *line)
+
+	return line
+}
+
+// replaceByPart runs every replacement over part, in slice order, and returns
+// the final result. Each replacement operates on the output of the previous
+// one, and every occurrence of From is substituted with To.
+func replaceByPart(part []byte, replacements []*Replacement) []byte {
+	result := part
+	for i := range replacements {
+		rule := replacements[i]
+		result = bytes.ReplaceAll(result, rule.From, rule.To)
+	}
+	return result
+}
+
+// serializedStringPrefixRegexp matches the opening of a serialized string whose
+// double quote has been backslash-escaped, i.e. `s:<digits>:\"`. The single
+// capture group is the declared byte length.
+var serializedStringPrefixRegexp = regexp.MustCompile(`s:(\d+):\\"`)
+
+// fixLineWithSerializedData handles the first serialized string in linePart.
+//
+// It applies replacements to everything in front of the `s:<len>:\"` prefix,
+// applies them to the string's content, and re-emits the string with <len>
+// recomputed from the unescaped content. When linePart holds no prefix at all,
+// the whole input is returned in Pre with the replacements applied.
+//
+// Post in the result is the part of linePart after the closing `\";`. It is a
+// sub-slice of linePart and has not been processed yet.
+//
+// An error is returned when the declared length cannot be parsed, when the
+// input ends before a closing `\";` is reached, or when the content turns out
+// to be longer than the declared length.
+func fixLineWithSerializedData(linePart []byte, replacements []*Replacement) (*SerializedReplaceResult, error) {
+
+	// find starting point in the line
+	//TODO: We should first check if we found the string when inside a quote or not.
+	// but currently skipping that scenario because it seems unlikely to find it outside.
+	match := serializedStringPrefixRegexp.FindSubmatchIndex(linePart)
+	if match == nil {
+		return &SerializedReplaceResult{
+			Pre:               replaceByPart(linePart, replacements),
+			SerializedPortion: []byte{},
+			Post:              []byte{},
+		}, nil
+	}
+
+	// The regexp only admits digits, so the one way this can fail is a
+	// length that does not fit in an int.
+	originalByteSize, err := strconv.Atoi(string(linePart[match[2]:match[3]]))
+	if err != nil {
+		return nil, fmt.Errorf("invalid serialized string length: %w", err)
+	}
+
+	const (
+		backslash = '\\'
+		quote     = '"'
+		semicolon = ';'
+	)
+
+	// the following assumes escaped double quotes
+	//TODO: MySQL can optionally not escape the double quote,
+	// but generally sqldumps always include the quotes.
+	// match[1] is the end of the whole `s:<len>:\"` prefix, i.e. the first
+	// byte of the content.
+	contentStartIndex := match[1]
+
+	var (
+		contentByteCount int  // unescaped bytes seen so far
+		contentEndIndex  int  // index just past the last content byte
+		nextSliceIndex   int  // index just past the closing `\";`
+		nextSliceFound   bool // whether the closing `\";` was reached
+	)
+
+	// let's find where the content actually ends.
+	// it should end when the unescaped value is `";`
+	for i := contentStartIndex; i < len(linePart); {
+		if i+3 > len(linePart) {
+			// this algorithm SHOULD work, but in cases where the original byte count does not match
+			// the actual byte count, it'll error out. We'll add this safeguard here.
+			return nil, fmt.Errorf("faulty data, byte count does not match data size")
+		}
+
+		if linePart[i] == backslash {
+			if contentByteCount < originalByteSize {
+				// Still inside the content: the two-byte escape stands for
+				// however many bytes it unescapes to.
+				contentByteCount += len(getUnescapedBytesIfEscaped(linePart[i : i+2]))
+				i += 2
+				continue
+			}
+
+			// The declared length has been reached, so `\";` here is the
+			// real terminator and not part of the content.
+			if linePart[i+1] == quote && linePart[i+2] == semicolon {
+				contentEndIndex = i
+				nextSliceIndex = i + 3
+				nextSliceFound = true
+				break
+			}
+		}
+
+		if contentByteCount > originalByteSize {
+			return nil, fmt.Errorf("faulty data, byte count does not match data size")
+		}
+
+		contentByteCount++
+		i++
+	}
+
+	// This has to be checked before slicing: when the prefix is the very last
+	// thing in linePart the loop above never runs, and slicing with the
+	// zero-valued end index would panic.
+	if !nextSliceFound {
+		return nil, fmt.Errorf("end of serialized string not found")
+	}
+
+	// replaceByPart never writes to its input, so no defensive copies are
+	// needed. The three-index slice keeps a later append on Pre from reaching
+	// into the rest of linePart.
+	pre := replaceByPart(linePart[:match[0]:match[0]], replacements)
+	if pre == nil {
+		pre = []byte{}
+	}
+
+	content := replaceByPart(linePart[contentStartIndex:contentEndIndex], replacements)
+
+	// and we rebuild the string, with the length taken from the unescaped
+	// content because that is the byte count the prefix refers to.
+	length := strconv.Itoa(len(unescapeContent(content)))
+	rebuilt := make([]byte, 0, len(`s:`)+len(length)+len(`:\"`)+len(content)+len(`\";`))
+	rebuilt = append(rebuilt, `s:`...)
+	rebuilt = append(rebuilt, length...)
+	rebuilt = append(rebuilt, `:\"`...)
+	rebuilt = append(rebuilt, content...)
+	rebuilt = append(rebuilt, `\";`...)
+
+	return &SerializedReplaceResult{
+		Pre:               pre,
+		SerializedPortion: rebuilt,
+		Post:              linePart[nextSliceIndex:],
+	}, nil
+}
+
+// getUnescapedBytesIfEscaped decodes a single two-byte backslash escape.
+//
+// charPair is expected to hold two bytes. If it starts with a backslash and
+// the second byte names a known escape (\\, \', \", \n, \r, \t, \b, \f, \0 or
+// \Z), a one-byte slice with the decoded byte is returned. In every other case
+// (no leading backslash, an unknown escape, or fewer than two bytes) charPair
+// is returned unchanged, so the length of the result tells the caller how many
+// bytes the pair stands for.
+func getUnescapedBytesIfEscaped(charPair []byte) []byte {
+	// Nothing to decode without a leading backslash and a following byte.
+	if len(charPair) < 2 || charPair[0] != '\\' {
+		return charPair
+	}
+
+	// A switch is used instead of a map lookup that treats 0 as "missing":
+	// \0 legitimately decodes to the zero byte and must still count as a
+	// single byte. It also avoids building a map on every call.
+	var decoded byte
+	switch charPair[1] {
+	case '\\', '\'', '"':
+		decoded = charPair[1]
+	case 'n':
+		decoded = '\n'
+	case 'r':
+		decoded = '\r'
+	case 't':
+		decoded = '\t'
+	case 'b':
+		decoded = '\b'
+	case 'f':
+		decoded = '\f'
+	case '0':
+		decoded = 0x00
+	case 'Z':
+		// MySQL writes ASCII 26 (Control+Z) as \Z.
+		decoded = 0x1A
+	default:
+		// what if it's not a valid escape? Do nothing - it's considered as already escaped
+		return charPair
+	}
+
+	return []byte{decoded}
+}
+
+// unescapeContent returns a copy of escaped in which every recognised
+// two-byte backslash escape (see getUnescapedBytesIfEscaped) is replaced by
+// the single byte it stands for. All other bytes, including a backslash that
+// does not start a recognised escape, are copied through unchanged. The
+// length of the result is the byte length the serialized-string prefix has to
+// declare.
+//
+// only applies to content of a string - do not apply to raw mysql query
+// tested with php -i, mysql client, and mysqldump and mydumper.
+//  1. mysql translates certain bytes to `\<char>` i.e. `\n`. So these needs
+//     unescaping to get the correct byte length. See `getUnescapedBytesIfEscaped`
+//  2. PHP serialize does not convert raw bytes into `\<char>` - they're as-is,
+//     so we don't need to take into account of escaped value in byte length
+//     calculation.
+func unescapeContent(escaped []byte) []byte {
+	unescaped := make([]byte, 0, len(escaped))
+
+	for i := 0; i < len(escaped); {
+		// Only look at a pair when a second byte really exists. A backslash
+		// in the last position has nothing to escape, and reslicing past
+		// len(escaped) would either panic or read a byte that is not part of
+		// the content.
+		if escaped[i] == '\\' && i+1 < len(escaped) {
+			if pair := getUnescapedBytesIfEscaped(escaped[i : i+2]); len(pair) == 1 {
+				unescaped = append(unescaped, pair[0])
+				i += 2
+				continue
+			}
+		}
+
+		unescaped = append(unescaped, escaped[i])
+		i++
+	}
+
+	return unescaped
+}
+
 func replaceAndFix(line *[]byte, replacements []*Replacement) *[]byte {
 	for _, replacement := range replacements {
 		if !bytes.Contains(*line, replacement.From) {