Skip to content

Commit afaa434

Browse files
Merge pull request #43 from Automattic/update/seralized-lengths-v2
2 parents 20b5cf6 + 399f397 commit afaa434

File tree

5 files changed

+373
-7
lines changed

5 files changed

+373
-7
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ test:
2020
go test -v ./...
2121
go test -bench .
2222

23+
bench:
24+
go test -bench .
25+
2326
clean:
2427
rm -rf ${BUILDDIR}
2528

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
module github.com/Automattic/go-search-replace
22

3-
go 1.16
3+
go 1.23.2

main_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,25 @@ func TestMultipleReplaceWithoutNewlineAtEOF(t *testing.T) {
8484
expected := "Space, the final frontier!\nCheck out: warp://ncc-1701-d.space/decks/10/areas/forward"
8585
doMainTest(t, input, expected, mainArgs)
8686
}
87+
88+
func TestSerializedReplaceWithCss(t *testing.T) {
89+
mainArgs := []string{
90+
"https://uss-enterprise.com",
91+
"https://ncc-1701-d.space",
92+
}
93+
94+
input := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:208:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
95+
expected := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:206:\"body { color: #123456;\r\nborder-bottom: none; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
96+
doMainTest(t, input, expected, mainArgs)
97+
}
98+
99+
func TestSerializedReplaceWithCssAndUnrelatedSerializationMarker(t *testing.T) {
100+
mainArgs := []string{
101+
"https://uss-enterprise.com",
102+
"https://ncc-1701-d.space",
103+
}
104+
105+
input := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:239:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://uss-enterprise.com/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
106+
expected := `a:2:{s:3:\"key\";s:5:\"value\";s:3:\"css\";s:237:\"body { color: #123456;\r\nborder-bottom: none; }\r\nbody:after{ content: \"▼\"; }\r\ndiv.bg { background: url('https://ncc-1701-d.space/wp-content/uploads/main-bg.gif');\r\n background-position: left center;\r\n background-repeat: no-repeat; }\";}`
107+
doMainTest(t, input, expected, mainArgs)
108+
}

search-replace.go

Lines changed: 211 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ type Replacement struct {
3838
To []byte
3939
}
4040

41+
// SerializedReplaceResult is the outcome of processing one serialized string
// found in a line part. When no serialized string prefix is found, Pre holds
// the whole input with replacements applied and the other fields are empty.
type SerializedReplaceResult struct {
42+
// Pre is the text before the serialized string, with replacements applied.
Pre []byte
43+
// SerializedPortion is the rebuilt `s:<len>:\"...\";` string with replacements
// applied and the length prefix recomputed.
SerializedPortion []byte
44+
// Post is the unprocessed remainder of the input after the serialized string.
Post []byte
45+
}
46+
4147
func main() {
4248
versionFlag := flag.Bool("version", false, "Show version information")
4349
flag.Parse()
@@ -113,7 +119,7 @@ func main() {
113119

114120
go func(line *[]byte) {
115121
defer wg.Done()
116-
line = replaceAndFix(line, replacements)
122+
line = fixLine(line, replacements)
117123
ch <- *line
118124
}(&line)
119125
}
@@ -129,6 +135,210 @@ func main() {
129135
}
130136
}
131137

138+
func fixLine(line *[]byte, replacements []*Replacement) *[]byte {
139+
linePart := *line
140+
141+
var rebuiltLine []byte
142+
143+
for len(linePart) > 0 {
144+
result, err := fixLineWithSerializedData(linePart, replacements)
145+
if err != nil {
146+
rebuiltLine = append(rebuiltLine, linePart...)
147+
break
148+
}
149+
rebuiltLine = append(rebuiltLine, result.Pre...)
150+
rebuiltLine = append(rebuiltLine, result.SerializedPortion...)
151+
linePart = result.Post
152+
}
153+
154+
*line = rebuiltLine
155+
156+
return line
157+
}
158+
159+
func replaceByPart(part []byte, replacements []*Replacement) []byte {
160+
for _, replacement := range replacements {
161+
part = bytes.ReplaceAll(part, replacement.From, replacement.To)
162+
}
163+
return part
164+
}
165+
166+
// serializedStringPrefixRegexp matches the opening of a serialized string whose
// double quote is backslash-escaped, e.g. `s:5:\"`. The single capture group
// holds the declared length digits.
var serializedStringPrefixRegexp = regexp.MustCompile(`s:(\d+):\\"`)
167+
168+
func fixLineWithSerializedData(linePart []byte, replacements []*Replacement) (*SerializedReplaceResult, error) {
169+
170+
// find starting point in the line
171+
// We're not checking if we found the serialized string prefix inside a quote or not.
172+
// Currently skipping that scenario because it seems unlikely to find it outside.
173+
match := serializedStringPrefixRegexp.FindSubmatchIndex(linePart)
174+
if match == nil {
175+
return &SerializedReplaceResult{
176+
Pre: replaceByPart(linePart, replacements),
177+
SerializedPortion: []byte{},
178+
Post: []byte{},
179+
}, nil
180+
}
181+
182+
pre := append([]byte{}, linePart[:match[0]]...)
183+
184+
pre = replaceByPart(pre, replacements)
185+
186+
if pre == nil {
187+
pre = []byte{}
188+
}
189+
190+
originalBytes := linePart[match[2]:match[3]]
191+
192+
originalByteSize, _ := strconv.Atoi(string(originalBytes))
193+
194+
// the following assumes escaped double quotes
195+
// i.e. s:5:\"x -> we'll need to shift our index from '5' to 'x' - hence shifting by 3
196+
// MySQL can optionally not escape the double quote,
197+
// but generally sqldumps always include the quotes.
198+
contentStartIndex := match[3] + 3
199+
200+
currentContentIndex := contentStartIndex
201+
202+
contentByteCount := 0
203+
204+
contentEndIndex := 0
205+
206+
var nextSliceIndex int
207+
208+
backslash := byte('\\')
209+
semicolon := byte(';')
210+
quote := byte('"')
211+
nextSliceFound := false
212+
213+
maxIndex := len(linePart) - 1
214+
215+
// let's find where the content actually ends.
216+
// it should end when the unescaped value is `";`
217+
for currentContentIndex < len(linePart) {
218+
if currentContentIndex+2 > maxIndex {
219+
220+
// this algorithm SHOULD work, but in cases where the original byte count does not match
221+
// the actual byte count, it'll error out. We'll add this safeguard here.
222+
return nil, fmt.Errorf("faulty serialized data: out-of-bound index access detected")
223+
}
224+
char := linePart[currentContentIndex]
225+
secondChar := linePart[currentContentIndex+1]
226+
thirdChar := linePart[currentContentIndex+2]
227+
if char == backslash && contentByteCount < originalByteSize {
228+
unescapedBytePair := getUnescapedBytesIfEscaped(linePart[currentContentIndex : currentContentIndex+2])
229+
// if we get the byte pair without the backslash, it corresponds to a byte
230+
contentByteCount += len(unescapedBytePair)
231+
232+
// content index count remains the same.
233+
currentContentIndex += 2
234+
continue
235+
}
236+
237+
if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= originalByteSize {
238+
239+
// we're at backslash
240+
241+
// index of the beginning of the next slice
242+
nextSliceIndex = currentContentIndex + 3
243+
// we're at backslash, so we need to minus 1 to get the index where the content finishes
244+
contentEndIndex = currentContentIndex - 1
245+
nextSliceFound = true
246+
break
247+
}
248+
249+
if contentByteCount > originalByteSize {
250+
return nil, fmt.Errorf("faulty serialized data: calculated byte count does not match given data size")
251+
}
252+
253+
contentByteCount++
254+
currentContentIndex++
255+
}
256+
257+
content := append([]byte{}, linePart[contentStartIndex:contentEndIndex+1]...)
258+
259+
content = replaceByPart(content, replacements)
260+
261+
contentLength := len(unescapeContent(content))
262+
263+
// and we rebuild the string
264+
rebuiltSerializedString := "s:" + strconv.Itoa(contentLength) + ":\\\"" + string(content) + "\\\";"
265+
266+
if nextSliceFound == false {
267+
return nil, fmt.Errorf("faulty serialized data: end of serialized data not found")
268+
}
269+
270+
result := SerializedReplaceResult{
271+
Pre: pre,
272+
SerializedPortion: []byte(rebuiltSerializedString),
273+
Post: linePart[nextSliceIndex:],
274+
}
275+
276+
return &result, nil
277+
}
278+
279+
// getUnescapedBytesIfEscaped interprets charPair as a possible MySQL escape
// sequence (a backslash followed by one character).
//
// If charPair is a recognised escape sequence, the single byte it stands for
// is returned in a new one-byte slice. Otherwise (no leading backslash, an
// unrecognised escape character, or fewer than two bytes) charPair is returned
// unchanged, so callers can tell the cases apart by the result's length.
func getUnescapedBytesIfEscaped(charPair []byte) []byte {
	if len(charPair) < 2 || charPair[0] != '\\' {
		return charPair
	}

	var unescaped byte
	switch charPair[1] {
	case '\\', '\'', '"':
		unescaped = charPair[1]
	case 'n':
		unescaped = '\n'
	case 'r':
		unescaped = '\r'
	case 't':
		unescaped = '\t'
	case 'b':
		unescaped = '\b'
	case 'f':
		unescaped = '\f'
	case '0':
		// NUL is a legitimate result, so the decoded value itself cannot be
		// used as a "not found" marker.
		unescaped = 0x00
	case 'Z':
		// MySQL escapes Control+Z (ASCII 26) as \Z.
		unescaped = 0x1A
	default:
		// Not a recognised escape: leave the pair as it is.
		return charPair
	}

	return []byte{unescaped}
}
310+
311+
func unescapeContent(escaped []byte) []byte {
312+
unescapedBytes := make([]byte, 0, len(escaped))
313+
index := 0
314+
315+
// only applies to content of a string - do not apply to raw mysql query
316+
// tested with php -i, mysql client, and mysqldump and mydumper.
317+
// 1. mysql translates certain bytes to `\<char>` i.e. `\n`. So these needs unescaping to get the correct byte length. See `getUnescapedBytesIfEscaped`
318+
// 2. PHP serialize does not convert raw bytes into `\<char>` - they're as-is, so we don't need to take into account of escaped value in byte length calculation.
319+
320+
backslash := byte('\\')
321+
322+
for index < len(escaped) {
323+
324+
if escaped[index] == backslash {
325+
unescapedBytePair := getUnescapedBytesIfEscaped(escaped[index : index+2])
326+
byteLength := len(unescapedBytePair)
327+
328+
if byteLength == 1 {
329+
unescapedBytes = append(unescapedBytes, unescapedBytePair...)
330+
index = index + 2
331+
continue
332+
}
333+
}
334+
335+
unescapedBytes = append(unescapedBytes, escaped[index])
336+
index++
337+
}
338+
339+
return unescapedBytes
340+
}
341+
132342
func replaceAndFix(line *[]byte, replacements []*Replacement) *[]byte {
133343
for _, replacement := range replacements {
134344
if !bytes.Contains(*line, replacement.From) {

0 commit comments

Comments
 (0)