Skip to content

Commit dff5ac8

Browse files
Massively simplify the solution by improving the data structure, reducing reliance on indexes and the risk of off-by-one errors
1 parent 1652adf commit dff5ac8

File tree

1 file changed

+78
-165
lines changed

1 file changed

+78
-165
lines changed

search-replace.go

Lines changed: 78 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -39,21 +39,28 @@ type Replacement struct {
3939
}
4040

4141
type EscapedDataDetails struct {
42-
ContentStartIndex int
43-
ContentEndIndex int
44-
NextPartIndex int
45-
CurrentPartIndex int
46-
OriginalByteSize int
42+
ContentStartIndex int
43+
ContentEndIndex int
44+
NextPartIndex int
45+
CurrentPartIndex int
46+
OriginalByteSize int
47+
SerializedPartRange SerializedPartRange
4748
}
4849

49-
type SerializedContentRange struct {
50+
type SerializedPartRange struct {
5051
From int
5152
To int
5253
}
5354

55+
type SerializedReplaceResult struct {
56+
Pre []byte
57+
SerializedPortion []byte
58+
Post []byte
59+
}
60+
5461
type SerializedContentReplacement struct {
5562
FixedContent []byte
56-
SerializedContentRange []SerializedContentRange
63+
SerializedContentRange []SerializedPartRange
5764
}
5865

5966
type LinePartWithType struct {
@@ -161,201 +168,87 @@ func Debugf(format string, args ...interface{}) {
161168
}
162169

163170
func fixLine(line *[]byte, replacements []*Replacement) *[]byte {
164-
var fixedSerializedContent *SerializedContentReplacement = nil
165-
166-
if bytes.Contains(*line, []byte("s:")) {
167-
fixedSerializedContent = fixSerializedContent(line, replacements)
168-
line = &fixedSerializedContent.FixedContent
169-
}
170171

171172
Debugf("Doing global replacements: %s\n", string(*line))
172173

173-
var linePartsWithType []LinePartWithType
174-
175-
if fixedSerializedContent != nil {
176-
index := 0
177-
178-
for _, serializedContentRange := range fixedSerializedContent.SerializedContentRange {
179-
linePartsWithType = append(linePartsWithType, LinePartWithType{
180-
Content: (*line)[index:serializedContentRange.From],
181-
PhpSerialized: false,
182-
}, LinePartWithType{
183-
Content: (*line)[serializedContentRange.From : serializedContentRange.To+1],
184-
PhpSerialized: true,
185-
})
186-
187-
index = serializedContentRange.To + 1
188-
}
189-
190-
lastIndex := len(*line) - 1
191-
192-
if index <= lastIndex {
193-
linePartsWithType = append(linePartsWithType, LinePartWithType{
194-
Content: (*line)[index : lastIndex+1],
195-
PhpSerialized: false,
196-
})
197-
}
198-
} else {
199-
linePartsWithType = []LinePartWithType{
200-
{
201-
Content: *line,
202-
PhpSerialized: false,
203-
},
204-
}
205-
}
206-
207-
// Catch anything left
208-
for _, replacement := range replacements {
209-
for index, linePartWithType := range linePartsWithType {
210-
if linePartWithType.PhpSerialized == false {
211-
linePartsWithType[index].Content = bytes.ReplaceAll(linePartWithType.Content, replacement.From, replacement.To)
212-
213-
Debugf("After replacing unserialized part (from: %s | to: %s): %s\n", replacement.From, replacement.To, string(linePartsWithType[index].Content))
214-
}
215-
}
216-
}
217-
218-
rebuiltLine := bytes.Join(func() [][]byte {
219-
parts := make([][]byte, len(linePartsWithType))
220-
for i, part := range linePartsWithType {
221-
parts[i] = part.Content
222-
}
223-
return parts
224-
}(), nil)
225-
226-
*line = rebuiltLine
227-
228-
Debugf("All done: %s\n", string(*line))
229-
230-
return line
231-
}
232-
233-
func fixSerializedContent(line *[]byte, replacements []*Replacement) *SerializedContentReplacement {
234-
index := 0
174+
linePart := *line
235175

236176
var rebuiltLine []byte
237177

238-
var serializedContentRange []SerializedContentRange
239-
240-
var result SerializedContentReplacement
241-
242-
for index < len(*line) {
243-
Debugf("Start of loop, index: %d\n", index)
244-
linePart := (*line)[index:]
245-
246-
details, err := parseEscapedData(linePart)
247-
178+
for len(linePart) > 0 {
179+
result, err := fixLineWithSerializedData(linePart, replacements)
248180
if err != nil {
249-
// if right from the beginning, we couldn't find any string prefix,
250-
if err.Error() == "could not find serialized string prefix" && index == 0 {
251-
result = SerializedContentReplacement{
252-
SerializedContentRange: serializedContentRange,
253-
FixedContent: *line,
254-
}
255-
return &result
256-
}
257-
258-
// we've run out of things to parse, so just break out and append the rest
181+
Debugf("Error when trying to fix line : %s\n", err.Error())
259182
rebuiltLine = append(rebuiltLine, linePart...)
260183
break
261184
}
262-
263-
// append all the string right before the part we found the next serialized string
264-
rebuiltLine = append(rebuiltLine, (*line)[index:index+details.CurrentPartIndex]...)
265-
266-
content := linePart[details.ContentStartIndex : details.ContentEndIndex+1]
267-
268-
updatedContent := replaceInSerializedBytes(content, replacements)
269-
270-
// php needs the unescaped length, so let's unescape it and measure the length
271-
contentLength := len(unescapeContent(updatedContent))
272-
273-
// but if the content never changed, and there's an error in the original byte size, we'll let the error be for safety.
274-
if bytes.Equal(content, updatedContent) {
275-
contentLength = details.OriginalByteSize
276-
}
277-
278-
// and we rebuild the string
279-
rebuilt := "s:" + strconv.Itoa(contentLength) + ":\\\"" + string(updatedContent) + "\\\";"
280-
281-
rebuiltLine = append(rebuiltLine, []byte(rebuilt)...)
282-
serializedContentRange = append(serializedContentRange, SerializedContentRange{
283-
From: index + details.CurrentPartIndex,
284-
To: index + len(rebuilt) - 1,
285-
})
286-
287-
index = index + details.NextPartIndex
185+
rebuiltLine = append(rebuiltLine, result.Pre...)
186+
rebuiltLine = append(rebuiltLine, result.SerializedPortion...)
187+
linePart = result.Post
288188
}
289189

290-
result = SerializedContentReplacement{
291-
SerializedContentRange: serializedContentRange,
292-
FixedContent: rebuiltLine,
293-
}
190+
*line = rebuiltLine
294191

295-
return &result
192+
Debugf("All done: %s\n", string(*line))
193+
194+
return line
296195
}
297196

298-
func replaceInSerializedBytes(serialized []byte, replacements []*Replacement) []byte {
197+
func replaceByPart(part []byte, replacements []*Replacement) []byte {
299198
for _, replacement := range replacements {
300-
serialized = bytes.ReplaceAll(serialized, replacement.From, replacement.To)
199+
part = bytes.ReplaceAll(part, replacement.From, replacement.To)
301200
}
302-
return serialized
201+
return part
303202
}
304203

305204
var serializedStringPrefixRegexp = regexp.MustCompile(`s:(\d+):\\"`)
306205

307-
// Parses escaped data, returning the location details for further parsing
308-
func parseEscapedData(linePart []byte) (*EscapedDataDetails, error) {
309-
310-
details := EscapedDataDetails{
311-
ContentStartIndex: 0,
312-
ContentEndIndex: 0,
313-
NextPartIndex: 0,
314-
CurrentPartIndex: 0,
315-
OriginalByteSize: 0,
316-
}
206+
func fixLineWithSerializedData(linePart []byte, replacements []*Replacement) (*SerializedReplaceResult, error) {
317207

318208
// find starting point in the line
319209
//TODO: We should first check if we found the string when inside a quote or not.
320210
// but currently skipping that scenario because it seems unlikely to find it outside.
321211
match := serializedStringPrefixRegexp.FindSubmatchIndex(linePart)
322212
if match == nil {
323-
return nil, fmt.Errorf("could not find serialized string prefix")
213+
return &SerializedReplaceResult{
214+
Pre: replaceByPart(linePart, replacements),
215+
SerializedPortion: []byte{},
216+
Post: []byte{},
217+
}, nil
324218
}
325219

326-
matchedAt := match[0]
327-
originalBytes := linePart[match[2]:match[3]]
220+
pre := append([]byte{}, linePart[:match[0]]...)
328221

329-
details.OriginalByteSize, _ = strconv.Atoi(string(originalBytes))
222+
pre = replaceByPart(pre, replacements)
330223

331-
details.CurrentPartIndex = matchedAt
224+
if pre == nil {
225+
pre = []byte{}
226+
}
227+
228+
originalBytes := linePart[match[2]:match[3]]
229+
230+
originalByteSize, _ := strconv.Atoi(string(originalBytes))
332231

333232
// the following assumes escaped double quotes
334233
//TODO: MySQL can optionally not escape the double quote,
335234
// but generally sqldumps always include the quotes.
336-
initialContentIndex := match[3] + 3
235+
contentStartIndex := match[3] + 3
337236

338-
details.ContentStartIndex = initialContentIndex
339-
340-
currentContentIndex := initialContentIndex
237+
currentContentIndex := contentStartIndex
341238

342239
contentByteCount := 0
343240

344-
var nextPartIndex int
241+
contentEndIndex := 0
242+
243+
var nextSliceIndex int
345244

346245
backslash := byte('\\')
347246
semicolon := byte(';')
348247
quote := byte('"')
349-
nextPartFound := false
350-
351-
secondMatch := serializedStringPrefixRegexp.FindSubmatchIndex(linePart[matchedAt+1:])
248+
nextSliceFound := false
352249

353250
maxIndex := len(linePart) - 1
354251

355-
if secondMatch != nil {
356-
maxIndex = secondMatch[0] + matchedAt
357-
}
358-
359252
// let's find where the content actually ends.
360253
// it should end when the unescaped value is `";`
361254
for currentContentIndex < len(linePart) {
@@ -368,7 +261,7 @@ func parseEscapedData(linePart []byte) (*EscapedDataDetails, error) {
368261
char := linePart[currentContentIndex]
369262
secondChar := linePart[currentContentIndex+1]
370263
thirdChar := linePart[currentContentIndex+2]
371-
if char == backslash && contentByteCount < details.OriginalByteSize {
264+
if char == backslash && contentByteCount < originalByteSize {
372265
unescapedBytePair := getUnescapedBytesIfEscaped(linePart[currentContentIndex : currentContentIndex+2])
373266
// if we get the byte pair without the backslash, it corresponds to a byte
374267
contentByteCount += len(unescapedBytePair)
@@ -378,26 +271,46 @@ func parseEscapedData(linePart []byte) (*EscapedDataDetails, error) {
378271
continue
379272
}
380273

381-
if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= details.OriginalByteSize {
274+
if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= originalByteSize {
275+
276+
// we're at backslash
382277

383-
// since we've filtered out all the escaped value already, this should be the actual end
384-
nextPartIndex = currentContentIndex + 3
385-
details.NextPartIndex = nextPartIndex
278+
// index of the beginning of the next slice
279+
nextSliceIndex = currentContentIndex + 3
386280
// we're at backslash, so we need to minus 1 to get the index where the content finishes
387-
details.ContentEndIndex = currentContentIndex - 1
388-
nextPartFound = true
281+
contentEndIndex = currentContentIndex - 1
282+
nextSliceFound = true
389283
break
390284
}
391285

286+
if contentByteCount > originalByteSize {
287+
return nil, fmt.Errorf("faulty data, byte count does not match data size")
288+
}
289+
392290
contentByteCount++
393291
currentContentIndex++
394292
}
395293

396-
if nextPartFound == false {
294+
content := append([]byte{}, linePart[contentStartIndex:contentEndIndex+1]...)
295+
296+
content = replaceByPart(content, replacements)
297+
298+
contentLength := len(unescapeContent(content))
299+
300+
// and we rebuild the string
301+
rebuiltSerializedString := "s:" + strconv.Itoa(contentLength) + ":\\\"" + string(content) + "\\\";"
302+
303+
if nextSliceFound == false {
397304
return nil, fmt.Errorf("end of serialized string not found")
398305
}
399306

400-
return &details, nil
307+
result := SerializedReplaceResult{
308+
Pre: pre,
309+
SerializedPortion: []byte(rebuiltSerializedString),
310+
Post: linePart[nextSliceIndex:],
311+
}
312+
313+
return &result, nil
401314
}
402315

403316
func getUnescapedBytesIfEscaped(charPair []byte) []byte {

0 commit comments

Comments
 (0)