@@ -38,6 +38,12 @@ type Replacement struct {
3838 To []byte
3939}
4040
// SerializedReplaceResult is the outcome of processing one serialized string
// found in a line. The three fields, concatenated in order, cover the whole
// input that was handed to fixLineWithSerializedData.
type SerializedReplaceResult struct {
	// Pre is everything that preceded the serialized string, with the
	// replacements already applied.
	Pre []byte
	// SerializedPortion is the rebuilt `s:<len>:\"...\";` string whose length
	// prefix reflects the content after replacement. It is empty when the
	// input contained no serialized string.
	SerializedPortion []byte
	// Post is the untouched remainder of the input that follows the
	// serialized string and still has to be processed.
	Post []byte
}
46+
4147func main () {
4248 versionFlag := flag .Bool ("version" , false , "Show version information" )
4349 flag .Parse ()
@@ -113,7 +119,7 @@ func main() {
113119
114120 go func (line * []byte ) {
115121 defer wg .Done ()
116- line = replaceAndFix (line , replacements )
122+ line = fixLine (line , replacements )
117123 ch <- * line
118124 }(& line )
119125 }
@@ -129,6 +135,210 @@ func main() {
129135 }
130136}
131137
138+ func fixLine (line * []byte , replacements []* Replacement ) * []byte {
139+ linePart := * line
140+
141+ var rebuiltLine []byte
142+
143+ for len (linePart ) > 0 {
144+ result , err := fixLineWithSerializedData (linePart , replacements )
145+ if err != nil {
146+ rebuiltLine = append (rebuiltLine , linePart ... )
147+ break
148+ }
149+ rebuiltLine = append (rebuiltLine , result .Pre ... )
150+ rebuiltLine = append (rebuiltLine , result .SerializedPortion ... )
151+ linePart = result .Post
152+ }
153+
154+ * line = rebuiltLine
155+
156+ return line
157+ }
158+
159+ func replaceByPart (part []byte , replacements []* Replacement ) []byte {
160+ for _ , replacement := range replacements {
161+ part = bytes .ReplaceAll (part , replacement .From , replacement .To )
162+ }
163+ return part
164+ }
165+
166+ var serializedStringPrefixRegexp = regexp .MustCompile (`s:(\d+):\\"` )
167+
168+ func fixLineWithSerializedData (linePart []byte , replacements []* Replacement ) (* SerializedReplaceResult , error ) {
169+
170+ // find starting point in the line
171+ // We're not checking if we found the serialized string prefix inside a quote or not.
172+ // Currently skipping that scenario because it seems unlikely to find it outside.
173+ match := serializedStringPrefixRegexp .FindSubmatchIndex (linePart )
174+ if match == nil {
175+ return & SerializedReplaceResult {
176+ Pre : replaceByPart (linePart , replacements ),
177+ SerializedPortion : []byte {},
178+ Post : []byte {},
179+ }, nil
180+ }
181+
182+ pre := append ([]byte {}, linePart [:match [0 ]]... )
183+
184+ pre = replaceByPart (pre , replacements )
185+
186+ if pre == nil {
187+ pre = []byte {}
188+ }
189+
190+ originalBytes := linePart [match [2 ]:match [3 ]]
191+
192+ originalByteSize , _ := strconv .Atoi (string (originalBytes ))
193+
194+ // the following assumes escaped double quotes
195+ // i.e. s:5:\"x -> we'll need to shift our index from '5' to 'x' - hence shifting by 3
196+ // MySQL can optionally not escape the double quote,
197+ // but generally sqldumps always include the quotes.
198+ contentStartIndex := match [3 ] + 3
199+
200+ currentContentIndex := contentStartIndex
201+
202+ contentByteCount := 0
203+
204+ contentEndIndex := 0
205+
206+ var nextSliceIndex int
207+
208+ backslash := byte ('\\' )
209+ semicolon := byte (';' )
210+ quote := byte ('"' )
211+ nextSliceFound := false
212+
213+ maxIndex := len (linePart ) - 1
214+
215+ // let's find where the content actually ends.
216+ // it should end when the unescaped value is `";`
217+ for currentContentIndex < len (linePart ) {
218+ if currentContentIndex + 2 > maxIndex {
219+
220+ // this algorithm SHOULD work, but in cases where the original byte count does not match
221+ // the actual byte count, it'll error out. We'll add this safeguard here.
222+ return nil , fmt .Errorf ("faulty serialized data: out-of-bound index access detected" )
223+ }
224+ char := linePart [currentContentIndex ]
225+ secondChar := linePart [currentContentIndex + 1 ]
226+ thirdChar := linePart [currentContentIndex + 2 ]
227+ if char == backslash && contentByteCount < originalByteSize {
228+ unescapedBytePair := getUnescapedBytesIfEscaped (linePart [currentContentIndex : currentContentIndex + 2 ])
229+ // if we get the byte pair without the backslash, it corresponds to a byte
230+ contentByteCount += len (unescapedBytePair )
231+
232+ // content index count remains the same.
233+ currentContentIndex += 2
234+ continue
235+ }
236+
237+ if char == backslash && secondChar == quote && thirdChar == semicolon && contentByteCount >= originalByteSize {
238+
239+ // we're at backslash
240+
241+ // index of the beginning of the next slice
242+ nextSliceIndex = currentContentIndex + 3
243+ // we're at backslash, so we need to minus 1 to get the index where the content finishes
244+ contentEndIndex = currentContentIndex - 1
245+ nextSliceFound = true
246+ break
247+ }
248+
249+ if contentByteCount > originalByteSize {
250+ return nil , fmt .Errorf ("faulty serialized data: calculated byte count does not match given data size" )
251+ }
252+
253+ contentByteCount ++
254+ currentContentIndex ++
255+ }
256+
257+ content := append ([]byte {}, linePart [contentStartIndex :contentEndIndex + 1 ]... )
258+
259+ content = replaceByPart (content , replacements )
260+
261+ contentLength := len (unescapeContent (content ))
262+
263+ // and we rebuild the string
264+ rebuiltSerializedString := "s:" + strconv .Itoa (contentLength ) + ":\\ \" " + string (content ) + "\\ \" ;"
265+
266+ if nextSliceFound == false {
267+ return nil , fmt .Errorf ("faulty serialized data: end of serialized data not found" )
268+ }
269+
270+ result := SerializedReplaceResult {
271+ Pre : pre ,
272+ SerializedPortion : []byte (rebuiltSerializedString ),
273+ Post : linePart [nextSliceIndex :],
274+ }
275+
276+ return & result , nil
277+ }
278+
// getUnescapedBytesIfEscaped decodes one MySQL backslash escape sequence.
//
// charPair is normally two bytes. If it starts with a backslash followed by a
// recognised escape character, a new one-byte slice holding the byte that the
// sequence stands for is returned. In every other case (no leading backslash,
// an unrecognised escape character, or fewer than two bytes) charPair is
// returned unchanged, so callers can use the length of the result to tell
// whether the pair collapsed into a single byte.
func getUnescapedBytesIfEscaped(charPair []byte) []byte {
	// Not a complete backslash pair: nothing to decode.
	if len(charPair) < 2 || charPair[0] != '\\' {
		return charPair
	}

	// A switch (rather than a map lookup compared against zero) keeps `\0`,
	// whose decoded value is the zero byte, distinguishable from "no match".
	switch charPair[1] {
	case '\\', '\'', '"':
		return []byte{charPair[1]}
	case 'n':
		return []byte{'\n'}
	case 'r':
		return []byte{'\r'}
	case 't':
		return []byte{'\t'}
	case 'b':
		return []byte{'\b'}
	case 'f':
		return []byte{'\f'}
	case '0':
		return []byte{0x00}
	case 'Z':
		// ASCII 26 (Control+Z), written as `\Z` by MySQL's string escaping.
		return []byte{0x1A}
	}

	// Not a valid escape: leave both bytes as they are.
	return charPair
}
310+
311+ func unescapeContent (escaped []byte ) []byte {
312+ unescapedBytes := make ([]byte , 0 , len (escaped ))
313+ index := 0
314+
315+ // only applies to content of a string - do not apply to raw mysql query
316+ // tested with php -i, mysql client, and mysqldump and mydumper.
317+ // 1. mysql translates certain bytes to `\<char>` i.e. `\n`. So these needs unescaping to get the correct byte length. See `getUnescapedBytesIfEscaped`
318+ // 2. PHP serialize does not convert raw bytes into `\<char>` - they're as-is, so we don't need to take into account of escaped value in byte length calculation.
319+
320+ backslash := byte ('\\' )
321+
322+ for index < len (escaped ) {
323+
324+ if escaped [index ] == backslash {
325+ unescapedBytePair := getUnescapedBytesIfEscaped (escaped [index : index + 2 ])
326+ byteLength := len (unescapedBytePair )
327+
328+ if byteLength == 1 {
329+ unescapedBytes = append (unescapedBytes , unescapedBytePair ... )
330+ index = index + 2
331+ continue
332+ }
333+ }
334+
335+ unescapedBytes = append (unescapedBytes , escaped [index ])
336+ index ++
337+ }
338+
339+ return unescapedBytes
340+ }
341+
132342func replaceAndFix (line * []byte , replacements []* Replacement ) * []byte {
133343 for _ , replacement := range replacements {
134344 if ! bytes .Contains (* line , replacement .From ) {
0 commit comments