- 正则匹配技巧
- 使用python包re
re.match(正则表达式, 原字符串)
从最开始匹配,但只匹配一次。注意,这个从开始匹配,是非常严格的,只要表达式不能在字符串的开始匹配到子串,则返回None
re.findall(正则表达式, 原字符串)
匹配字符串中所有满足正则表达式的子串
re.sub(正则表达式, 替换成什么, 原字符串)
将正则表达式匹配到的部分替换成任意其他字符串
import re

txt = 'aaa\nasdd'

# "." stands for any single character except a newline, so the "\n" in txt
# is skipped by findall().
any_char = re.compile(r'.')

# findall() gathers every non-overlapping match into a list of strings.
findall_result = any_char.findall(txt)
# match() only tries at the very start of the string; it yields a match
# object describing what was matched there, or None if nothing matches.
match_result = any_char.match(txt)

print(findall_result, match_result, sep='\n')
['a', 'a', 'a', 'a', 's', 'd', 'd']
<_sre.SRE_Match object; span=(0, 1), match='a'>
import re

txt = 'a00daaab00aa0acb'

# "a*" means zero or more "a", so it also matches the empty string at every
# position where no "a" is present (hence the '' entries in the list).
zero_or_more_a = re.compile(r'a*')

# findall() gathers every non-overlapping match into a list of strings.
findall_result = zero_or_more_a.findall(txt)
# match() only looks at the start of the string and yields a match object.
match_result = zero_or_more_a.match(txt)

print(findall_result, match_result, sep='\n')
['a', '', '', '', 'aaa', '', '', '', 'aa', '', 'a', '', '', '']
<_sre.SRE_Match object; span=(0, 1), match='a'>
txt = 'a00daaab00aa0acb'

# ".*" greedily consumes as many non-newline characters as possible.
greedy_any = re.compile(r'.*')

# findall() returns a list: the whole string, then the empty match left at
# the end of the string.
findall_result = greedy_any.findall(txt)
# match() anchors at the start of the string and yields a match object.
match_result = greedy_any.match(txt)

print(findall_result, match_result, sep='\n')
['a00daaab00aa0acb', '']
<_sre.SRE_Match object; span=(0, 16), match='a00daaab00aa0acb'>
import re

txt = 'a00daaab00aa0acb'

# "a+" requires at least one "a", so unlike "a*" it never yields ''.
one_or_more_a = re.compile(r'a+')

# findall() gathers every run of "a" into a list.
findall_result = one_or_more_a.findall(txt)
# match() only tries at the start of the string and yields a match object.
match_result = one_or_more_a.match(txt)

print(findall_result, match_result, sep='\n')
['a', 'aaa', 'aa', 'a']
<_sre.SRE_Match object; span=(0, 1), match='a'>
import re

txt = 'a00daaab00aa0acb'

# "[...]" matches exactly one character out of the listed set.
# findall() returns a list holding every character that belongs to the set.
findall_result = re.compile(r'[ab0]').findall(txt)

# match() only inspects the first character of txt here: "a" is in [cba],
# so a match object is produced ...
match_result = re.compile(r'[cba]').match(txt)
# ... but "a" is not in [cb], so the result is None.
match_result2 = re.compile(r'[cb]').match(txt)

print(findall_result, match_result, match_result2, sep='\n')
['a', '0', '0', 'a', 'a', 'a', 'b', '0', '0', 'a', 'a', '0', 'a', 'b']
<_sre.SRE_Match object; span=(0, 1), match='a'>
None
5. ()
标识一个子表达式的范围,可用 (表达式1|表达式2) 同时进行多个表达式的匹配,匹配其中的任意一个。另外,注意表达式有 () 时,例如用 (aa)bc 对字符串 caabcnaadd 做 re.findall,整个表达式匹配到的是 aabc,但返回的只有其中的 aa(即 findall 只返回 () 里的部分)。有多个 (),则以元组的形式分别返回每个括号匹配到的值
import re

txt = 'a00daaab00aa0acb'

# With one capturing group, findall() returns only the text captured by the
# group, not the whole match ("daaa" matches, but only "da" is reported).
findall_result = re.compile(r'(da|a0)aa').findall(txt)
# With several groups, each match is reported as a tuple of the groups.
findall_result2 = re.compile(r'(da)(aa)').findall(txt)
# "da" is never followed by "ab" in txt, so the list is empty.
findall_result3 = re.compile(r'(da)(ab)').findall(txt)

# match() only tries at the start of the string; the "a0" alternative fits.
match_result = re.compile(r'(b0|a0)').match(txt)

print(findall_result, findall_result2, findall_result3, match_result, sep='\n')
['da']
[('da', 'aa')]
[]
<_sre.SRE_Match object; span=(0, 2), match='a0'>
import re

txt = 'a00d*aa\nab00aa0acb\n'

# A backslash turns the "*" metacharacter into a literal asterisk.
literal_star = re.compile(r'\*')

# findall() returns a list with every literal "*" found in txt.
findall_result = literal_star.findall(txt)
# match() starts at the beginning: ".*" runs up to the "*" on the first line
# ("." does not cross the newline), so the match object covers "a00d*".
match_result = re.compile(r'.*(\*)').match(txt)
# txt does not begin with "*", so match() yields None.
match_result2 = literal_star.match(txt)

print(findall_result, match_result, match_result2, sep='\n')
['*']
<_sre.SRE_Match object; span=(0, 5), match='a00d*'>
None
import re

txt = 'a00da*aab00aa0acb'

# ".*?" is the lazy form of ".*": it stops at the first place where the rest
# of the pattern (here "b") can match.
# findall() returns a list of the whole matches when there is no group ...
findall_result = re.compile(r'a.*?b').findall(txt)
# ... and only the captured text when the lazy part is wrapped in "()".
findall_result2 = re.compile(r'a(.*?)b').findall(txt)

# match() works from the start of the string and yields a match object.
match_result = re.compile(r'a.*?b').match(txt)
# The greedy ".*" swallows the entire string instead.
match_result2 = re.compile(r'.*').match(txt)

print(findall_result, findall_result2, match_result, match_result2, sep='\n')
['a00da*aab', 'aa0acb']
['00da*aa', 'a0ac']
<_sre.SRE_Match object; span=(0, 9), match='a00da*aab'>
<_sre.SRE_Match object; span=(0, 17), match='a00da*aab00aa0acb'>
import re

txt = 'a00da*aab00aa0acb'

# "{n}" repeats the preceding item exactly n times.
# findall() returns a list: "a" followed by exactly two zeros ...
findall_result = re.compile(r'a0{2}').findall(txt)
# ... versus "a" followed by exactly one zero.
findall_result2 = re.compile(r'a0{1}').findall(txt)

# match() only tries at the start: txt begins with a single "a", so "a{2}"
# yields None while "a{1}" yields a match object.
match_result = re.compile(r'a{2}').match(txt)
match_result2 = re.compile(r'a{1}').match(txt)

print(findall_result, findall_result2, match_result, match_result2, sep='\n')
['a00']
['a0', 'a0']
None
<_sre.SRE_Match object; span=(0, 1), match='a'>
import re

txt = 'a00da*aab00aa0acb'

# Outside of brackets, "^" anchors the pattern to the start of the string,
# so findall() can report at most the leading "a0".
findall_result = re.compile(r'^a0').findall(txt)
# Without the anchor, findall() returns a list of every "a0" in txt.
findall_result2 = re.compile(r'a0').findall(txt)

print(findall_result, findall_result2, sep='\n')
['a0']
['a0', 'a0']
import re

txt = 'a00da*aab00aa0acb'

# Inside brackets, a leading "^" negates the set: "[^0]" is any character
# other than "0". findall() returns those characters as a list.
findall_result = re.compile(r'[^0]').findall(txt)
# "[0]" is the complementary set: only the "0" characters.
findall_result2 = re.compile(r'[0]').findall(txt)

print(findall_result, findall_result2, sep='\n')
['a', 'd', 'a', '*', 'a', 'a', 'b', 'a', 'a', 'a', 'c', 'b']
['0', '0', '0', '0', '0']
import re

txt = 'a00da*aab00aa0acb'

# Capture, lazily, whatever sits between an "a" and the next "b".
between_a_and_b = re.compile(r'a(.*?)b')

# Because the pattern has one group, findall() returns a list with only the
# captured text of each match.
findall_result = between_a_and_b.findall(txt)

print(findall_result)
['00da*aa', 'a0ac']
12. **高难度用法**:(1号表达式)(2号表达式)(3号表达式),这样用 () 括号将不同正则表达式括起来,对字符串进行匹配后,可以用匹配对象的 group(1或2或3)(例如 re.search(...).group(1))单独获取到三个表达式各自匹配到的字符串。(group(0) 或者不带参数的 group() 则获取整个正则表达式匹配到的内容)
import re

txt = 'a00da*aab00aa0acb'

# Three capturing groups, numbered 1 to 3 from left to right.
three_parts = re.compile(r'(a.*a)(b.*a)(0.*b)')

# NOTE: despite its name, findall_result holds the match object returned by
# search(), not a list from findall().
findall_result = three_parts.search(txt)

# group(1, 2, 3) returns the three captured substrings as a tuple; print
# them one per line.
print(*findall_result.group(1, 2, 3), sep='\n')
a00da*aa
b00aa
0acb
import re

txt = 'a00da*aab00aa0acb'

# "(?P<name>...)" gives the group a name that can be used instead of its
# number when reading the result.
named_pattern = re.compile(r'(?P<ZhangZhe>d.*)00')

# NOTE: despite its name, findall_result holds the match object returned by
# search(), not a list from findall().
findall_result = named_pattern.search(txt)

# Look the captured text up by the group's name.
print(findall_result.group('ZhangZhe'))
da*aab
# Bring in the standard library's string of ASCII punctuation characters
# under a short alias.
from string import punctuation as en_punc
# Print the characters so the full set can be inspected.
print(en_punc)
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~