Python3中的正则

Python Python3中的正则

Python3 中使用正则表达式，需要先导入标准库的 re 模块。

常用方法：
re.match() 从目标字符串第一个字符开始匹配，第一个字符不匹配就失败
re.search() 从目标字符串中查找匹配结果，不限位置。但仅返回一个成功结果
re.findall() 查找全部匹配结果，并以列表形式返回
re.sub() 如果匹配到了，替换之
re.split() 如果匹配到了，用匹配项切分目标字符串

import re  # 导入库

# re.match()
#     Matches only at the start of the target string: the pattern has to
#     match beginning with the very first character.
#     If it does not match there, match() returns None.
#     At most one match object is returned.

# Matching a single character
#    ?  the item directly to its left is optional
#    .  any single character (except \n)
#    [] exactly one character out of the listed set
#    \d one decimal digit
#    \D one character that is not a digit
#    \s one whitespace character (space, tab, newline, ...)
#    \S one non-whitespace character
#    \w one word character: a-z, A-Z, 0-9, "_" and, in Python 3 str patterns,
#       other word characters such as Chinese characters
#    \W one character that is not a word character
hasPython = re.match(r"hello?", "hello")  # "hell" followed by an optional "o"
print(hasPython.group() if hasPython else None)  # hello
hasPython = re.match(r"hello?", "hell")  # the "o" is optional
print(hasPython.group() if hasPython else None)  # hell

hasPython = re.match(r"h.llo", "hello")  # "h", then any one character, then "llo"
print(hasPython.group() if hasPython else None)  # hello

hasPython = re.match(r"h[a-f]llo", "hello")  # second character is one of a-f
print(hasPython.group() if hasPython else None)  # hello
hasPython = re.match(r"h[a-f1-9ABCDE]llo", "hEllo")  # second character is one of a-f, 1-9 or A-E
print(hasPython.group() if hasPython else None)  # hEllo

hasPython = re.match(r"hello\d", "hello98")  # "hello" followed by exactly one digit
print(hasPython.group() if hasPython else None)  # hello9
hasPython = re.match(r"hello\D", "hello九")  # "hello" followed by one non-digit
print(hasPython.group() if hasPython else None)  # hello九

hasPython = re.match(r"he\sllo", "he llo")  # "he" and "llo" separated by one whitespace character
print(hasPython.group() if hasPython else None)  # he llo

hasPython = re.match(r"hello\w", "hello_9")  # "hello" followed by one word character (here "_")
print(hasPython.group() if hasPython else None)  # hello_


# Matching several characters
#    {3}    the item to its left must occur exactly 3 times
#    {1,3}  the item to its left must occur 1 to 3 times
#           (written without a space, as in the patterns below)
#    *      the item to its left may occur any number of times, including zero
#    +      the item to its left must occur at least once
hasPython = re.match(r"报警电话\d{3}\D", "报警电话110正确，报警电话110987错误")  # exactly three digits, then one non-digit
print(hasPython.group() if hasPython else None)  # 报警电话110正

hasPython = re.match(r"数字\d{1,5}", "数字，还有1234567890")  # "数字" followed by 1 to 5 digits
print(hasPython.group() if hasPython else None)  # None: at least one digit must follow "数字" directly
hasPython = re.match(r"数字\d{1,5}", "数字123，还有4567890")
print(hasPython.group() if hasPython else None)  # 数字123
hasPython = re.match(r"数字\d{1,5}", "数字123456，还有7890")
print(hasPython.group() if hasPython else None)  # 数字12345: six digits are present but at most 5 are consumed

hasPython = re.match(r"[A-Z]\d{1,2}-\d{3}", "D3-526是门牌号")  # one of A-Z, one or two digits, a hyphen, then three digits
print(hasPython.group() if hasPython else None)  # D3-526
hasPython = re.match(r"\w*-\d{3}", "门牌号是D3-526")  # any number of word characters before the hyphen, three digits after it
print(hasPython.group() if hasPython else None)  # 门牌号是D3-526
hasPython = re.match(r"\w+-\d{3}", "门牌号是D3-526")  # at least one word character before the hyphen
print(hasPython.group() if hasPython else None)  # 门牌号是D3-526


# Matching the start and the end
#    ^  anchors the pattern at the start of the target string;
#       re.match already begins there
#    $  anchors the pattern at the end of the target string
hasPython = re.match(r"[a-z]*$", "hello python !")  # every character up to the end must be a-z, any count
print(hasPython.group() if hasPython else None)  # None
hasPython = re.match(r"[a-zA-Z]*$", "HelloPython")  # letters only, up to the end
print(hasPython.group() if hasPython else None)  # HelloPython
hasPython = re.match(r"^[a-zA-Z]*$", "HelloPython")  # ^ may be omitted with re.match
print(hasPython.group() if hasPython else None)  # HelloPython


# Escaping
#    \  a backslash turns a metacharacter into an ordinary character
hasPython = re.match(r"[a-zA-Z0-9]{4,20}@[a-zA-Z0-9]{2,20}\.com$", "qqppbb@163.com")  # the escaped "." matches a literal dot
print(hasPython.group() if hasPython else None)  # qqppbb@163.com


# Groups
#    (|)  alternatives, separated by vertical bars inside one pair of parentheses
#    ()   parentheses also capture the part of the match they enclose
hasPython = re.match(r"我喜欢(java|python|js)", "我喜欢python")  # a single group holding three alternatives
print(hasPython.group() if hasPython else None)   # 我喜欢python
print(hasPython.group(1) if hasPython else None)  # python: group(1) returns the text captured by the first group
hasPython = re.match(r"([a-zA-Z0-9]{4,20})@([a-zA-Z0-9]{2,20})\.com$", "qqppbb@163.com")  # group 1: part before "@"; group 2: part between "@" and the dot
print(hasPython.group(1) if hasPython else None)  # qqppbb: text captured by the first group


# Sample text used by this example and the ones after it.
html = """<h1>python正则大法好</h1>
<h1>python正则大法好</h2> 【a】
<h3>python正则大法好</h3>
<p>
<b>啊，python3，正则难</b>
<b>啊，Java，正则难</b>
--.--
数字 age = 90
;--
</p>"""

# Named groups
#    (?P<name>pattern)  gives the group a name
#    (?P=name)          refers back to what the named group matched
# p is reassigned three times; only the last pattern is passed to re.match.
p = r"<h[1-6]>.+</h[1-6]>"    # would also accept the mismatched line marked 【a】: <h1>python正则大法好</h2>
p = r"<(h[1-6])>.+</\1>"      # capture the tag name once as (h[1-6]) and refer back to it by number with \1
p = r"<(?P<title>h[1-6])>.+</(?P=title)>"  # same idea, but the group is named "title" and referenced by name
hasPython = re.match(p, html, re.S)  # re.S lets "." match newlines
print(hasPython.group() if hasPython else None)  # <h1>python正则大法好</h1>


# Flags
#    re.I  case-insensitive matching
#    re.L  locale-aware matching
#    re.M  multi-line mode; changes what ^ and $ match
#    re.S  "." also matches newlines
#    re.U  Unicode matching; affects \w, \W, \b, \B
#    re.X  verbose mode: lets the pattern be laid out more readably

hasPython = re.match(r".*", html, re.S)
print(hasPython.group() if hasPython else None)  # the whole of html
hasPython = re.match(r".*", "")
print(hasPython.group() if hasPython else None)  # empty string: * allows zero occurrences, so an empty target still matches

# re.search()
#     Looks for a match at any position and returns the first one found.
hasPython = re.search(r"啊，(.+)，正则难", html)
print(hasPython.group(1) if hasPython else None)  # python3

# re.findall()
#     Looks for matches at any position and returns all of them.
#     The pattern has one group, so the list holds the captured group text.
hasPython = re.findall(r"啊，(.+)，正则难", html)
print(hasPython if hasPython else None)  # ['python3', 'Java']

# re.sub()
#     Replaces the matched text; every match is replaced.
newHtml = re.sub(r"啊，(.+)，正则难", "C++", html)  # arguments: pattern, replacement string, source string
print(newHtml)  # html, except that the two lines
                #      <b>啊，python3，正则难</b>
                #      <b>啊，Java，正则难</b>
                #   now read
                #      <b>C++</b>
                #      <b>C++</b>

def getReplace(language):
    """Return ``language`` with the suffix " NB" appended."""
    suffix = " NB"
    return language + suffix

# getReplace("C++") is evaluated before re.sub runs, so re.sub receives the
# finished string as its replacement argument, not the function.
newHtml = re.sub(r"啊，(.+)，正则难", getReplace("C++"), html)  # arguments: pattern, replacement string, source string
print(newHtml)  # ... unchanged text ...
                #     <b>C++ NB</b>
                #     <b>C++ NB</b>
                # ... unchanged text ...

def changeNumber(tmp):
    """Build the replacement text for one ``age = <digits>`` match.

    ``tmp`` is a match object whose first group holds the digits. They are
    read as an integer, incremented by one and returned as ``"age = <n>"``.
    """
    bumped = int(tmp.group(1)) + 1
    return "age = " + str(bumped)

# Here the replacement argument is the function itself; re.sub passes it the
# match object and uses its return value as the replacement text.
newHtml = re.sub(r"age = (\d+)", changeNumber, html)  # \d+ because 90 has two digits
print(newHtml)  # ... unchanged text ...
                #     数字 age = 91
                # ... unchanged text ...

# re.split()
#     Cuts the target string apart wherever the pattern matches.
lst = re.split(r"\.", "威格灵博客 http://www.gaohaiyan.com")  # split on literal dots only
print(lst)  # ['威格灵博客 http://www', 'gaohaiyan', 'com']
lst = re.split(r":|\.| ", "威格灵博客 http://www.gaohaiyan.com")  # split on colons, dots and spaces
print(lst)  # ['威格灵博客', 'http', '//www', 'gaohaiyan', 'com']
lst = re.split(r"\W{3}", "威格灵博客 http://www.gaohaiyan.com")  # split on three consecutive non-word characters ("://")
print(lst)  # ['威格灵博客 http', 'www.gaohaiyan.com']

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

import re # 导入库

# re.match()
#     Matches only at the start of the target string: the pattern has to
#     match beginning with the very first character.
#     If it does not match there, match() returns None.
#     At most one match object is returned.

# Matching a single character
#    ?  the item directly to its left is optional
#    .  any single character (except \n)
#    [] exactly one character out of the listed set
#    \d one decimal digit
#    \D one character that is not a digit
#    \s one whitespace character (space, tab, newline, ...)
#    \S one non-whitespace character
#    \w one word character: a-z, A-Z, 0-9, "_" and, in Python 3 str patterns,
#       other word characters such as Chinese characters
#    \W one character that is not a word character
hasPython = re.match(r"hello?", "hello")  # "hell" followed by an optional "o"
print(hasPython.group() if hasPython else None)  # hello
hasPython = re.match(r"hello?", "hell")  # the "o" is optional
print(hasPython.group() if hasPython else None)  # hell

hasPython = re.match(r"h.llo", "hello")  # "h", then any one character, then "llo"
print(hasPython.group() if hasPython else None)  # hello

hasPython = re.match(r"h[a-f]llo", "hello")  # second character is one of a-f
print(hasPython.group() if hasPython else None)  # hello
hasPython = re.match(r"h[a-f1-9ABCDE]llo", "hEllo")  # second character is one of a-f, 1-9 or A-E
print(hasPython.group() if hasPython else None)  # hEllo

hasPython = re.match(r"hello\d", "hello98")  # "hello" followed by exactly one digit
print(hasPython.group() if hasPython else None)  # hello9
hasPython = re.match(r"hello\D", "hello九")  # "hello" followed by one non-digit
print(hasPython.group() if hasPython else None)  # hello九

hasPython = re.match(r"he\sllo", "he llo")  # "he" and "llo" separated by one whitespace character
print(hasPython.group() if hasPython else None)  # he llo

hasPython = re.match(r"hello\w", "hello_9")  # "hello" followed by one word character (here "_")
print(hasPython.group() if hasPython else None)  # hello_

# Matching several characters
#    {3}    the item to its left must occur exactly 3 times
#    {1,3}  the item to its left must occur 1 to 3 times
#           (written without a space, as in the patterns below)
#    *      the item to its left may occur any number of times, including zero
#    +      the item to its left must occur at least once
hasPython = re.match(r"报警电话\d{3}\D", "报警电话110正确，报警电话110987错误")  # exactly three digits, then one non-digit
print(hasPython.group() if hasPython else None)  # 报警电话110正

hasPython = re.match(r"数字\d{1,5}", "数字，还有1234567890")  # "数字" followed by 1 to 5 digits
print(hasPython.group() if hasPython else None)  # None: at least one digit must follow "数字" directly
hasPython = re.match(r"数字\d{1,5}", "数字123，还有4567890")
print(hasPython.group() if hasPython else None)  # 数字123
hasPython = re.match(r"数字\d{1,5}", "数字123456，还有7890")
print(hasPython.group() if hasPython else None)  # 数字12345: six digits are present but at most 5 are consumed

hasPython = re.match(r"[A-Z]\d{1,2}-\d{3}", "D3-526是门牌号")  # one of A-Z, one or two digits, a hyphen, then three digits
print(hasPython.group() if hasPython else None)  # D3-526
hasPython = re.match(r"\w*-\d{3}", "门牌号是D3-526")  # any number of word characters before the hyphen, three digits after it
print(hasPython.group() if hasPython else None)  # 门牌号是D3-526
hasPython = re.match(r"\w+-\d{3}", "门牌号是D3-526")  # at least one word character before the hyphen
print(hasPython.group() if hasPython else None)  # 门牌号是D3-526

# Matching the start and the end
#    ^  anchors the pattern at the start of the target string;
#       re.match already begins there
#    $  anchors the pattern at the end of the target string
hasPython = re.match(r"[a-z]*$", "hello python !")  # every character up to the end must be a-z, any count
print(hasPython.group() if hasPython else None)  # None
hasPython = re.match(r"[a-zA-Z]*$", "HelloPython")  # letters only, up to the end
print(hasPython.group() if hasPython else None)  # HelloPython
hasPython = re.match(r"^[a-zA-Z]*$", "HelloPython")  # ^ may be omitted with re.match
print(hasPython.group() if hasPython else None)  # HelloPython

# Escaping
#    \  a backslash turns a metacharacter into an ordinary character
hasPython = re.match(r"[a-zA-Z0-9]{4,20}@[a-zA-Z0-9]{2,20}\.com$", "qqppbb@163.com")  # the escaped "." matches a literal dot
print(hasPython.group() if hasPython else None)  # qqppbb@163.com

# Groups
#    (|)  alternatives, separated by vertical bars inside one pair of parentheses
#    ()   parentheses also capture the part of the match they enclose
hasPython = re.match(r"我喜欢(java|python|js)", "我喜欢python")  # a single group holding three alternatives
print(hasPython.group() if hasPython else None)  # 我喜欢python
print(hasPython.group(1) if hasPython else None)  # python: group(1) returns the text captured by the first group
hasPython = re.match(r"([a-zA-Z0-9]{4,20})@([a-zA-Z0-9]{2,20})\.com$", "qqppbb@163.com")  # group 1: part before "@"; group 2: part between "@" and the dot
print(hasPython.group(1) if hasPython else None)  # qqppbb: text captured by the first group

# Sample text used by the examples below. It must contain both <b>...</b>
# lines and no padding blank lines: the findall and sub examples document
# two matches.
html = """<h1>python正则大法好</h1>
<h1>python正则大法好</h2> 【a】
<h3>python正则大法好</h3>
<p>
<b>啊，python3，正则难</b>
<b>啊，Java，正则难</b>
--.--
数字 age = 90
;--
</p>"""

# Named groups
#    (?P<name>pattern)  gives the group a name
#    (?P=name)          refers back to what the named group matched
# p is reassigned three times; only the last pattern is passed to re.match.
p = r"<h[1-6]>.+</h[1-6]>"  # would also accept the mismatched line marked 【a】: <h1>python正则大法好</h2>
p = r"<(h[1-6])>.+</\1>"  # capture the tag name once as (h[1-6]) and refer back to it by number with \1
p = r"<(?P<title>h[1-6])>.+</(?P=title)>"  # same idea, but the group is named "title" and referenced by name
hasPython = re.match(p, html, re.S)  # re.S lets "." match newlines
print(hasPython.group() if hasPython else None)  # <h1>python正则大法好</h1>

# Flags
#    re.I  case-insensitive matching
#    re.L  locale-aware matching
#    re.M  multi-line mode; changes what ^ and $ match
#    re.S  "." also matches newlines
#    re.U  Unicode matching; affects \w, \W, \b, \B
#    re.X  verbose mode: lets the pattern be laid out more readably
hasPython = re.match(r".*", html, re.S)
print(hasPython.group() if hasPython else None)  # the whole of html
hasPython = re.match(r".*", "")
print(hasPython.group() if hasPython else None)  # empty string: * allows zero occurrences, so an empty target still matches

# re.search()
#     Looks for a match at any position and returns the first one found.
hasPython = re.search(r"啊，(.+)，正则难", html)
print(hasPython.group(1) if hasPython else None)  # python3

# re.findall()
#     Looks for matches at any position and returns all of them.
#     The pattern has one group, so the list holds the captured group text.
hasPython = re.findall(r"啊，(.+)，正则难", html)
print(hasPython if hasPython else None)  # ['python3', 'Java']

# re.sub()
#     Replaces the matched text; every match is replaced.
newHtml = re.sub(r"啊，(.+)，正则难", "C++", html)  # arguments: pattern, replacement string, source string
print(newHtml)  # html, except that the two lines
                #      <b>啊，python3，正则难</b>
                #      <b>啊，Java，正则难</b>
                #   now read
                #      <b>C++</b>
                #      <b>C++</b>

def getReplace(language):
    """Return ``language`` with the suffix " NB" appended.

    The return statement has to be indented under the ``def`` line;
    otherwise the module fails to parse.
    """
    return language + " NB"

# getReplace("C++") is evaluated before re.sub runs, so re.sub receives the
# finished string as its replacement argument, not the function.
newHtml = re.sub(r"啊，(.+)，正则难", getReplace("C++"), html)  # arguments: pattern, replacement string, source string
print(newHtml)  # html with every matched span replaced, e.g.
                #     <b>C++ NB</b>
                # the rest of the text is unchanged

def changeNumber(tmp):
    """Build the replacement text for one ``age = <digits>`` match.

    ``tmp`` is a match object whose first group holds the digits. The
    function returns ``"age = <n>"`` with the number incremented by one.
    The body has to be indented under the ``def`` line; otherwise the
    module fails to parse.
    """
    age = tmp.group(1)  # only the captured group, i.e. the digits themselves
    age = int(age) + 1
    return "age = " + str(age)

# Here the replacement argument is the function itself; re.sub passes it the
# match object and uses its return value as the replacement text.
newHtml = re.sub(r"age = (\d+)", changeNumber, html)  # \d+ because 90 has two digits
print(newHtml)  # ... unchanged text ...
                #     数字 age = 91
                # ... unchanged text ...

# re.split()
#     Cuts the target string apart wherever the pattern matches.
lst = re.split(r"\.", "威格灵博客 http://www.gaohaiyan.com")  # split on literal dots only
print(lst)  # ['威格灵博客 http://www', 'gaohaiyan', 'com']
lst = re.split(r":|\.| ", "威格灵博客 http://www.gaohaiyan.com")  # split on colons, dots and spaces
print(lst)  # ['威格灵博客', 'http', '//www', 'gaohaiyan', 'com']
lst = re.split(r"\W{3}", "威格灵博客 http://www.gaohaiyan.com")  # split on three consecutive non-word characters ("://")
print(lst)  # ['威格灵博客 http', 'www.gaohaiyan.com']

-end

声明

本文由 cuiweiyou 原创，转载请注明出处：http://www.gaohaiyan.com/2665.html

承接App定制、企业web站点、办公系统软件设计开发，外包项目，毕设

Android自定义组合控件
Matplotlib可视化示例代码-2-绘制子图
Android控件样式style和界面/应用主题theme
threejs_r132.光源示例