Python Regexps by Example

Table of Contents
Yet another collection of Python Regexp examples.
Essentials
Module Import
1import re
2import pandas as pd
Search
1if re.search("Daisy","Daisy is a dachshund dog. Daisy is beautiful."):
2 print('match')
3else:
4 print('no match')
match
Split
1re.split(" ","Daisy is a dachshund dog. Daisy is beautiful.")
['Daisy', 'is', 'a', 'dachshund', 'dog.', 'Daisy', 'is', 'beautiful.']
Find All
1re.findall("Daisy","Daisy is a dachshund dog. Daisy is beautiful.")
['Daisy', 'Daisy']
Iteration
1for x in re.finditer("Daisy","Daisy is a dachshund dog. Daisy is beautiful."):
2 print(x.group(0) + " at " + str(x.span()))
Daisy at (0, 5)
Daisy at (26, 31)
Character Sets
Specific Characters
1re.findall("[in]","His Phone isn't 578")
['i', 'n', 'i', 'n']
Character Negation
1re.findall("[^in]","His Phone isn't 578")
['H', 's', ' ', 'P', 'h', 'o', 'e', ' ', 's', "'", 't', ' ', '5', '7', '8']
Character Ranges
1re.findall("[a-z]","His Phone isn't 578")
['i', 's', 'h', 'o', 'n', 'e', 'i', 's', 'n', 't']
1re.findall("[A-Z]","His Phone isn't 578")
['H', 'P']
1re.findall("[0-9]","His Phone isn't 578")
['5', '7', '8']
. Dot Character is a Literal (Inside a Character Set)
1re.findall("[.]","His Phone isn't 578")
[]
1re.findall("[.]","His Phone isn't 578.")
['.']
Combinations
1re.findall("[a-z][sn]","His Phone isn't 578")
['is', 'on', 'is']
Meta characters
. Any Character (Except New Line)
1re.findall(".","His Phone isn't 578")
['H',
'i',
's',
' ',
'P',
'h',
'o',
'n',
'e',
' ',
'i',
's',
'n',
"'",
't',
' ',
'5',
'7',
'8']
\w Word Character (a-z, A-Z, 0-9, _)
1re.findall("\w","His Phone isn't 578")
['H', 'i', 's', 'P', 'h', 'o', 'n', 'e', 'i', 's', 'n', 't', '5', '7', '8']
\W Not a Word Character
1re.findall("\W","His Phone isn't 578")
[' ', ' ', "'", ' ']
\d Digits 0-9
1re.findall("\d","His Phone isn't 578")
['5', '7', '8']
\D Not a Digit
1re.findall("\D","His Phone isn't 578")
['H',
'i',
's',
' ',
'P',
'h',
'o',
'n',
'e',
' ',
'i',
's',
'n',
"'",
't',
' ']
\s White space (space, new line, tab)
1re.findall("\s","His Phone isn't 578")
[' ', ' ', ' ']
\S Not Whitespace
1re.findall("\S","His Phone isn't 578")
['H',
'i',
's',
'P',
'h',
'o',
'n',
'e',
'i',
's',
'n',
"'",
't',
'5',
'7',
'8']
^ Beginning of a String
1# Match first word of the string
2re.findall("^\w+","His Phone isn't 578")
['His']
$ End of a String
1# Match last word of the string
2re.findall("\w+$","His Phone isn't 578")
['578']
\b Word Boundary
1# Match every word starting with 'i', up to the next whitespace character
2re.findall(r"\bi\S+","His Phone isn't 578")
["isn't"]
\B Not a Word Boundary
1# Match an 'i' that is not at the beginning of a word,
2# together with the non-whitespace characters that follow it
3re.findall(r"\Bi\S+","His Phone isn't 578")
['is']
Quantifiers
? Zero or One
1re.findall("A-?B","AB A-B A--B A---B")
['AB', 'A-B']
* Zero or More
1re.findall("A-*B","AB A-B A--B A---B")
['AB', 'A-B', 'A--B', 'A---B']
+ One or More
1re.findall("A-+B","AB A-B A--B A---B")
['A-B', 'A--B', 'A---B']
{n} Exactly n
1re.findall("A-{2}B","AB A-B A--B A---B")
['A--B']
{m,n} Between m and n (inclusive)
1re.findall("A-{2,3}B","AB A-B A--B A---B")
['A--B', 'A---B']
Groups and Or
| OR Operator
1# OR operator
2for x in re.finditer("(a|b)x","ax bx cx"):
3 print(x.group(0))
ax
bx
() Groups
1# the parentheses define groups
2for x in re.finditer("is a (\w+) (\w+)","My vehicle is a red car, her vehicle is a blue bike"):
3 print(x.group(1) + " - " + x.group(2))
red - car
blue - bike
1# referencing groups in substitutions
2# note that the entire pattern is being substituted but
3# that specific components are selected in the substitution
4re.sub("is a (\w+)\s(\w+)",r"is a \2 whose colour is \1","My vehicle is a red car, her vehicle is a blue bike")
'My vehicle is a car whose colour is red, her vehicle is a bike whose colour is blue'
(?:) Ignore Group
1# the first group is non-capturing (it must still match), so only one group is returned
2for x in re.finditer("is a (?:\w+) (\w+)","My vehicle is a red car, her vehicle is a blue bike"):
3 print(x.group(1))
car
bike
Labeled Groups
1# Iteration with labels
2for x in re.finditer("(?P<name>Daisy)(?P<predicate>[\w ]*)(?P<dot>\.)","Daisy is a dachshund dog. Daisy is beautiful. She is 4."):
3 print(x.groupdict()['predicate'])
is a dachshund dog
is beautiful
Verbose Mode (Useful with complex groups)
1# Use verbose mode
2text="""
323423\nDaisy is black, and beautiful. Taffy is brown, and short. \nOtto is cute, and happy."
4"""
5pattern="""
6(?P<name>\w*) # Alphanumerical word
7(\ is \ ) # followed by 'is'
8(?P<adj1>\w*) # followed by alphanumerical word
9(,\ and\ ) # followed by 'and'
10(?P<adj2>\w*) # followed by another alphanumerical word
11"""
12
13for item in re.finditer(pattern,text,re.VERBOSE):
14 print(item.groupdict())
{'name': 'Daisy', 'adj1': 'black', 'adj2': 'beautiful'}
{'name': 'Taffy', 'adj1': 'brown', 'adj2': 'short'}
{'name': 'Otto', 'adj1': 'cute', 'adj2': 'happy'}
(?=) Look Ahead
1# Look ahead helps create user-defined, non-consuming matches
2# 'ahead' of the expression, like '$'
3re.findall(r"\w+(?=[,!]\s?)", "One, two, three!")
['One', 'two', 'three']
(?<=) Look Behind
1# Look behind helps create user-defined non-consuming matches
2# 'behind' of the expression, like '^'
3re.findall("(?<=[Uu]n)\w+", "Undo it! It was unintentional!")
['do', 'intentional']
Look Ahead and Behind
1# Both look ahead and look behind can be combined
2re.findall("(?<=<tag>).+(?=</tag>)", "<tag>hello world</tag>")
['hello world']
Complex Examples
URL validation
1valid = "c.com b.c.com a.b.c.com "
2invalid = "c c..com b..c.com a.b..com .com"
3for x in re.finditer(r"(^|(?<=\s))\w+(\.\w+)+($|(?=\s))",valid + invalid):
4 print(x.group(0))
c.com
b.c.com
a.b.c.com
Pandas Integration
1df = pd.DataFrame(data=["01-02-2009 First Entry",
2 "05-12-2015 Second Entry",
3 "30-07-2022 Third Entry"],
4 columns=["entries"])
5df
| | entries |
---|---|
0 | 01-02-2009 First Entry |
1 | 05-12-2015 Second Entry |
2 | 30-07-2022 Third Entry |
1pattern = r'(?P<day>\d{1,2})[/-](?P<month>\d{1,2})[/-](?P<year>\d{2,4})'
2df['entries'].str.extract(pattern)
| | day | month | year |
---|---|---|---|
0 | 01 | 02 | 2009 |
1 | 05 | 12 | 2015 |
2 | 30 | 07 | 2022 |
Before You Leave
🤘 Subscribe to my 100% spam-free newsletter!