Python Regexps by Example

Share on:

Table of Contents

Yet another collection of Python Regexp examples.

Essentials

Module Import

1import re

Search

1if re.search("Daisy","Daisy is a dachshund dog. Daisy is beautiful."):
2    print('match')
3else:
4    print('no match')
5
match

Split

1re.split(" ","Daisy is a dachshund dog. Daisy is beautiful.")
['Daisy', 'is', 'a', 'dachshund', 'dog.', 'Daisy', 'is', 'beautiful.']

Find All

1re.findall("Daisy","Daisy is a dachshund dog. Daisy is beautiful.")
['Daisy', 'Daisy']

Iteration

1for x in re.finditer("Daisy","Daisy is a dachshund dog. Daisy is beautiful."):
2    print(x.group(0) + " at " + str(x.span()))
Daisy at (0, 5)
Daisy at (26, 31)

Character Sets

Specific Characters

1re.findall("[in]","His Phone isn't 578")
['i', 'n', 'i', 'n']

Character Negation

1re.findall("[^in]","His Phone isn't 578")
['H', 's', ' ', 'P', 'h', 'o', 'e', ' ', 's', "'", 't', ' ', '5', '7', '8']

Character Ranges

1re.findall("[a-z]","His Phone isn't 578")
2
['i', 's', 'h', 'o', 'n', 'e', 'i', 's', 'n', 't']
1re.findall("[A-Z]","His Phone isn't 578")
['H', 'P']
1re.findall("[0-9]","His Phone isn't 578")
['5', '7', '8']

. Dot Character is a Literal

1re.findall("[.]","His Phone isn't 578")
[]
1re.findall("[.]","His Phone isn't 578.")
['.']

Combinations

1re.findall("[a-z][sn]","His Phone isn't 578")
['is', 'on', 'is']

Meta characters

. Any Character (Except New Line)

1re.findall(".","His Phone isn't 578")
['H',
 'i',
 's',
 ' ',
 'P',
 'h',
 'o',
 'n',
 'e',
 ' ',
 'i',
 's',
 'n',
 "'",
 't',
 ' ',
 '5',
 '7',
 '8']

\w Word Character (a-z, A-Z, 0-9, _)

1re.findall("\w","His Phone isn't 578")
['H', 'i', 's', 'P', 'h', 'o', 'n', 'e', 'i', 's', 'n', 't', '5', '7', '8']

\W Not a Word Character

1re.findall("\W","His Phone isn't 578")
[' ', ' ', "'", ' ']

\d Digits 0-9

1re.findall("\d","His Phone isn't 578")
['5', '7', '8']

\D Not a Digit

1re.findall("\D","His Phone isn't 578")
['H',
 'i',
 's',
 ' ',
 'P',
 'h',
 'o',
 'n',
 'e',
 ' ',
 'i',
 's',
 'n',
 "'",
 't',
 ' ']

\s White space (space, new line, tab)

1re.findall("\s","His Phone isn't 578")
[' ', ' ', ' ']

\S Not Whitespace

1re.findall("\S","His Phone isn't 578")
['H',
 'i',
 's',
 'P',
 'h',
 'o',
 'n',
 'e',
 'i',
 's',
 'n',
 "'",
 't',
 '5',
 '7',
 '8']

^ Beginning of a String

1# Match first word of the string
2re.findall("^\w+","His Phone isn't 578")
['His']

$ End of a String

1# Match last word of the string
2re.findall("\w+$","His Phone isn't 578")
['578']

\b Word Boundary

1# Match all words starting with 'i', up to the next whitespace
2re.findall(r"\bi\S+","His Phone isn't 578")
["isn't"]

\B Not a Word Boundary

1# Match every 'i' that is not at the beginning of a word,
2# together with the non-whitespace characters that follow it
3re.findall(r"\Bi\S+","His Phone isn't 578")
['is']

Quantifiers

? Zero or One

1re.findall("A-?B","AB A-B A--B A---B")
['AB', 'A-B']

* Zero or More

1re.findall("A-*B","AB A-B A--B A---B")
['AB', 'A-B', 'A--B', 'A---B']

+ One or More

1re.findall("A-+B","AB A-B A--B A---B")
['A-B', 'A--B', 'A---B']

{n} Exactly n

1re.findall("A-{2}B","AB A-B A--B A---B")
['A--B']

{m,n} Between m and n (inclusive)

1re.findall("A-{2,3}B","AB A-B A--B A---B")
['A--B', 'A---B']

Groups and Or

| OR Operator

1# OR operator
2for x in re.finditer("(a|b)x","ax bx cx"):
3    print(x.group(0))   
ax
bx

() Groups

1# the parentheses define groups
2for x in re.finditer("is a (\w+) (\w+)","My vehicle is a red car, her vehicle is a blue bike"):
3    print(x.group(1) + " - " + x.group(2))  
red - car
blue - bike
1# referencing groups in substitutions
2# note that the entire pattern is being substituted but
3# that specific components are selected in the substitution
4re.sub("is a (\w+)\s(\w+)",r"is a \2 whose colour is \1","My vehicle is a red car, her vehicle is a blue bike")
'My vehicle is a car whose colour is red, her vehicle is a bike whose colour is blue'

(?:) Ignore Group

1# the first group is non-capturing, so only one group is returned
2for x in re.finditer("is a (?:\w+) (\w+)","My vehicle is a red car, her vehicle is a blue bike"):
3    print(x.group(1))  
car
bike

Labeled Groups

1# Iteration with labels
2for x in re.finditer("(?P<name>Daisy)(?P<predicate>[\w ]*)(?P<dot>\.)","Daisy is a dachshund dog. Daisy is beautiful. She is 4."):
3    print(x.groupdict()['predicate'])
 is a dachshund dog
 is beautiful

Verbose Mode (Useful with complex groups)

 1# Use verbose mode
 2text="""
 323423\nDaisy is black, and beautiful. Taffy is brown, and short. \nOtto is cute, and happy."
 4"""
 5pattern="""
 6(?P<name>\w*)        # Alphanumerical word
 7(\ is \ )            # followed by 'is'
 8(?P<adj1>\w*)        # followed by alphanumerical word
 9(,\ and\ )           # followed by 'and'
10(?P<adj2>\w*)        # followed by another alphanumerical word
11"""
12
13for item in re.finditer(pattern,text,re.VERBOSE):
14    print(item.groupdict())
{'name': 'Daisy', 'adj1': 'black', 'adj2': 'beautiful'}
{'name': 'Taffy', 'adj1': 'brown', 'adj2': 'short'}
{'name': 'Otto', 'adj1': 'cute', 'adj2': 'happy'}

(?=) Look Ahead

1# Look ahead helps create user-defined, non-consuming matches 
2# 'ahead' of the expression, like '$'
3re.findall(r"\w+(?=[,!]\s?)", "One, two, three!")
['One', 'two', 'three']

(?<=) Look Behind

1# Look behind helps create user-defined, non-consuming matches
2# 'behind' the expression, like '^'
3re.findall("(?<=[Uu]n)\w+", "Undo it! It was unintentional!")
['do', 'intentional']

Look Ahead and Behind

1# Both look ahead and look behind can be combined
2re.findall("(?<=<tag>).+(?=</tag>)", "<tag>hello world</tag>")
['hello world']

Complex Examples

Domain Name Validation

1valid = "c.com b.c.com a.b.c.com "
2invalid = "c c..com b..c.com a.b..com .com"
3for x in re.finditer(r"(^|(?<=\s))\w+(\.\w+)+($|(?=\s))",valid + invalid):
4    print(x.group(0))
c.com
b.c.com
a.b.c.com