Python Regexps by Example

Jan 9, 2022 data coding python

Yet another collection of Python Regexp examples.

Essentials

Module Import

1import re
2import pandas as pd

Search

1if re.search("Daisy","Daisy is a dachshund dog. Daisy is beautiful."):
2    print('match')
3else:
4    print('no match')

match

Split

1re.split(" ","Daisy is a dachshund dog. Daisy is beautiful.")

['Daisy', 'is', 'a', 'dachshund', 'dog.', 'Daisy', 'is', 'beautiful.']

Find All

1re.findall("Daisy","Daisy is a dachshund dog. Daisy is beautiful.")

['Daisy', 'Daisy']

Iteration

1for x in re.finditer("Daisy","Daisy is a dachshund dog. Daisy is beautiful."):
2    print(x.group(0) + " at " + str(x.span()))

Daisy at (0, 5)
Daisy at (26, 31)

Character Sets

Specific Characters

1re.findall("[in]","His Phone isn't 578")

['i', 'n', 'i', 'n']

Character Negation

1re.findall("[^in]","His Phone isn't 578")

['H', 's', ' ', 'P', 'h', 'o', 'e', ' ', 's', "'", 't', ' ', '5', '7', '8']

Character Ranges

1re.findall("[a-z]","His Phone isn't 578")

['i', 's', 'h', 'o', 'n', 'e', 'i', 's', 'n', 't']

1re.findall("[A-Z]","His Phone isn't 578")

['H', 'P']

1re.findall("[0-9]","His Phone isn't 578")

['5', '7', '8']

. Dot Character is a Literal

1re.findall("[.]","His Phone isn't 578")

[]

1re.findall("[.]","His Phone isn't 578.")

['.']

Combinations

1re.findall("[a-z][sn]","His Phone isn't 578")

['is', 'on', 'is']

Meta characters

. Any Character (Except New Line)

1re.findall(".","His Phone isn't 578")

['H',
 'i',
 's',
 ' ',
 'P',
 'h',
 'o',
 'n',
 'e',
 ' ',
 'i',
 's',
 'n',
 "'",
 't',
 ' ',
 '5',
 '7',
 '8']

\w Word Character (a-z, A-Z, 0-9, _)

1re.findall("\w","His Phone isn't 578")

['H', 'i', 's', 'P', 'h', 'o', 'n', 'e', 'i', 's', 'n', 't', '5', '7', '8']

\W Not a Word Character

1re.findall("\W","His Phone isn't 578")

[' ', ' ', "'", ' ']

\d Digits 0-9

1re.findall("\d","His Phone isn't 578")

['5', '7', '8']

\D Not a Digit

1re.findall("\D","His Phone isn't 578")

['H',
 'i',
 's',
 ' ',
 'P',
 'h',
 'o',
 'n',
 'e',
 ' ',
 'i',
 's',
 'n',
 "'",
 't',
 ' ']

\s White space (space, new line, tab)

1re.findall("\s","His Phone isn't 578")

[' ', ' ', ' ']

\S Not Whitespace

1re.findall("\S","His Phone isn't 578")

['H',
 'i',
 's',
 'P',
 'h',
 'o',
 'n',
 'e',
 'i',
 's',
 'n',
 "'",
 't',
 '5',
 '7',
 '8']

^ Beginning of a String

1# Match first word of the string
2re.findall("^\w+","His Phone isn't 578")

['His']

$ End of a String

1# Match last word of the string
2re.findall("\w+$","His Phone isn't 578")

['578']

\b Word Boundary

1# Match all words starting with 'i', continuing until whitespace is found
2re.findall(r"\bi\S+","His Phone isn't 578")

["isn't"]

\B Not a Word Boundary

1# Match an 'i' that is not at the beginning of a word,
2# continuing until whitespace is found
3re.findall(r"\Bi\S+","His Phone isn't 578")

['is']

Quantifiers

? Zero or One

1re.findall("A-?B","AB A-B A--B A---B")

['AB', 'A-B']

* Zero or More

1re.findall("A-*B","AB A-B A--B A---B")

['AB', 'A-B', 'A--B', 'A---B']

+ One or More

1re.findall("A-+B","AB A-B A--B A---B")

['A-B', 'A--B', 'A---B']

{n} Exactly n

1re.findall("A-{2}B","AB A-B A--B A---B")

['A--B']

{m,n} Between m and n (inclusive)

1re.findall("A-{2,3}B","AB A-B A--B A---B")

['A--B', 'A---B']

Groups and Or

| OR Operator

1# OR operator
2for x in re.finditer("(a|b)x","ax bx cx"):
3    print(x.group(0))

ax
bx

() Groups

1# the parentheses define groups
2for x in re.finditer("is a (\w+) (\w+)","My vehicle is a red car, her vehicle is a blue bike"):
3    print(x.group(1) + " - " + x.group(2))

red - car
blue - bike

1# referencing groups in substitutions
2# note that the entire pattern is being substituted but
3# that specific components are selected in the substitution
4re.sub("is a (\w+)\s(\w+)",r"is a \2 whose colour is \1","My vehicle is a red car, her vehicle is a blue bike")

'My vehicle is a car whose colour is red, her vehicle is a bike whose colour is blue'

(?:) Ignore Group

1# the first group is matched but not captured. Only one group is returned
2for x in re.finditer("is a (?:\w+) (\w+)","My vehicle is a red car, her vehicle is a blue bike"):
3    print(x.group(1))

car
bike

Labeled Groups

1# Iteration with labels
2for x in re.finditer("(?P<name>Daisy)(?P<predicate>[\w ]*)(?P<dot>\.)","Daisy is a dachshund dog. Daisy is beautiful. She is 4."):
3    print(x.groupdict()['predicate'])

 is a dachshund dog
 is beautiful

Verbose Mode (Useful with complex groups)

 1# Use verbose mode
 2text="""
 323423\nDaisy is black, and beautiful. Taffy is brown, and short. \nOtto is cute, and happy."
 4"""
 5pattern="""
 6(?P<name>\w*)        # Alphanumerical word
 7(\ is \ )            # followed by 'is'
 8(?P<adj1>\w*)        # followed by alphanumerical word
 9(,\ and\ )           # followed by 'and'
10(?P<adj2>\w*)        # followed by another alphanumerical word
11"""
12
13for item in re.finditer(pattern,text,re.VERBOSE):
14    print(item.groupdict())

{'name': 'Daisy', 'adj1': 'black', 'adj2': 'beautiful'}
{'name': 'Taffy', 'adj1': 'brown', 'adj2': 'short'}
{'name': 'Otto', 'adj1': 'cute', 'adj2': 'happy'}

(?=) Look Ahead

1# Look ahead helps create user-defined, non-consuming matches 
2# 'ahead' of the expression, like '$'
3re.findall(r"\w+(?=[,!]\s?)", "One, two, three!")

['One', 'two', 'three']

(?<=) Look Behind

1# Look behind helps create user-defined, non-consuming matches 
2# 'behind' the expression, like '^'
3re.findall("(?<=[Uu]n)\w+", "Undo it! It was unintentional!")

['do', 'intentional']

Look Ahead and Behind

1# Both look ahead and look behind can be combined
2re.findall("(?<=<tag>).+(?=</tag>)", "<tag>hello world</tag>")

['hello world']

Complex Examples

URL validation

1valid = "c.com b.c.com a.b.c.com "
2invalid = "c c..com b..c.com a.b..com .com"
3for x in re.finditer(r"(^|(?<=\s))\w+(\.\w+)+($|(?=\s))",valid + invalid):
4    print(x.group(0))

c.com
b.c.com
a.b.c.com

Pandas Integration

1df = pd.DataFrame(data=["01-02-2009 First Entry",
2                        "05-12-2015 Second Entry",
3                        "30-07-2022 Third Entry"],
4                        columns=["entries"])
5df

	entries
0	01-02-2009 First Entry
1	05-12-2015 Second Entry
2	30-07-2022 Third Entry

1pattern = r'(?P<day>\d{1,2})[/-](?P<month>\d{1,2})[/-](?P<year>\d{2,4})'
2df['entries'].str.extract(pattern)

	day	month	year
0	01	02	2009
1	05	12	2015
2	30	07	2022

Python Regexps by Example

Table of Contents

Essentials

Module Import

Search

Split

Find All

Iteration

Character Sets

Specific Characters

Character Negation

Character Ranges

. Dot Character is a Literal

Combinations

Meta characters

. Any Character (Except New Line)

\w Word Character (a-z, A-Z, 0-9, _)

\W Not a Word Character

\d Digits 0-9

\D Not a Digit

\s White space (space, new line, tab)

\S Not Whitespace

^ Beginning of a String

$ End of a String

\b Word Boundary

\B Not a Word Boundary

Quantifiers

? Zero or One

* Zero or More

+ One or More

{n} Exactly n

{m,n} Between m and n (inclusive)

Groups and Or

| OR Operator

() Groups

(?:) Ignore Group

Labeled Groups

Verbose Mode (Useful with complex groups)

(?=) Look Ahead

(?<=) Look Behind

Look Ahead and Behind

Complex Examples

URL validation

Pandas Integration