- Sat 03 February 2018
- python
Python Regular Expression snippets¶
A small collection of snippets showing how to use regular expressions in Python.
In [111]:
# Load regex package
import re
building a RegexObject¶
Compiling a pattern into a regex object with re.compile can save running time when the same pattern is reused many times, for example inside a class.
In [112]:
# First regex object: a literal "123", compiled with the default flags.
pattern = r'123'
reg1 = re.compile(pattern)

# Second regex object: "abc", case-insensitive and in verbose mode.
# `pattern` is rebound here, so it holds 'abc' once this cell has run.
pattern = 'abc'
reg2 = re.compile(pattern, flags=re.IGNORECASE | re.VERBOSE)
Flags¶
flag | meaning |
---|---|
re.I | (ignore case), |
re.M | (multi-line), |
re.S | (dot matches all including \n), |
re.L | (locale dependent) makes \w, \W, \b, \B and case-insensitive matching depend on the current locale; usable only with bytes patterns |
re.A | (ASCII-only matching) |
re.U | (Unicode matching) |
re.X | (verbose) |
Search vs. Match¶
match: find something at the beginning of the string and return a match object.
search: find something anywhere in the string and return a match object.
So if you need to match at the beginning of the string, or to match the entire string use match. It is faster. Otherwise use search.
Note that search returns only the first result; if you need all of them, use finditer and iterate over the matches.
If you don't care about the match object (e.g. spans) and want only the matched strings, you can use findall.
fullmatch is also available; it succeeds only when the whole string, from start to end, matches the pattern.
In [113]:
# Sample text: the unit "123abc" repeated three times.
test = '123abc123abc123abc'

# match() only tries the very start of the text; search() scans forward and
# stops at the first hit. Results are collected first, then printed in order.
checks = [
    ('reg1 match', reg1.match(test)),
    ('reg1 search', reg1.search(test)),
    ('reg2 match', reg2.match(test)),  # no match
    ('reg2 search', reg2.search(test)),
]
for label, result in checks:
    print(label, result)

print('-' * 10)
# finditer() yields one match object per non-overlapping occurrence.
for match in reg1.finditer(test):  # Returns all matches
    print(match)

print('-' * 10)
# findall() returns just the matched text.
print(reg1.findall(test))  # list of strings
patterns¶
. Match any char except newline
^ Match start of the string (with the re.M flag it also matches at the start of each line when using search; match still only tries the beginning of the string)
$ Match end of the string, or just before a newline at the end of the string (with re.M, also at the end of each line)
\ escape special char; inside a pattern it is needed for . ^ $ * + ? { } [ ] | ( ) \ and inside a [ ] it is needed for ^ - ] \ . You can also use re.escape('test.') to escape a whole string.
[ ] Enclose a set of matchable chars
[^ ] negate a set of matchable chars
[0-9A-Fa-f] a set ranges
R|S Match either regex R or regex S.
() Create capture group
(?P<name>...) capturing group with a name
(?:...) non-capturing group
(?#...) comment
(?!...) negative lookahead
Quantifiers
{m} Exactly m repetitions
{m,n} From m (default 0) to n (default infinity)
* 0 or more. Same as {,}
+ 1 or more. Same as {1,}
? 0 or 1. Same as {,1}
Special sequences
make sure you use "\\b" or r'\b' (a plain "\b" is the backspace character, not a word boundary)
\A Start of string
\b Match empty string at word (\w+) boundary
\B Match empty string not at word boundary
\d Digit
\D Non-digit
\s Whitespace [ \t\n\r\f\v], see LOCALE,UNICODE
\S Non-whitespace
\w Alphanumeric: [0-9a-zA-Z_], see LOCALE
\W Non-alphanumeric
\Z End of string
In [114]:
# Capturing groups
# `test` is reused as a loop variable by later cells, so the sample text is
# set again here to keep this cell re-runnable on its own.
test = '123abc123abc123abc'

## get by name
# (?P<name>...) captures like (...) but the group is also reachable by name.
p1 = re.compile(r'(23)(?P<name>abc)')
m = p1.search(test)
print('group:', m.group('name'))
print(m.groupdict())  # returns dictionary of the named groups
print('-' * 10)

# access to numbered groups
line = "Cats are smarter than dogs"
matchObj = re.match(r'(.*) are (.*?) .*', line, re.M | re.I)
if matchObj:
    # the whole match not just capturing groups same as group(0)
    print("matchObj.group() : ", matchObj.group())
    print("matchObj.group span: ", matchObj.span())
    # capturing group 1
    print("matchObj.group(1) : ", matchObj.group(1))
    # capturing group 2
    print("matchObj.group(2) : ", matchObj.group(2))
else:
    print("No match!!")

# In a replacement string, \1, \2, ... reference numbered groups and
# \g<name> references a named group. Raw strings keep the backslashes intact.
print(p1.sub(r'(\1)', test))
print(p1.sub(r'XX\g<name>YY', test))
# re.subn performs the same operation as sub(),
# but returns a tuple (new_string, number_of_subs_made).
In [115]:
# Splitting by regex: every "<digit>." marker acts as a delimiter.
# The text starts with a delimiter, so the first element of the result is ''.
mystring = '1. First part 2. Second part 3. Third part'
re.compile(r'\d\.').split(mystring)
Out[115]:
Examples¶
In [116]:
# detect numbers
# Raw string: \d and \. must reach the regex engine unchanged; in a plain
# string literal they are invalid escape sequences and trigger a warning.
pattern = r'[1-9](?:\d{0,2})(?:,\d{3})*(?:\.\d*[1-9])?|0?\.\d*[1-9]|0'
tests = ['0', '1', '99', '800', '4,002', '4002', '2.2', '0.0']
for test in tests:
    found = re.match(pattern, test)
    # Show the text that was actually matched. match() may cover only a prefix
    # of the input (e.g. '400' of '4002'); printing `.string` would hide that,
    # because it is always the full input.
    print(test, '->', found.group() if found else None)
In [117]:
# detect emails
# A single capture group wraps the whole address: local part, "@", and a
# domain that must contain at least one dot.
pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-.]+\.[a-zA-Z0-9-]+)"
tests = [
    'test@hotmail.com',
    'user@test.gc.ca',
    'I am at test@test.profile.',
]
for test in tests:
    found = re.search(pattern, test)
    print(found.group(1))
In [118]:
# detect urls
# Raw string so the regex escapes (\/, \w, \-, \., \+, \@) are not read as
# invalid Python string escapes; the pattern text itself is unchanged.
# Groups: 1 = scheme, 2 = host, 3 = optional path/query/fragment.
pattern = r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
tests = ['http://test.com']
for test in tests:
    print(re.search(pattern, test).group())
In [119]:
# validate usernames or password
# min 3, max 16 characters; allowed: lowercase letters, numbers, "_" and "-"
# \Z anchors at the true end of the string. "$" would also succeed just
# before a trailing newline, letting a value such as "abc" + newline pass.
pattern = r'^[a-z0-9_-]{3,16}\Z'
validator = re.compile(pattern)
good_example = 'my-us3r_n4m3'
bad_example = 'my-us3r.n4m3'
print('good_example', validator.match(good_example))
print('bad_example', validator.match(bad_example))
In [120]:
# validate hex value
# Optional leading "#", then exactly six or exactly three lowercase hex digits.
# \Z anchors at the true end of the string. "$" would also succeed just
# before a trailing newline, so "#a3c113" + newline would pass validation.
pattern = r'^#?([a-f0-9]{6}|[a-f0-9]{3})\Z'
validator = re.compile(pattern)
good_example = '#a3c113'
bad_example = '#4d82h4'
print('good_example', validator.match(good_example))
print('bad_example', validator.match(bad_example))
In [121]:
# validate ip
# Four dot-separated octets, each limited to 0-255 by the alternation
# 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]?.
# Raw string: "\." is a regex escape, not a Python string escape.
# \Z anchors at the true end of the string. "$" would also succeed just
# before a trailing newline.
pattern = r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\Z'
validator = re.compile(pattern)
good_example = '73.60.124.136'
bad_example = '256.60.124.136'
print('good_example', validator.match(good_example))
print('bad_example', validator.match(bad_example))
In [122]:
# remove all tags
# NOTE(review): the sample markup in this cell appears to have been stripped
# when the notebook was exported; the HTML below is an illustrative stand-in.
# Please confirm against the original notebook.
test = '<h1>Title</h1> '
print(re.sub('<[^>]*>', '', test))

# detect urls
test2 = """
<a href="http://test.com">Test site</a>
<a href="https://example.org/docs">Docs</a>
"""
# One (href value, link text) tuple per anchor tag.
pattern = r'<a[^>]*href=\"([^>]*)\"[^>]*>(.*?)</a>'
print(re.findall(pattern, test2))
In [123]:
# find words with a given ending: here, every word that ends in "ly"
text = 'Literally, I’m really hungry now. Actually, I didn’t have a lot for breakfast.'
# \b on both sides keeps the match to whole words.
adverbs = re.compile(r'\b\w*ly\b').findall(text)
print(adverbs)
In [124]:
# validate US phone numbers
# Groups: 1 = area code, 2 = exchange, 3 = line number.
# Raw string: "\(" and "\)" are regex escapes, not Python string escapes.
# \Z anchors the end so that extra trailing characters (more digits, spaces)
# make the validation fail instead of being silently ignored by match().
pattern = r'\(?([2-9][0-8][0-9])\)?[-.● ]?([2-9][0-9]{2})[-.●]?([0-9]{4})\Z'
validator = re.compile(pattern)
good_example1 = '(541) 754-3010'
good_example2 = '541-754-3010'
good_example3 = '5417543010'
bad_example = '134-911-5678 '
print('good_example', validator.match(good_example1))
print('good_example', validator.match(good_example2))
print('good_example', validator.match(good_example3))
print('bad_example', validator.match(bad_example))