NumPy by Example

Table of Contents
This is a succinct NumPy guide by example.
Updates:
- 2022-01-27 Added Percentiles
- 2022-01-25 Added Random vs Normal vs Gamma Distribution
Array Essentials
Module Import
1# Numpy module
2import numpy as np
3
4# Only used for statistics
5import scipy.stats as stats
6
7# Only for pretty table formatting in Jupyter Notebook
8from IPython.display import HTML, display
9import tabulate
10from matplotlib import pyplot
Declaring an Array
1# one-dimensional array with three elements
2np.array([1,2,3])
array([1, 2, 3])
Declaring a Matrix
1# two rows, three columns
2np.array([[1,2,3],
3 [4,5,6]])
array([[1, 2, 3],
[4, 5, 6]])
Array’s Dimensions
1np.array([1,2,3]).ndim
1
1np.array([[1,2,3],[4,5,6]]).ndim
2
Array’s Rows and Columns
1# one-dimensional array with three elements
2np.array([1,2,3]).shape
(3,)
1# two rows, three columns
2np.array([[1,2,3],[4,5,6]]).shape
(2, 3)
Downcasting elements’ type
1# Convert numbers to unsigned 8-bit ints
2np.array([1.5, 300.0, -5]).astype(np.uint8)
array([ 1, 44, 251], dtype=uint8)
Referencing Elements by Index
1# 2 x 2 matrix
2a = np.array([[1,2],
3 [3,4]])
1# row 0, column 0
2a[0,0]
1
1# row 1, column 0
2a[1,0]
3
1# row 1, column 1
2a[1,1]
4
Slicing (Selecting Columns)
1a = np.array([[1,2,3],
2 [4,5,6],
3 [7,8,9]])
1# all rows, just column 1 (as array)
2a[:,1]
array([2, 5, 8])
1# all rows, just column 1 (as rows containing a single column)
2a[:,[1]]
array([[2],
[5],
[8]])
1# all rows, columns starting from 1
2a[:,1:]
array([[2, 3],
[5, 6],
[8, 9]])
1# all rows, specific columns 1 and 2
2a[:,[1,2]]
array([[2, 3],
[5, 6],
[8, 9]])
1# all rows, columns from 0 to <2
2a[:,0:2]
array([[1, 2],
[4, 5],
[7, 8]])
Slicing (Selecting Rows)
1a = np.array([[1,2,3],
2 [4,5,6],
3 [7,8,9]])
1# specific row 1 (as array), all columns
2a[1,]
array([4, 5, 6])
1# specific row 1 (as one row), all columns
2a[[1],]
array([[4, 5, 6]])
1# specific rows 0 and 2, all columns
2a[[0,2],]
array([[1, 2, 3],
[7, 8, 9]])
1# rows from 0 to <2, all columns
2a[0:2,]
array([[1, 2, 3],
[4, 5, 6]])
Array Generation
With Zeros
1# 2 rows by 3 columns
2np.zeros((2,3),dtype=int)
array([[0, 0, 0],
[0, 0, 0]])
With Ones
1# 2 rows by 3 columns
2np.ones((2,3),dtype=int)
array([[1, 1, 1],
[1, 1, 1]])
With Random Int Numbers
1# 2 rows by 3 columns
2np.random.randint(5,size=(2,3))
array([[3, 3, 3],
[1, 0, 0]])
With Random Float Numbers
1# 2 rows by 3 columns
2np.random.rand(2,3)
array([[0.68481901, 0.62088938, 0.04702247],
[0.274671 , 0.24961526, 0.00312122]])
Random vs Normal vs Gamma Distributions
1# Random numbers -5.0 <= n < +5.0
2a1 = np.random.random(size=1000)
3a1 = (a1 * 10) - 5
4
5# Normal distribution around 0 with a standard deviation of 1
6a2 = np.random.normal(loc=0.0, scale=1.0, size=1000)
7
8# Gamma distribution
9a3 = np.random.gamma(2, scale=1.0, size=1000)
10
11print(" Len Std Mean Min Max ")
12for (name,a) in [("Random",a1),("Normal",a2),("Gamma ",a3)]:
13 print("{} {} {:10.4f}{:10.4f}{:10.4f}{:10.4f}".format(name,len(a),a.std(),a.mean(),a.min(),a.max()))
14
15
16# Plot rendering only
17pyplot.ioff()
18fig = pyplot.figure(figsize=(10,8))
19one = pyplot.subplot(3, 2, 1)
20one.set_ylabel("Random")
21pyplot.scatter(range(0,1000),a1,s=1)
22
23two = pyplot.subplot(3, 2, 2)
24pyplot.hist(a1,bins=50)
25
26three = pyplot.subplot(3, 2, 3)
27three.set_ylabel("Normal")
28pyplot.scatter(range(0,1000),a2,s=1)
29
30four = pyplot.subplot(3, 2, 4, sharex=two)
31pyplot.hist(a2,bins=50)
32
33five = pyplot.subplot(3, 2, 5)
34five.set_ylabel("Gamma")
35pyplot.scatter(range(0,1000),a3,s=1)
36
37six = pyplot.subplot(3, 2, 6, sharex=two)
38pyplot.hist(a3,bins=50)
39
40pyplot.savefig("plot_gaussian.png")
41pyplot.close(fig)
Len Std Mean Min Max
Random 1000 2.8658 -0.0703 -4.9983 4.9888
Normal 1000 0.9881 0.0417 -2.8302 2.8781
Gamma 1000 1.4814 2.0395 0.0178 10.6693
With a Range of Integer Numbers
1# from 1 to <10
2np.arange(1,10)
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
1# from 1 to <10 in +2 increments
2np.arange(1,10,2)
array([1, 3, 5, 7, 9])
With a Range of Float Numbers
1# 5 evenly spaced values from 0 to 1 (inclusive)
2np.linspace(0,1,5)
array([0. , 0.25, 0.5 , 0.75, 1. ])
With a Specific Value
1# two rows by three columns, all cells filled with 5
2np.full((2,3),5)
array([[5, 5, 5],
[5, 5, 5]])
Array Arithmetic
Addition
1# element-wise addition
2a = np.array([2,4,6])
3b = np.array([1,2,3])
4a+b
array([3, 6, 9])
Subtraction
1# element-wise subtraction
2a = np.array([2,4,6])
3b = np.array([1,2,3])
4a-b
array([1, 2, 3])
Product (Element)
1# element-wise product
2a = np.array([2,4,6])
3b = np.array([1,2,3])
4a*b
array([ 2,  8, 18])
Product (Matrix)
1np.array([1,2])@np.array([[2,3],[4,5]])
array([10, 13])
Element-wise Arithmetic
1a = np.array([2,4,6])
2a*3
array([ 6, 12, 18])
Array Predicates
Apply Predicate to Elements
1# check which elements are divisible by two
2a = np.array([1,2,3,4,5])
3l = a % 2 == 0
4l
array([False, True, False, True, False])
1# use 'l' as a filter to obtain even numbers
2# equivalent to a[a % 2 == 0]
3a[l]
array([2, 4])
Array Aggregations
Sum
1np.array([1,2,3]).sum()
6
Max
1np.array([1,2,3]).max()
3
Min
1np.array([1,2,3]).min()
1
Mean
1np.array([1,2,3]).mean()
2.0
CSV Files
Loading CSV Data
Sample file used for example
1# this is just to show the raw contents of the file
2!cat resources/countries_small.csv
"country","population","gdp_in_trillions"
"China",1439323776,12.238
"India",1380004385,2.651
"USA",331002651,19.485
1# just the data, ignore the headers
2np.genfromtxt('resources/countries_small.csv',delimiter=',',skip_header=1,usecols=(1,2))
array([[1.43932378e+09, 1.22380000e+01],
[1.38000438e+09, 2.65100000e+00],
[3.31002651e+08, 1.94850000e+01]])
1# specify the headers so they can be used to reference columns
2a = np.genfromtxt('resources/countries_small.csv',delimiter=',', \
3 skip_header=1,names=('country','population','gdp'),dtype=None,encoding=None)
4print(a['population'])
[1439323776 1380004385 331002651]
Pearson Correlation
The Pearson coefficient helps determine the correlation between two data sets.
Here we use plain Python lists, rather than NumPy arrays, for simplicity.
100% Positive Correlation
In this case any positive change in data set (a) results also in a positive change in data set (b) and vice versa.
1a = [0,1,2,3]
2b = [5,6,7,8]
3display(stats.pearsonr(a,b)) # (pearson coefficient, p-value)
4
5# Plot rendering only
6pyplot.ioff()
7fig = pyplot.figure()
8pyplot.plot(a)
9pyplot.plot(b)
10pyplot.savefig("plot_p.png")
11pyplot.close(fig)
(1.0, 0.0)
100% Negative Correlation
In this case, every positive change in data set (a) results in an equivalent negative change in data set (b) and vice versa.
1a = [10,9,8,7]
2b = [1,2,3,4]
3display(stats.pearsonr(a,b)) # (pearson coefficient, p-value)
4
5# Plot rendering only
6pyplot.ioff()
7fig = pyplot.figure()
8pyplot.plot(a)
9pyplot.plot(b)
10pyplot.savefig("plot_n.png")
11pyplot.close(fig)
(-1.0, 0.0)
No Correlation
In this case positive and negative changes are equally balanced out so there is no correlation between data sets (b) and (a).
1a = [0,1,2,3,4]
2b = [6,7,8,7,6]
3display(stats.pearsonr(a,b)) # (pearson coefficient, p-value)
4
5# Plot rendering only
6pyplot.ioff()
7fig = pyplot.figure()
8pyplot.plot(a)
9pyplot.plot(b)
10pyplot.savefig("plot_nc.png")
11pyplot.close(fig)
(0.0, 1.0000000000000002)
T-Test (Related Samples)
This is a two-sided test for the null hypothesis that two samples have identical average values.
Here the samples are NumPy arrays.
P-Value for Two Sets of Random Data
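In theory, two sets of random data drawn from the same distribution should have similar average values, so the test will usually yield a p-value well above the common 0.05 significance threshold. In practice, any two specific samples of random numbers may still differ noticeably by chance.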
1a = np.random.randint(1,high=50,size=100)
2b = np.random.randint(1,high=50,size=100)
3display(stats.ttest_rel(a,b)) # P-Value
4
5# Plot rendering only
6pyplot.ioff()
7fig = pyplot.figure()
8pyplot.plot(a)
9pyplot.plot(b)
10pyplot.savefig("plot_random.png")
11pyplot.close(fig)
Ttest_relResult(statistic=1.55684703917969, pvalue=0.12269795013854484)
P-Value for Non-Random Data
Here, one data set is random, but the other one is a deterministic, steadily increasing sequence (0 to 99).
1a = np.random.randint(1,high=50,size=100)
2b = np.arange(0,100)
3display(stats.ttest_rel(a,b)) # P-Value
4
5# Plot rendering only
6pyplot.ioff()
7fig = pyplot.figure()
8pyplot.plot(a)
9pyplot.plot(b)
10pyplot.savefig("plot_one-random.png")
11pyplot.close(fig)
Ttest_relResult(statistic=-8.324218327435583, pvalue=4.824950835820826e-13)
Percentiles (Useful for Quartiles)
1# First, let's get some intuition
2a = np.array([0,10])
3print("Q0/Lower extreme: {}".format(np.percentile(a, 0)))
4print("Q1/Lower quartile: {}".format(np.percentile(a, 25)))
5print("Q2/Median: {}".format(np.percentile(a, 50)))
6print("Q3/Upper quartile: {}".format(np.percentile(a, 75)))
7print("Q4/Upper extreme: {}".format(np.percentile(a, 100)))
Q0/Lower extreme: 0.0
Q1/Lower quartile: 2.5
Q2/Median: 5.0
Q3/Upper quartile: 7.5
Q4/Upper extreme: 10.0
1# Random Values between 0.0 and 1.0
2a = np.random.random(size=10000)
3print("Q0/Lower extreme: {}".format(np.percentile(a, 0)))
4print("Q1/Lower quartile: {}".format(np.percentile(a, 25)))
5print("Q2/Median: {}".format(np.percentile(a, 50)))
6print("Q3/Upper quartile: {}".format(np.percentile(a, 75)))
7print("Q4/Upper extreme: {}".format(np.percentile(a, 100)))
Q0/Lower extreme: 8.472738043241446e-05
Q1/Lower quartile: 0.24406258129052405
Q2/Median: 0.49125699828327746
Q3/Upper quartile: 0.7530066592119979
Q4/Upper extreme: 0.9998353691622411
1# Normal Distribution
2a = np.random.normal(size=10000)
3print("Q0/Lower extreme: {}".format(np.percentile(a, 0)))
4print("Q1/Lower quartile: {}".format(np.percentile(a, 25)))
5print("Q2/Median: {}".format(np.percentile(a, 50)))
6print("Q3/Upper quartile: {}".format(np.percentile(a, 75)))
7print("Q4/Upper extreme: {}".format(np.percentile(a, 100)))
Q0/Lower extreme: -3.626710454253036
Q1/Lower quartile: -0.653462270645143
Q2/Median: 0.021550021041700396
Q3/Upper quartile: 0.6692707642609506
Q4/Upper extreme: 3.847333004930314