NumPy by Example

Share on:

Table of Contents

This is a succinct NumPy guide by example.

Updates:

  • 2022-01-27 Added Percentiles
  • 2022-01-25 Added Random vs Normal vs Gamma Distribution

Array Essentials

Module Import

 1# Numpy module
 2import numpy as np
 3
 4# Only used for statistics
 5import scipy.stats as stats
 6
 7# Only for pretty table formatting in Jupyter Notebook
 8from IPython.display import HTML, display
 9import tabulate
10from matplotlib import pyplot

Declaring an Array

1# one row, three columns
2np.array([1,2,3])
array([1, 2, 3])

Declaring a Matrix

1# two rows, three columns
2np.array([[1,2,3],
3          [4,5,6]])
array([[1, 2, 3],
       [4, 5, 6]])

Array’s Dimensions

1np.array([1,2,3]).ndim
1
1np.array([[1,2,3],[4,5,6]]).ndim
2

Array’s Rows and Columns

1# one row, three columns
2np.array([1,2,3]).shape
(3,)
1# two rows, three columns
2np.array([[1,2,3],[4,5,6]]).shape
(2, 3)

Downcasting elements’ type

1# Convert numbers to unsigned 8-bit ints
2np.array([1.5, 300.0, -5]).astype(np.uint8)
array([  1,  44, 251], dtype=uint8)

Referencing Elements by Index

1# 2 x 2 matrix
2a = np.array([[1,2],
3              [3,4]])
1# row 0, column 0
2a[0,0]
1
1# row 1, column 0
2a[1,0]
3
1# row 1, column 1
2a[1,1]
4

Slicing (Selecting Columns)

1a = np.array([[1,2,3],
2              [4,5,6],
3              [7,8,9]])
1# all rows, just column 1 (as array)
2a[:,1]
array([2, 5, 8])
1# all rows, just column 1 (as rows containing a single column)
2a[:,[1]]
array([[2],
       [5],
       [8]])
1# all rows, columns starting from 1
2a[:,1:]
array([[2, 3],
       [5, 6],
       [8, 9]])
1# all rows, specific columns 1 and 2
2a[:,[1,2]]
array([[2, 3],
       [5, 6],
       [8, 9]])
1# all rows, columns from 0 to <2
2a[:,0:2]
array([[1, 2],
       [4, 5],
       [7, 8]])

Slicing (Selecting Rows)

1a = np.array([[1,2,3],
2              [4,5,6],
3              [7,8,9]])
1# specific row 1 (as array), all columns
2a[1,]
array([4, 5, 6])
1# specific row 1 (as one row), all columns
2a[[1],]
array([[4, 5, 6]])
1# specific rows 0 and 2, all columns
2a[[0,2],]
array([[1, 2, 3],
       [7, 8, 9]])
1# rows from 0 to <2, all columns
2a[0:2,]
array([[1, 2, 3],
       [4, 5, 6]])

Array Generation

With Zeros

1# 2 rows by 3 columns
2np.zeros((2,3),dtype=int)
array([[0, 0, 0],
       [0, 0, 0]])

With Ones

1# 2 rows by 3 columns
2np.ones((2,3),dtype=int)
array([[1, 1, 1],
       [1, 1, 1]])

With Random Int Numbers

1# 2 rows by 3 columns
2np.random.randint(5,size=(2,3))
array([[3, 3, 3],
       [1, 0, 0]])

With Random Float Numbers

1# 2 rows by 3 columns
2np.random.rand(2,3)
array([[0.68481901, 0.62088938, 0.04702247],
       [0.274671  , 0.24961526, 0.00312122]])

Random vs Normal vs Gamma Distributions

Gaussian Distributions

 1# Random numbers -5.0 <= n < +5.0
 2a1 = np.random.random(size=1000)
 3a1 = (a1 * 10) - 5 
 4
 5# Normal distribution around 0 with a standard deviation of 1
 6a2 = np.random.normal(loc=0.0, scale=1.0, size=1000)
 7
 8# Gamma distribution 
 9a3 = np.random.gamma(2, scale=1.0, size=1000)
10
11print("       Len      Std       Mean      Min      Max ")
12for (name,a) in [("Random",a1),("Normal",a2),("Gamma ",a3)]:
13    print("{} {} {:10.4f}{:10.4f}{:10.4f}{:10.4f}".format(name,len(a),a.std(),a.mean(),a.min(),a.max()))
14
15
16# Plot rendering only
17pyplot.ioff()
18fig = pyplot.figure(figsize=(10,8))
19one = pyplot.subplot(3, 2, 1)
20one.set_ylabel("Random")
21pyplot.scatter(range(0,1000),a1,s=1)
22
23two = pyplot.subplot(3, 2, 2)
24pyplot.hist(a1,bins=50)
25
26three = pyplot.subplot(3, 2, 3)
27three.set_ylabel("Normal")
28pyplot.scatter(range(0,1000),a2,s=1)
29
30four = pyplot.subplot(3, 2, 4, sharex=two)
31pyplot.hist(a2,bins=50)
32
33five = pyplot.subplot(3, 2, 5)
34five.set_ylabel("Gamma")
35pyplot.scatter(range(0,1000),a3,s=1)
36
37six = pyplot.subplot(3, 2, 6, sharex=two)
38pyplot.hist(a3,bins=50)
39
40pyplot.savefig("plot_gaussian.png")
41pyplot.close(fig)
       Len      Std       Mean      Min      Max 
Random 1000     2.8658   -0.0703   -4.9983    4.9888
Normal 1000     0.9881    0.0417   -2.8302    2.8781
Gamma  1000     1.4814    2.0395    0.0178   10.6693

With a Range of Integer Numbers

1# from 1 to <10
2np.arange(1,10)
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
1# from 1 to <10 in +2 increments
2np.arange(1,10,2)
array([1, 3, 5, 7, 9])

With a Range of Float Numbers

1# 5 evenly spaced values from 0 to 1 (inclusive)
2np.linspace(0,1,5)
array([0.  , 0.25, 0.5 , 0.75, 1.  ])

With a Specific Value

1# two rows by three columns, all cells filled with 5
2np.full((2,3),5)
array([[5, 5, 5],
       [5, 5, 5]])

Array Arithmetic

Addition

1# element-wise addition
2a = np.array([2,4,6])
3b = np.array([1,2,3])
4a+b
array([3, 6, 9])

Subtraction

1# element-wise subtraction
2a = np.array([2,4,6])
3b = np.array([1,2,3])
4a-b
array([1, 2, 3])

Product (Element)

1# element-wise product
2a = np.array([2,4,6])
3b = np.array([1,2,3])
4a*b
array([ 2,  8, 18])

Product (Matrix)

1np.array([1,2])@np.array([[2,3],[4,5]])
array([10, 13])

Element-wise Arithmetic

1a = np.array([2,4,6])
2a*3
array([ 6, 12, 18])

Array Predicates

Apply Predicate to Elements

1# check which elements are divisible by two
2a = np.array([1,2,3,4,5])
3l = a % 2 == 0
4l
array([False,  True, False,  True, False])
1# use 'l' as a filter to obtain even numbers
2# equivalent to a[a % 2 == 0]
3a[l]
array([2, 4])

Array Aggregations

Sum

1np.array([1,2,3]).sum()
6

Max

1np.array([1,2,3]).max()
3

Min

1np.array([1,2,3]).min()
1

Mean

1np.array([1,2,3]).mean()
2.0

CSV Files

Loading CSV Data

Sample file used for example

resources/countries_small.csv

1# this is just to show the raw contents of the file
2!cat resources/countries_small.csv 
"country","population","gdp_in_trillions"
"China",1439323776,12.238
"India",1380004385,2.651
"USA",331002651,19.485
1# just the data, ignore the headers
2np.genfromtxt('resources/countries_small.csv',delimiter=',',skip_header=1,usecols=(1,2))
array([[1.43932378e+09, 1.22380000e+01],
       [1.38000438e+09, 2.65100000e+00],
       [3.31002651e+08, 1.94850000e+01]])
1# specify the headers so they can be used to reference columns
2a = np.genfromtxt('resources/countries_small.csv',delimiter=',', \
3       skip_header=1,names=('country','population','gdp'),dtype=None,encoding=None)
4print(a['population'])
[1439323776 1380004385  331002651]

Pearson Correlation

The Pearson coefficient measures the linear correlation between two data sets.

Here we use Python lists, rather than NumPy arrays, for simplicity.

100% Positive Correlation

In this case any positive change in data set (a) results also in a positive change in data set (b) and vice versa.

Positive Correlation

 1a = [0,1,2,3]
 2b = [5,6,7,8]
 3display(stats.pearsonr(a,b)) # (pearson coefficient, p-value)
 4
 5# Plot rendering only
 6pyplot.ioff()
 7fig = pyplot.figure()
 8pyplot.plot(a)
 9pyplot.plot(b)
10pyplot.savefig("plot_p.png")
11pyplot.close(fig)
(1.0, 0.0)

100% Negative Correlation

In this case, every positive change in data set (a) results in an equivalent negative change in data set (b) and vice versa.

Negative Correlation

 1a = [10,9,8,7]
 2b = [1,2,3,4]
 3display(stats.pearsonr(a,b)) # (pearson coefficient, p-value)
 4
 5# Plot rendering only
 6pyplot.ioff()
 7fig = pyplot.figure()
 8pyplot.plot(a)
 9pyplot.plot(b)
10pyplot.savefig("plot_n.png")
11pyplot.close(fig)
(-1.0, 0.0)

No Correlation

In this case positive and negative changes are equally balanced out so there is no correlation between data sets (b) and (a).

No Correlation

 1a = [0,1,2,3,4]
 2b = [6,7,8,7,6]
 3display(stats.pearsonr(a,b)) # (pearson coefficient, p-value)
 4
 5# Plot rendering only
 6pyplot.ioff()
 7fig = pyplot.figure()
 8pyplot.plot(a)
 9pyplot.plot(b)
10pyplot.savefig("plot_nc.png")
11pyplot.close(fig)
(0.0, 1.0000000000000002)

T Test (Related Samples)

This is a two-sided test for the null hypothesis that two related (paired) samples have identical average values.

Here the samples are NumPy arrays generated with np.random.randint and np.arange.

P-Value for Two Sets of Random Data

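In theory, two sets of random data drawn from the same distribution should have similar average values, so the test should usually not reject the null hypothesis (the p-value is typically above a significance level such as 0.05). In practice, two specific samples of random numbers may still differ noticeably by chance.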

Random Data

 1a = np.random.randint(1,high=50,size=100)
 2b = np.random.randint(1,high=50,size=100)
 3display(stats.ttest_rel(a,b)) # P-Value
 4
 5# Plot rendering only
 6pyplot.ioff()
 7fig = pyplot.figure()
 8pyplot.plot(a)
 9pyplot.plot(b)
10pyplot.savefig("plot_random.png")
11pyplot.close(fig)
Ttest_relResult(statistic=1.55684703917969, pvalue=0.12269795013854484)

P-Value for Non-Random Data

Here, one data set is random, but the other one is a linearly increasing sequence (0 to 99).

One Set is Non-Random

 1a = np.random.randint(1,high=50,size=100)
 2b = np.arange(0,100)
 3display(stats.ttest_rel(a,b)) # P-Value
 4
 5# Plot rendering only
 6pyplot.ioff()
 7fig = pyplot.figure()
 8pyplot.plot(a)
 9pyplot.plot(b)
10pyplot.savefig("plot_one-random.png")
11pyplot.close(fig)
Ttest_relResult(statistic=-8.324218327435583, pvalue=4.824950835820826e-13)

Percentiles (Useful for Quartiles)

1# First, let's get some intuition 
2a = np.array([0,10])
3print("Q0/Lower extreme:  {}".format(np.percentile(a, 0)))
4print("Q1/Lower quartile: {}".format(np.percentile(a, 25)))
5print("Q2/Median:         {}".format(np.percentile(a, 50)))
6print("Q3/Upper quartile: {}".format(np.percentile(a, 75)))
7print("Q4/Upper extreme:  {}".format(np.percentile(a, 100)))
Q0/Lower extreme:  0.0
Q1/Lower quartile: 2.5
Q2/Median:         5.0
Q3/Upper quartile: 7.5
Q4/Upper extreme:  10.0
1# Random Values between 0.0 and 1.0
2a = np.random.random(size=10000)
3print("Q0/Lower extreme:  {}".format(np.percentile(a, 0)))
4print("Q1/Lower quartile: {}".format(np.percentile(a, 25)))
5print("Q2/Median:         {}".format(np.percentile(a, 50)))
6print("Q3/Upper quartile: {}".format(np.percentile(a, 75)))
7print("Q4/Upper extreme:  {}".format(np.percentile(a, 100)))
Q0/Lower extreme:  8.472738043241446e-05
Q1/Lower quartile: 0.24406258129052405
Q2/Median:         0.49125699828327746
Q3/Upper quartile: 0.7530066592119979
Q4/Upper extreme:  0.9998353691622411
1# Normal Distribution
2a = np.random.normal(size=10000)
3print("Q0/Lower extreme:  {}".format(np.percentile(a, 0)))
4print("Q1/Lower quartile: {}".format(np.percentile(a, 25)))
5print("Q2/Median:         {}".format(np.percentile(a, 50)))
6print("Q3/Upper quartile: {}".format(np.percentile(a, 75)))
7print("Q4/Upper extreme:  {}".format(np.percentile(a, 100)))
Q0/Lower extreme:  -3.626710454253036
Q1/Lower quartile: -0.653462270645143
Q2/Median:         0.021550021041700396
Q3/Upper quartile: 0.6692707642609506
Q4/Upper extreme:  3.847333004930314

Before You Leave

🤘 Subscribe to my 100% spam-free newsletter!

website counters