Friday, February 3, 2017

Empirical Cumulative Distribution Function



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'
In [5]:
# Load the iris dataset from a CSV hosted in the uiuc-cse/data-fa14 GitHub
# repository (this cell needs network access).
iris = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
In [6]:
# Preview the first rows to check the column names and value ranges.
iris.head()
Out[6]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [10]:
# Number of rows (observations); used below to choose the histogram bin count.
iris.shape[0]
Out[10]:
150
In [8]:
# Histogram of petal length with no `bins` argument, i.e. matplotlib's default
# binning (the recorded output shows 10 bins).
plt.hist(iris["petal_length"])
Out[8]:
(array([ 37.,  13.,   0.,   3.,   8.,  26.,  29.,  18.,  11.,   5.]),
 array([ 1.  ,  1.59,  2.18,  2.77,  3.36,  3.95,  4.54,  5.13,  5.72,
         6.31,  6.9 ]),
 <a list of 10 Patch objects>)

Square root rule

A common rule of thumb is to set the number of bins to the square root of the number of samples.
In [12]:
# Square-root rule: number of bins = int(sqrt(number of rows)).
# With 150 rows this truncates sqrt(150) ~= 12.25 down to 12 bins.
plt.hist(iris["petal_length"],bins=int(np.sqrt(iris.shape[0])))
Out[12]:
(array([ 23.,  27.,   0.,   0.,   3.,   8.,  18.,  25.,  18.,  17.,   7.,
          4.]),
 array([ 1.        ,  1.49166667,  1.98333333,  2.475     ,  2.96666667,
         3.45833333,  3.95      ,  4.44166667,  4.93333333,  5.425     ,
         5.91666667,  6.40833333,  6.9       ]),
 <a list of 12 Patch objects>)

Swarm plot

In [13]:
# Swarm plot of petal length, one column of points per species.
sns.swarmplot(x = "species", y = "petal_length", data = iris)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ce6f550>

Empirical Cumulative Distribution Function

In [14]:
def ecdf(data):
    """Compute the empirical cumulative distribution function (ECDF).

    Parameters
    ----------
    data : array-like
        Measurements; anything ``np.asarray`` accepts (list, NumPy array,
        pandas Series, ...). Input with more than one dimension is
        flattened, so every value counts as one observation.

    Returns
    -------
    x : numpy.ndarray
        All measurements as a 1-D array sorted in ascending order.
    y : numpy.ndarray
        1-D float array of the same length as ``x`` with
        ``y[i] = (i + 1) / n``, where ``n`` is the number of measurements.
        Both arrays are empty when ``data`` is empty.
    """
    # axis=None flattens before sorting, so x is always 1-D. The default
    # (sort along the last axis) would return a 2-D array for 2-D input.
    x = np.sort(np.asarray(data), axis=None)

    # Take n from the sorted array rather than len(data) so that x and y are
    # guaranteed to have the same length whatever the input shape was.
    n = x.size

    # Float dtype keeps this a true division regardless of the interpreter's
    # integer-division rules; the values run from 1/n up to exactly 1.0.
    y = np.arange(1, n + 1, dtype=float) / n

    return x, y

ECDF plot for petal length

In [16]:
# x: petal lengths sorted ascending; y: the matching ECDF heights (1/n ... 1).
x,y = ecdf(iris["petal_length"])
In [24]:
# Draw the ECDF as individual dots: one marker per observation and no
# connecting line.
plt.plot(x,y, marker = '.', linestyle = 'none')
# Add a small margin around the data so the extreme points are not drawn
# right on the axes frame.
plt.margins(0.02)
plt.xlabel('petal length (cm)')
plt.ylabel("ECDF")
Out[24]:
<matplotlib.text.Text at 0x11e7d9978>

No comments :

Post a Comment