In [2]:
# Load the iris dataset from a public GitHub mirror.
# NOTE(review): this is a network fetch — consider caching a local copy
# so the notebook survives Restart & Run All when offline.
iris = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
In [6]:
# Preview the first rows to confirm the load and see the column names.
iris.head()
Out[6]:
Computing Mean¶
In [5]:
# Mean petal length over all rows (Series.mean() is equivalent to np.mean).
mean_petal = iris["petal_length"].mean()
print("Mean", mean_petal)
Computing Percentiles¶
In [7]:
def ecdf(data):
    """Return the empirical CDF of a one-dimensional sample.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of measurements.

    Returns
    -------
    (x, y) : tuple of ndarray
        ``x`` is the sample sorted ascending (the support points) and
        ``y`` holds the cumulative fractions 1/n, 2/n, ..., 1.
    """
    sample = np.asarray(data)
    size = len(sample)
    # Sorted values on the x-axis; each point carries weight 1/size.
    support = np.sort(sample)
    cumulative = np.arange(1, size + 1) / size
    return support, cumulative
In [8]:
# Plot the ECDF of petal length as unconnected points.
x,y = ecdf(iris["petal_length"])
plt.plot(x,y, marker = '.', linestyle = 'none')
plt.margins(0.02)  # small padding so edge points are not clipped
plt.xlabel('petal length (cm)')
plt.ylabel("ECDF")
Out[8]:
In [11]:
# Percentile ranks of interest (in percent) and the corresponding
# petal-length values; `val` is reused by the overlay plot below.
percentiles = np.array([2.5,5,10,20,50,75,90])
val = np.percentile(iris["petal_length"], percentiles)
print(val)
In [15]:
# Re-draw the ECDF and overlay the computed percentiles as red diamonds.
x,y = ecdf(iris["petal_length"])
plt.plot(x,y, marker = '.', linestyle = 'none')
# percentile ranks are in [0, 100]; the ECDF y-axis is in [0, 1], hence /100
plt.plot(val, percentiles/100, marker ='D', color = 'red', linestyle='none')
plt.margins(0.02)
plt.xlabel('petal length (cm)')
plt.ylabel("ECDF")
Out[15]:
Box plot¶
In [17]:
# Box plot of petal length per species.
# Fix: seaborn deprecated positional x/y arguments in 0.11 and removed
# them in 0.12+ — they must be passed as keywords.
sns.boxplot(x="species", y="petal_length", data=iris)
Out[17]:
Computing variance¶
In [32]:
# Overall variance of petal length, plus a per-species breakdown.
# Fix: np.var defaults to the population variance (ddof=0), while the
# pivot table's aggfunc='var' uses the sample variance (ddof=1), so the
# two numbers were not comparable. Use ddof=1 for both.
total_var = np.var(iris["petal_length"], ddof=1)
print(total_var)
pd.pivot_table(iris, index="species",values="petal_length", aggfunc='var' )
Out[32]:
Computing standard deviation¶
In [33]:
# Overall standard deviation of petal length, plus a per-species breakdown.
# Fix: np.std defaults to ddof=0, while the pivot table's aggfunc='std'
# uses ddof=1 — use ddof=1 for both so the figures are comparable.
total_std = np.std(iris["petal_length"], ddof=1)
print(total_std)
pd.pivot_table(iris, index="species",values="petal_length", aggfunc='std' )
Out[33]:
Understanding spread distributions¶
Let's consider two datasets, "a" and "b".
In [20]:
# Two small datasets constructed to have the same mean (10) but a very
# different spread.
a = np.array([-10,0,10,20,30])
b = np.array([8,9,10,11,12])
Computing mean for "a" and "b"¶
In [21]:
# ndarray.mean() — equivalent to np.mean(a) / np.mean(b).
mean_a = a.mean()
mean_b = b.mean()
In [22]:
# Display the mean of "a" (= 10.0).
mean_a
Out[22]:
In [23]:
# Display the mean of "b" (= 10.0) — identical to the mean of "a".
mean_b
Out[23]:
Here, the mean is the same for both datasets, but on closer inspection it is clear that dataset "b" is clustered tightly around the mean while dataset "a" is much more dispersed.
Range of the dataset¶
Range is the difference between the largest value and the smallest value in the dataset.
In [27]:
# np.ptp ("peak to peak") computes max - min in a single call.
range_a = np.ptp(a)
range_b = np.ptp(b)
In [28]:
# Range of "a": 30 - (-10) = 40.
range_a
Out[28]:
In [29]:
# Range of "b": 12 - 8 = 4 — far smaller than the range of "a".
range_b
Out[29]:
Variance of the dataset¶
Variance is the average of the squared distances from each data point to the mean of the dataset.
In [30]:
# np.var computes the population variance (ddof=0) by default.
var_a = np.var(a)
var_b = np.var(b)
In [31]:
# "a" has a much larger variance than "b", confirming greater spread.
print(var_a)
print(var_b)
By comparing the variances of the two datasets, we can confirm that dataset "b" is less dispersed.
Standard deviation of the dataset¶
Standard deviation is the square root of the variance
In [34]:
# np.std computes the population standard deviation (ddof=0) by default,
# i.e. the square root of np.var above.
std_a = np.std(a)
std_b = np.std(b)
In [35]:
# Standard deviations in the original units — "a" is again far more spread.
print(std_a)
print(std_b)
Correlation between two variables¶
In [37]:
# Scatter of petal width against petal length.
# Fix: label the axes so the figure stands alone when skimmed.
plt.scatter(iris["petal_length"], iris["petal_width"])
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
Out[37]:
Computing the covariance matrix¶
In [39]:
# 2x2 covariance matrix of petal length and petal width.
cov = np.cov(iris["petal_length"], iris["petal_width"])
print(cov)
Here, entry [0,0] is the variance of the first variable (petal length) and [1,1] is the variance of the second (petal width). The off-diagonal entries [0,1] and [1,0] are the covariance between the two variables.
In [45]:
# Heatmap of the covariance matrix; annot=True prints each value in its cell.
sns.heatmap(cov, annot=True)
Out[45]:
Computing the pearson correlation coefficient¶
In [48]:
# 2x2 Pearson correlation matrix; diagonal entries are 1 by construction.
corr = np.corrcoef(iris["petal_length"], iris["petal_width"])
print(corr)
In [49]:
# Heatmap of the correlation matrix (no annotations here, unlike below).
sns.heatmap(corr)
Out[49]:
Another way¶
In [42]:
# Same correlation computed directly with pandas DataFrame.corr().
# NOTE(review): this reuses the name `corr` for a DataFrame (it was an
# ndarray above) — consider a distinct name to avoid hidden-state confusion.
corr=iris[['petal_length','petal_width']].corr()
print(corr)
In [44]:
# square=True keeps the cells square; annot=True prints the coefficients.
sns.heatmap(corr, square=True,annot=True)
Out[44]:
No comments :
Post a Comment