Tuesday, January 17, 2017

Numpy and Pandas for 1 dimensional data


In [2]
:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

Numpy library

Numpy (Numerical python is very similar to python lists. We can do operations like list indexing,slicing in numpy. However, there are some differences in numpy compared to python lists. In Numpy, all the data should be of same datatype. It has some additional functions like mean(), std(). It can support multi-dimensional array.
In [3]:
# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])
In [4]:
print(countries)
['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina']
In [5]:
print(employment)
[ 55.70000076  51.40000153  50.5         75.69999695  58.40000153
  40.09999847  61.5         57.09999847  60.90000153  66.59999847
  60.40000153  68.09999847  66.90000153  53.40000153  48.59999847
  56.79999924  71.59999847  58.40000153  70.40000153  41.20000076]

Accessing elements

In [6]:
countries[1:5]
Out[6]:
array(['Albania', 'Algeria', 'Angola', 'Argentina'], 
      dtype='<U22')
In [7]:
countries[10]
Out[7]:
'Bahrain'

Element types

In [8]:
print(type(countries))
<class 'numpy.ndarray'>
In [9]:
print(countries.dtype)
<U22
In [10]:
print(np.array([1,2,3]).dtype)
int64
In [11]:
print(np.array([True,False,True]).dtype)
bool
In [12]:
print(np.array(['a','b','c']).dtype)
<U1
In [13]:
print(np.array(["hello","hi"]).dtype)
<U5
In [14]:
print(np.array(["hell","hi"]).dtype)
<U4

numpy functions.

In [15]:
print(employment.max())
print(employment.min())
print(employment.mean())
print(employment.std())
75.69999695
40.09999847
58.6850000385
9.33826911369

Highest employment rate country:

In [16]:
def max_employment(countries,eemployment):
    max_country = ''
    max_employment = 0
    for i in range(len(countries)):
        if employment[i] > max_employment:
            max_employment = employment[i]
            max_country = countries[i]
    print(max_country,max_employment)

max_employment(countries,employment)
Angola 75.69999695
In [17]:
print(employment.max())
max_pos = employment.argmax() #position of maximum value
print(countries[max_pos])
75.69999695
Angola

+ operator in numpy vs standard python

In numpy, vector is a list of numbers. If we add two vectors, it follows vector addition.
In [18]:
num1 = np.array([1,2,3])
num2 = np.array([4,5,6])
print(num1+num2)
[5 7 9]
In [19]:
num1 = [1,2,3]
num2 = [4,5,6]
print(num1+num2)
[1, 2, 3, 4, 5, 6]

standardizing an numpy array (Normalization)

$ z = \frac{x - \mu}{\sigma} $
where $\mu $ is mean, and $ \sigma $ is standard deviation.
In [20]:
arr = np.array([10,20,30,90,80,50,100,35,80])
print("mean = "+ str(arr.mean()))
print("std deviation = " + str(arr.std()))
arr = (arr - arr.mean())/arr.std()
print(arr)
mean = 55.0
std deviation = 31.2694383988
[-1.43910484 -1.11930376 -0.79950269  1.11930376  0.79950269 -0.15990054
  1.43910484 -0.63960215  0.79950269]

Index array in Numpy

In [21]:
a = [1,2,3,4,5]
b = [True,True,False,False,False]
print(a[b])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-21-5f4123f05ac8> in <module>()
      1 a = [1,2,3,4,5]
      2 b = [True,True,False,False,False]
----> 3 print(a[b])

TypeError: list indices must be integers or slices, not list
In [26]:
a = np.array(a)
b = np.array(b)
print(a[b])
[1 2]

+ vs +=

In [27]:
a = np.array([1,2,3])
b = a
a = a+ np.array([4,5,6])
print(b)
[1 2 3]
In [28]:
a = np.array([1,2,3])
b = a
a+=np.array([4,5,6])
print(b)
[5 7 9]

in-place vs not in-place

In [29]:
li = [1,2,3,4,5]
sl = li[0:3]
sl[0] = 100
print(li)
[1, 2, 3, 4, 5]
In [30]:
li = np.array([1,2,3,4,5,6])
sl = li[0:3]
sl[0] = 100
print(li)
[100   2   3   4   5   6]

Pandas demo

In [31]:
life_expectancy_values = [74.7,  75. ,  83.4,  57.6,  74.6,  75.4,  72.3,  81.5,  80.2,
                          70.3,  72.1,  76.4,  68.1,  75.2,  69.8,  79.4,  70.8,  62.7,
                          67.3,  70.6]

gdp_values = [ 1681.61390973,   2155.48523109,  21495.80508273,    562.98768478,
              13495.1274663 ,   9388.68852258,   1424.19056199,  24765.54890176,
              27036.48733192,   1945.63754911,  21721.61840978,  13373.21993972,
                483.97086804,   9783.98417323,   2253.46411147,  25034.66692293,
               3680.91642923,    366.04496652,   1175.92638695,   1132.21387981]
In [32]:
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)
In [33]:
print(life_expectancy)
0     74.7
1     75.0
2     83.4
3     57.6
4     74.6
5     75.4
6     72.3
7     81.5
8     80.2
9     70.3
10    72.1
11    76.4
12    68.1
13    75.2
14    69.8
15    79.4
16    70.8
17    62.7
18    67.3
19    70.6
dtype: float64
In [34]:
print(gdp)
0      1681.613910
1      2155.485231
2     21495.805083
3       562.987685
4     13495.127466
5      9388.688523
6      1424.190562
7     24765.548902
8     27036.487332
9      1945.637549
10    21721.618410
11    13373.219940
12      483.970868
13     9783.984173
14     2253.464111
15    25034.666923
16     3680.916429
17      366.044967
18     1175.926387
19     1132.213880
dtype: float64

Accessing Series elements using indexing.

In [35]:
print(gdp[0:5])
0     1681.613910
1     2155.485231
2    21495.805083
3      562.987685
4    13495.127466
dtype: float64

pandas functions

In [36]:
print(gdp.mean())
print(gdp.max())
print(gdp.min())
9147.879916483502
27036.4873319
366.04496652
In [37]:
print(life_expectancy.mean())
print(life_expectancy.max())
print(life_expectancy.min())
72.86999999999999
83.4
57.6
In [40]:
plt.plot(life_expectancy)
Out[40]:
[<matplotlib.lines.Line2D at 0x1189df940>]
In [42]:
plt.plot(gdp)
Out[42]:
[<matplotlib.lines.Line2D at 0x118b13390>]
In [43]:
life_mean = life_expectancy.mean()
gdp_mean = gdp.mean()
gdp_above = [gdp>gdp_mean]
life_above = [life_expectancy>life_mean]
gdp_below = [gdp<gdp_mean]
life_below = [life_expectancy<life_mean]
In [44]:
relation_above = gdp_above and life_above
relation_below = gdp_below and life_below
In [45]:
print(relation_above)
[0      True
1      True
2      True
3     False
4      True
5      True
6     False
7      True
8      True
9     False
10    False
11     True
12    False
13     True
14    False
15     True
16    False
17    False
18    False
19    False
dtype: bool]
In [46]:
print(relation_below)
[0     False
1     False
2     False
3      True
4     False
5     False
6      True
7     False
8     False
9      True
10     True
11    False
12     True
13    False
14     True
15    False
16     True
17     True
18     True
19     True
dtype: bool]
In [47]:
true_count = 0
false_count =0

for i in range(len(relation_above)):
    if relation_above[i] is False and relation_below[i] is False:
        false_count = false_count + 1
print(false_count)
0
In [48]:
print(gdp_above)
[0     False
1     False
2      True
3     False
4      True
5      True
6     False
7      True
8      True
9     False
10     True
11     True
12    False
13     True
14    False
15     True
16    False
17    False
18    False
19    False
dtype: bool]
In [49]:
print(life_above)
[0      True
1      True
2      True
3     False
4      True
5      True
6     False
7      True
8      True
9     False
10    False
11     True
12    False
13     True
14    False
15     True
16    False
17    False
18    False
19    False
dtype: bool]
In [50]:
true_count = 0
false_count = 0
for i in range(len(life_above[0])):
    if gdp_above[0][i] and life_above[0][i] :
        true_count = true_count +1
    if  not gdp_above[0][i]  and not life_above[0][i]:
         true_count = true_count +1
print(true_count)
print(len(life_above[0])-true_count)
17
3
In [51]:
countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
]

employment_values = [
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
]
In [52]:
employment = pd.Series(employment_values,index=countries)
print(employment)
Afghanistan               55.700001
Albania                   51.400002
Algeria                   50.500000
Angola                    75.699997
Argentina                 58.400002
Armenia                   40.099998
Australia                 61.500000
Austria                   57.099998
Azerbaijan                60.900002
Bahamas                   66.599998
Bahrain                   60.400002
Bangladesh                68.099998
Barbados                  66.900002
Belarus                   53.400002
Belgium                   48.599998
Belize                    56.799999
Benin                     71.599998
Bhutan                    58.400002
Bolivia                   70.400002
Bosnia and Herzegovina    41.200001
dtype: float64

loc vs iloc in pandas series

In [53]:
print(employment.iloc[0])
print(employment.loc["Algeria"])
55.70000076
50.5
In [54]:
print(employment.argmax())
print(employment.loc["Angola"])
Angola
75.69999695

addition when indexes are same

In [55]:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s1+s2)
a    11
b    22
c    33
d    44
dtype: int64

addition when some indexes are not same

In [56]:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'e'])
print(s1+s2)
a    11.0
b    22.0
c    33.0
d     NaN
e     NaN
dtype: float64

addition when all the element indexes are different

In [57]:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
print(s1+s2)
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64

dropna, fillna

In [58]:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'e'])
s3 = s1+s2
print(s3)
s3 = s3.dropna()
print(s3)
a    11.0
b    22.0
c    33.0
d     NaN
e     NaN
dtype: float64
a    11.0
b    22.0
c    33.0
dtype: float64
In [59]:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'e'])
s3 = s1+s2
s3 = s3.fillna(0)
print(s3)
a    11.0
b    22.0
c    33.0
d     0.0
e     0.0
dtype: float64

apply function

In [60]:
s = pd.Series([1, 2, 3, 4, 5])
def add_one(x):
    return x + 1
print (s.apply(add_one))
0    2
1    3
2    4
3    5
4    6
dtype: int64

No comments :

Post a Comment