## Friday, January 20, 2017

### 2D Numpy array¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
[   0,    0,    2,    5,    0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[  95,  229,  255,  496,  201],
[   2,    0,    1,   27,    0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]
])


### Accessing Numpy array:¶

In [2]:
print(ridership[1,4])  # In Python list of lists, we can access using a[i][j]. But, in numpy, it's a[i,j]

2539

In [3]:
print(len(ridership))

10

In [4]:
print(len(ridership[0]))

5

In [5]:
print(ridership[1:6,0:4])

[[1478 3877 3674 2328]
[1613 4088 3991 6461]
[1560 3392 3826 4787]
[1608 4802 3932 4477]
[1576 3933 3909 4979]]


### vectorized operations¶

In [6]:
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])

In [7]:
print(a)
print(b)

[[1 2 3]
[4 5 6]
[7 8 9]]
[[1 1 1]
[2 2 2]
[3 3 3]]

In [8]:
print(a+b)

[[ 2  3  4]
[ 6  7  8]
[10 11 12]]

In [9]:
first_col = ridership[:,0:1]

In [10]:
print(first_col)

[[   0]
[1478]
[1613]
[1560]
[1608]
[1576]
[  95]
[   2]
[1438]
[1342]]

In [11]:
print(first_col.mean())

1071.2

In [12]:
print(ridership.mean(axis=0)) #mean for column

[ 1071.2  2814.9  2718.8  3239.9  1868.2]

In [13]:
print(ridership.mean(axis = 1)) #mean for row

[  1.40000000e+00   2.77920000e+03   3.76880000e+03   3.23560000e+03
3.50480000e+03   3.41640000e+03   2.55200000e+02   6.00000000e+00
3.04020000e+03   3.41840000e+03]


### pandas DataFrame¶

In [14]:
student = pd.DataFrame({"name":["Arun","Prakash","Balaji","TR","Chidu"],
"age":[25,24,25,26,27]})

In [15]:
print(student)

   age     city     name
0   25  chennai     Arun
1   24  chennai  Prakash
2   25   trichy   Balaji

In [16]:
# `student` is already a DataFrame with the default integer index at this
# point, so handing pandas a new index makes it look those labels up in the
# existing frame. None of them exist there, which is why every cell is NaN.
student_labels = ["student1", "student2", "student3", "student4", "student5"]
student = pd.DataFrame(student, index=student_labels)
print(student)

          age city name
student1  NaN  NaN  NaN
student2  NaN  NaN  NaN
student3  NaN  NaN  NaN
student4  NaN  NaN  NaN
student5  NaN  NaN  NaN

In [17]:
student = {"name":["Arun","Prakash","Balaji","TR","Chidu"],
"age":[25,24,25,26,27]}

In [18]:
student = pd.DataFrame(data = student, index=["student1","student2","student3","student4","student5"])
print(student)

          age     city     name
student1   25  chennai     Arun
student2   24  chennai  Prakash
student3   25   trichy   Balaji

In [19]:
student.loc["student1"]

Out[19]:
age          25
city    chennai
name       Arun
Name: student1, dtype: object
In [20]:
student.iloc[0]

Out[20]:
age          25
city    chennai
name       Arun
Name: student1, dtype: object
In [21]:
student.loc["student1","name"]

Out[21]:
'Arun'

### converting pandas dataframe to numpy array¶

In [22]:
print(student.values)

[[25 'chennai' 'Arun']
[24 'chennai' 'Prakash']
[25 'trichy' 'Balaji']

In [23]:
print(type(student.values))

<class 'numpy.ndarray'>


### pandas practice¶

In [24]:
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])

In [25]:
print(df_2)

   A  B  C
0  0  1  2
1  3  4  5

In [26]:
df_2 = pd.DataFrame(data = df_2.values, columns=["A","B","C"],index= ["row1","row2"])

In [27]:
print(df_2)

      A  B  C
row1  0  1  2
row2  3  4  5


### Calculating correlation between variables (Pearson's r method)¶

Earlier, we judged correlation informally, by checking whether one variable tends to increase or decrease as the other one does. Here, we calculate the correlation between two variables using Pearson's r.
• Standardize each variable: subtract its mean and divide by its standard deviation (pandas tip: use var.std(ddof=0), the uncorrected standard deviation).
• Multiply each pair of standardized values, then take the average of the products.
• The value of r ranges between -1 and +1.
• If r is positive, the variables are positively correlated; if r is negative, they are negatively correlated.
In [28]:
subway_df = pd.read_csv("nyc_subway_weather.csv")

In [29]:
print(subway_df.head())

   UNIT     DATEn     TIMEn  ENTRIESn   EXITSn  ENTRIESn_hourly  \
0  R003  05-01-11  00:00:00   4388333  2911002              0.0
1  R003  05-01-11  04:00:00   4388333  2911002              0.0
2  R003  05-01-11  12:00:00   4388333  2911002              0.0
3  R003  05-01-11  16:00:00   4388333  2911002              0.0
4  R003  05-01-11  20:00:00   4388333  2911002              0.0

EXITSn_hourly             datetime  hour  day_week     ...       pressurei  \
0            0.0  2011-05-01 00:00:00     0         6     ...           30.22
1            0.0  2011-05-01 04:00:00     4         6     ...           30.25
2            0.0  2011-05-01 12:00:00    12         6     ...           30.28
3            0.0  2011-05-01 16:00:00    16         6     ...           30.26
4            0.0  2011-05-01 20:00:00    20         6     ...           30.28

rain  tempi  wspdi meanprecipi  meanpressurei  meantempi  meanwspdi  \
0    0   55.9    3.5         0.0         30.258      55.98       7.86
1    0   52.0    3.5         0.0         30.258      55.98       7.86
2    0   62.1    6.9         0.0         30.258      55.98       7.86
3    0   57.9   15.0         0.0         30.258      55.98       7.86
4    0   52.0   10.4         0.0         30.258      55.98       7.86

weather_lat  weather_lon
0    40.700348   -73.887177
1    40.700348   -73.887177
2    40.700348   -73.887177
3    40.700348   -73.887177
4    40.700348   -73.887177

[5 rows x 27 columns]

In [30]:
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

In [31]:
print(entries.head(20))

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
5      15.0
6      19.0
7     488.0
8     490.0
9     231.0
10    235.0
11     74.0
12     20.0
13    975.0
14    267.0
15    277.0
16     83.0
17     24.0
18    532.0
19    454.0
Name: ENTRIESn_hourly, dtype: float64

In [32]:
print(cum_entries.head())

0    4388333
1    4388333
2    4388333
3    4388333
4    4388333
Name: ENTRIESn, dtype: int64

In [33]:
print(rain.head(20))

0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.00
10    0.00
11    0.00
12    0.00
13    0.00
14    0.00
15    0.00
16    0.01
17    0.01
18    0.01
19    0.01
Name: meanprecipi, dtype: float64

In [34]:
print(temp.head())

0    55.98
1    55.98
2    55.98
3    55.98
4    55.98
Name: meantempi, dtype: float64


### Correlation function¶

In [35]:
def correlation(x, y):
    """Return Pearson's r between two paired sets of numeric observations.

    Parameters
    ----------
    x, y : pandas.Series or numpy.ndarray
        Paired observations. Any object supporting ``.mean()``,
        ``.std(ddof=0)`` and element-wise arithmetic works.

    Returns
    -------
    float
        The correlation coefficient, between -1 and +1. If either input is
        constant its standard deviation is zero and the result is NaN.

    Notes
    -----
    pandas aligns Series on their index labels when multiplying, so two
    Series should share the same index for the result to be meaningful.
    """
    # Step 1: standardize each variable. ddof=0 requests the uncorrected
    # (population) standard deviation; pandas would otherwise use ddof=1.
    x_standardized = (x - x.mean()) / x.std(ddof=0)
    y_standardized = (y - y.mean()) / y.std(ddof=0)
    # Step 2: r is the average of the pairwise products.
    return (x_standardized * y_standardized).mean()

In [36]:
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))
print(correlation(entries, cum_entries))

0.03564851577223041
-0.026693348321569912
-0.22903432340833663
0.5858954707662182


### Positive correlation example¶

In [37]:
plt.plot(cum_entries,entries)

Out[37]:
[<matplotlib.lines.Line2D at 0x11b22c860>]

### Negative correlation example¶

In [38]:
plt.plot(temp,rain)

Out[38]:
[<matplotlib.lines.Line2D at 0x11bbc46d8>]

### applymap( ) function¶

In [40]:
#Example 1
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [10, 20, 30],
    'c': [5, 10, 15]
})


def add_one(x):
    """Return ``x`` incremented by one (applied to a single element)."""
    return x + 1


# applymap calls add_one once per cell and builds a DataFrame of the same
# shape from the results. (pandas 2.1+ offers the same operation under the
# name DataFrame.map.)
print(df.applymap(add_one))


   a   b   c
0  2  11   6
1  3  21  11
2  4  31  16

In [42]:
#Example 2
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)


def convert_grades(grade):
    '''
    Convert a single numerical grade to its letter grade.

    Intended to be applied element-wise to a DataFrame of numerical
    grades (e.g. with applymap).

    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
    '''
    # Thresholds are checked from the top down, so each branch only needs
    # the lower bound of its band.
    if grade >= 90:
        return 'A'
    elif grade >= 80:
        return 'B'
    elif grade >= 70:
        return 'C'
    elif grade >= 60:
        return 'D'
    else:
        return 'F'

In [43]:
print(grades_df)

         exam1  exam2
Andre       43     24
Barry       81     63
Chris       78     56
Dan         75     56
Emilio      89     67
Fred        70     51
Greta       91     79
Humbert     65     46
Ivan        98     72
James       87     60

In [44]:
print(grades_df.applymap(convert_grades))

        exam1 exam2
Andre       F     F
Barry       B     D
Chris       C     F
Dan         C     F
Emilio      B     D
Fred        C     F
Greta       A     C
Humbert     D     F
Ivan        A     C
James       B     D


### apply( ) function¶

In [45]:
df = pd.DataFrame({
'a': [4, 5, 3, 1, 2],
'b': [20, 10, 40, 50, 30],
'c': [25, 20, 5, 15, 10]
})

In [46]:
print(df)

   a   b   c
0  4  20  25
1  5  10  20
2  3  40   5
3  1  50  15
4  2  30  10

In [51]:
print(df.apply(np.mean))

a     3.0
b    30.0
c    15.0
dtype: float64

In [52]:
print(df.apply(np.max))

a     5
b    50
c    25
dtype: int64

Now let's find the second-largest element in each column of a DataFrame using the apply function.
In [92]:
def second_max(data):
    """Return the second-largest value in ``data``.

    Suitable for ``DataFrame.apply``, which passes each column in as a Series.

    Parameters
    ----------
    data : pandas.Series or array-like
        Values to search. The index labels are irrelevant, so the function
        works for any index, not only the default 0..n-1 one.

    Returns
    -------
    scalar
        The value ranked second when the data is sorted in descending order.
        Repeated values count separately, so for ``[5, 5, 3]`` the result
        is ``5``.

    Raises
    ------
    ValueError
        If ``data`` holds fewer than two values.
    """
    values = pd.Series(data)
    if len(values) < 2:
        raise ValueError("second_max() needs at least two values")
    # Sort and pick by position. Looking up the position of the maximum and
    # then dropping it by label only works when labels equal positions and
    # the maximum is unique.
    return values.sort_values(ascending=False).iloc[1]

In [93]:
check =  df["a"]
print(check)
print(max(check))
pos = pd.Index(check).get_loc(max(check))
aa = check.drop(pos)
print(check)
print(aa)

0    4
1    5
2    3
3    1
4    2
Name: a, dtype: int64
5
0    4
1    5
2    3
3    1
4    2
Name: a, dtype: int64
0    4
2    3
3    1
4    2
Name: a, dtype: int64

In [94]:
print(df.apply(second_max))

a     4
b    40
c    20
dtype: int64


### Adding a series to a dataframe¶

In [95]:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
0: [10, 20, 30, 40],
1: [50, 60, 70, 80],
2: [90, 100, 110, 120],
3: [130, 140, 150, 160]
})

In [96]:
print(df)

    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

In [97]:
print(s)

0    1
1    2
2    3
3    4
dtype: int64

In [98]:
print(df+s)

    0   1    2    3
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164

In [99]:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})

In [102]:
print(df)
print(s)

    0   1   2   3
0  10  20  30  40
0    1
1    2
2    3
3    4
dtype: int64

In [101]:
print(df+s)

    0   1   2   3
0  11  22  33  44

In [105]:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10, 20, 30, 40]})
print(df)
print(s)

    0
0  10
1  20
2  30
3  40
0    1
1    2
2    3
3    4
dtype: int64

In [104]:
print(df+s)

    0   1   2   3
0  11 NaN NaN NaN
1  21 NaN NaN NaN
2  31 NaN NaN NaN
3  41 NaN NaN NaN

In [106]:
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
'a': [10, 20, 30, 40],
'b': [50, 60, 70, 80],
'c': [90, 100, 110, 120],
'd': [130, 140, 150, 160]
})
print(df)
print(s)

    a   b    c    d
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160
a    1
b    2
c    3
d    4
dtype: int64

In [107]:
print(df+s)

    a   b    c    d
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164


### groupby( ) function¶

In [108]:
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
'value': values,
'even': values % 2 == 0,
'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

In [109]:
print(example_df)

  above_three   even  value
a       False  False      1
b       False  False      3
c       False   True      2
d        True   True      4
e       False  False      1
f        True   True      6
g        True   True      4

In [119]:
print(example_df.groupby("even"))
print(example_df.groupby("even").sum())

<pandas.core.groupby.DataFrameGroupBy object at 0x11bd642e8>
above_three  value
even
False          0.0      5
True           3.0     16

In [116]:
print(example_df.groupby("even").groups)

{False: ['a', 'b', 'e'], True: ['c', 'd', 'f', 'g']}

In [117]:
# groupby multiple columns

grouped_data = example_df.groupby(['even', 'above_three'])
print(grouped_data.groups)

{(True, False): ['c'], (False, False): ['a', 'b', 'e'], (True, True): ['d', 'f', 'g']}


### Merging multiple pandas Dataframes¶

In [131]:
subway_df = pd.DataFrame({
'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
'R004', 'R004'],
'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
'05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'ENTRIESn': [ 4388333,  4388348,  4389885,  4391507,  4393043, 14656120,
14656174, 14660126, 14664247, 14668301],
'EXITSn': [ 2911002,  2911036,  2912127,  2913223,  2914284, 14451774,
14451851, 14454734, 14457780, 14460818],
'latitude': [ 40.689945,  40.689945,  40.689945,  40.689945,  40.689945,
40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ],
'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
-73.867135, -73.867135, -73.867135, -73.867135, -73.867135]
})

weather_df = pd.DataFrame({
'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
'05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'latitude': [ 40.689945,  40.69132 ,  40.689945,  40.69132 ,  40.689945,
40.69132 ,  40.689945,  40.69132 ,  40.689945,  40.69132 ],
'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
-73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
'pressurei': [ 30.24,  30.24,  30.32,  30.32,  30.14,  30.14,  29.98,  29.98,
30.01,  30.01],
'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'tempi': [ 52. ,  52. ,  48.9,  48.9,  54. ,  54. ,  57.2,  57.2,  48.9,  48.9],
'wspdi': [  8.1,   8.1,   6.9,   6.9,   3.5,   3.5,  15. ,  15. ,  15. ,  15. ]
})

In [132]:
subway_df

Out[132]:
DATEn ENTRIESn EXITSn UNIT hour latitude longitude
0 05-01-11 4388333 2911002 R003 0 40.689945 -73.872564
1 05-02-11 4388348 2911036 R003 0 40.689945 -73.872564
2 05-03-11 4389885 2912127 R003 0 40.689945 -73.872564
3 05-04-11 4391507 2913223 R003 0 40.689945 -73.872564
4 05-05-11 4393043 2914284 R003 0 40.689945 -73.872564
5 05-01-11 14656120 14451774 R004 0 40.691320 -73.867135
6 05-02-11 14656174 14451851 R004 0 40.691320 -73.867135
7 05-03-11 14660126 14454734 R004 0 40.691320 -73.867135
8 05-04-11 14664247 14457780 R004 0 40.691320 -73.867135
9 05-05-11 14668301 14460818 R004 0 40.691320 -73.867135
In [133]:
weather_df

Out[133]:
DATEn fog hour latitude longitude pressurei rain tempi wspdi
0 05-01-11 0 0 40.689945 -73.872564 30.24 0 52.0 8.1
1 05-01-11 0 0 40.691320 -73.867135 30.24 0 52.0 8.1
2 05-02-11 0 0 40.689945 -73.872564 30.32 0 48.9 6.9
3 05-02-11 0 0 40.691320 -73.867135 30.32 0 48.9 6.9
4 05-03-11 0 0 40.689945 -73.872564 30.14 0 54.0 3.5
5 05-03-11 0 0 40.691320 -73.867135 30.14 0 54.0 3.5
6 05-04-11 0 0 40.689945 -73.872564 29.98 0 57.2 15.0
7 05-04-11 0 0 40.691320 -73.867135 29.98 0 57.2 15.0
8 05-05-11 0 0 40.689945 -73.872564 30.01 0 48.9 15.0
9 05-05-11 0 0 40.691320 -73.867135 30.01 0 48.9 15.0
In [134]:
subway_df.merge(weather_df, on = ["DATEn","hour","latitude","longitude"],how = "inner")

Out[134]:
DATEn ENTRIESn EXITSn UNIT hour latitude longitude fog pressurei rain tempi wspdi
0 05-01-11 4388333 2911002 R003 0 40.689945 -73.872564 0 30.24 0 52.0 8.1
1 05-02-11 4388348 2911036 R003 0 40.689945 -73.872564 0 30.32 0 48.9 6.9
2 05-03-11 4389885 2912127 R003 0 40.689945 -73.872564 0 30.14 0 54.0 3.5
3 05-04-11 4391507 2913223 R003 0 40.689945 -73.872564 0 29.98 0 57.2 15.0
4 05-05-11 4393043 2914284 R003 0 40.689945 -73.872564 0 30.01 0 48.9 15.0
5 05-01-11 14656120 14451774 R004 0 40.691320 -73.867135 0 30.24 0 52.0 8.1
6 05-02-11 14656174 14451851 R004 0 40.691320 -73.867135 0 30.32 0 48.9 6.9
7 05-03-11 14660126 14454734 R004 0 40.691320 -73.867135 0 30.14 0 54.0 3.5
8 05-04-11 14664247 14457780 R004 0 40.691320 -73.867135 0 29.98 0 57.2 15.0
9 05-05-11 14668301 14460818 R004 0 40.691320 -73.867135 0 30.01 0 48.9 15.0

### Plotting¶

In [136]:
subway_df = pd.read_csv("nyc_subway_weather.csv")

Out[136]:
UNIT DATEn TIMEn ENTRIESn EXITSn ENTRIESn_hourly EXITSn_hourly datetime hour day_week ... pressurei rain tempi wspdi meanprecipi meanpressurei meantempi meanwspdi weather_lat weather_lon
0 R003 05-01-11 00:00:00 4388333 2911002 0.0 0.0 2011-05-01 00:00:00 0 6 ... 30.22 0 55.9 3.5 0.0 30.258 55.98 7.86 40.700348 -73.887177
1 R003 05-01-11 04:00:00 4388333 2911002 0.0 0.0 2011-05-01 04:00:00 4 6 ... 30.25 0 52.0 3.5 0.0 30.258 55.98 7.86 40.700348 -73.887177
2 R003 05-01-11 12:00:00 4388333 2911002 0.0 0.0 2011-05-01 12:00:00 12 6 ... 30.28 0 62.1 6.9 0.0 30.258 55.98 7.86 40.700348 -73.887177
3 R003 05-01-11 16:00:00 4388333 2911002 0.0 0.0 2011-05-01 16:00:00 16 6 ... 30.26 0 57.9 15.0 0.0 30.258 55.98 7.86 40.700348 -73.887177
4 R003 05-01-11 20:00:00 4388333 2911002 0.0 0.0 2011-05-01 20:00:00 20 6 ... 30.28 0 52.0 10.4 0.0 30.258 55.98 7.86 40.700348 -73.887177
5 rows × 27 columns
In [139]:
# Average every numeric column per station location. numeric_only=True leaves
# out the string columns (UNIT, DATEn, TIMEn, ...), which recent pandas
# versions refuse to average instead of silently dropping.
data_by_location = (
    subway_df
    .groupby(["latitude", "longitude"], as_index=False)
    .mean(numeric_only=True)
)

In [140]:
data_by_location.head()

Out[140]:
latitude longitude ENTRIESn EXITSn ENTRIESn_hourly EXITSn_hourly hour day_week weekday fog ... pressurei rain tempi wspdi meanprecipi meanpressurei meantempi meanwspdi weather_lat weather_lon
0 40.576152 -73.975925 9.659049e+06 8.641132e+06 403.896175 325.956284 10.032787 2.907104 0.715847 0.010929 ... 29.972568 0.229508 63.383607 5.553005 0.006284 29.972568 63.383607 5.553005 40.603489 -73.958763
1 40.576298 -73.968523 8.306897e+06 6.646823e+06 526.697297 419.562162 9.989189 2.951351 0.708108 0.010811 ... 29.973297 0.227027 63.375135 5.517838 0.006216 29.973297 63.375135 5.517838 40.603489 -73.958763
2 40.577961 -73.961806 4.552910e+07 4.612408e+07 1950.295699 1930.483871 10.000000 2.935484 0.709677 0.010753 ... 29.973118 0.225806 63.394086 5.531720 0.006183 29.973118 63.394086 5.531720 40.603489 -73.958763
3 40.589547 -73.974295 7.268214e+06 7.961334e+06 485.382353 362.941176 10.164706 2.905882 0.705882 0.011765 ... 29.971176 0.200000 63.650588 5.630588 0.006118 29.971176 63.650588 5.630588 40.603489 -73.958763
4 40.590867 -73.797011 6.477945e+06 5.994957e+06 500.725610 374.628049 10.097561 2.951220 0.719512 0.024390 ... 29.981098 0.195122 61.721341 9.945122 0.002744 29.981098 61.721341 9.945122 40.660004 -73.844849
5 rows × 21 columns
In [142]:
plt.scatter(data_by_location["latitude"],data_by_location["longitude"], s=data_by_location["ENTRIESn_hourly"])

Out[142]:
<matplotlib.collections.PathCollection at 0x12410de48>
In [145]:
scaled_entries = (data_by_location["ENTRIESn_hourly"]/data_by_location["ENTRIESn_hourly"].std())

In [146]:
plt.scatter(data_by_location["latitude"],data_by_location["longitude"], s=scaled_entries)

Out[146]:
<matplotlib.collections.PathCollection at 0x129498898>