2D Numpy array¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Subway ridership for 5 stations on 10 different days
# Layout: each inner list is one day (a row) and each position inside it is one
# station (a column), so the resulting array has 10 rows and 5 columns.
ridership = np.array([
[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]
])
Accessing Numpy array:¶
In [2]:
print(ridership[1,4]) # In Python list of lists, we can access using a[i][j]. But, in numpy, it's a[i,j]
In [3]:
print(len(ridership))
In [4]:
print(len(ridership[0]))
In [5]:
print(ridership[1:6,0:4])
vectorized operations¶
In [6]:
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
In [7]:
print(a)
print(b)
In [8]:
print(a+b)
In [9]:
first_col = ridership[:,0:1]
In [10]:
print(first_col)
In [11]:
print(first_col.mean())
In [12]:
print(ridership.mean(axis=0)) #mean for column
In [13]:
print(ridership.mean(axis = 1)) #mean for row
pandas DataFrame¶
In [14]:
student = pd.DataFrame({"name":["Arun","Prakash","Balaji","TR","Chidu"],
"city":["chennai","chennai","trichy","madurai","madurai"],
"age":[25,24,25,26,27]})
In [15]:
print(student)
In [16]:
# Relabel the existing rows rather than re-wrapping the frame in
# pd.DataFrame(data=student, index=[...]). When `data` is already a DataFrame,
# the `index` argument makes pandas *reindex* it, and because none of the
# "studentN" labels exist in the default 0..4 index, every cell comes back NaN.
# set_axis returns a new frame with the same data under the new row labels, and
# re-running this cell gives the same result.
student = student.set_axis(
    ["student1", "student2", "student3", "student4", "student5"],
    axis="index",
)
print(student)
In [17]:
student = {"name":["Arun","Prakash","Balaji","TR","Chidu"],
"city":["chennai","chennai","trichy","madurai","madurai"],
"age":[25,24,25,26,27]}
In [18]:
student = pd.DataFrame(data = student, index=["student1","student2","student3","student4","student5"])
print(student)
In [19]:
student.loc["student1"]
Out[19]:
In [20]:
student.iloc[0]
Out[20]:
In [21]:
student.loc["student1","name"]
Out[21]:
converting pandas dataframe to numpy array¶
In [22]:
print(student.values)
In [23]:
print(type(student.values))
pandas practice¶
In [24]:
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
In [25]:
print(df_2)
In [26]:
df_2 = pd.DataFrame(data = df_2.values, columns=["A","B","C"],index= ["row1","row2"])
In [27]:
print(df_2)
Calculating correlation between variables (Pearson's r method)¶
Earlier, we judged correlation informally, by checking whether the value of one variable tends to increase or decrease as the value of another variable increases. Here, we calculate the correlation between two variables using Pearson's r:
- Standardize the values of each variable, i.e. subtract the mean and divide by the standard deviation (pandas tip: use var.std(ddof=0), the uncorrected standard deviation).
- Multiply each pair of standardized values and then take the average of the products.
- The value of r ranges between -1 and +1.
- If the value is positive, the variables are positively correlated; if it is negative, they are negatively correlated.
In [28]:
subway_df = pd.read_csv("nyc_subway_weather.csv")
In [29]:
print(subway_df.head())
In [30]:
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']
In [31]:
print(entries.head(20))
In [32]:
print(cum_entries.head())
In [33]:
print(rain.head(20))
In [34]:
print(temp.head())
Correlation function¶
In [35]:
def correlation(x, y):
    """Return Pearson's r between two samples of equal length.

    Each input is standardized (mean subtracted, then divided by the
    uncorrected standard deviation, ``ddof=0``); r is the mean of the
    element-wise products of the two standardized samples.

    Parameters
    ----------
    x, y : pandas Series, NumPy array, or plain sequence of numbers
        Objects that already provide ``.mean()`` and ``.std()`` are used as
        they are, so two Series are still multiplied with pandas' normal
        index alignment. Anything else (list, tuple, ...) is converted to a
        float NumPy array first.

    Returns
    -------
    float
        The correlation coefficient. The result is NaN when either input has
        zero standard deviation, because standardizing divides by it.
    """
    # Lists and tuples have no .mean()/.std(); promote them so the arithmetic
    # below works. Series and ndarrays pass through unchanged.
    if not (hasattr(x, "mean") and hasattr(x, "std")):
        x = np.asarray(x, dtype=float)
    if not (hasattr(y, "mean") and hasattr(y, "std")):
        y = np.asarray(y, dtype=float)

    # Step 1: standardize both variables (population standard deviation).
    x_standardized = (x - x.mean()) / x.std(ddof=0)
    y_standardized = (y - y.mean()) / y.std(ddof=0)

    # Step 2: average the pairwise products.
    return (x_standardized * y_standardized).mean()
In [36]:
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))
print(correlation(entries, cum_entries))
Positive correlation example¶
In [37]:
# Scatter rather than line plot: plt.plot joins consecutive rows with line
# segments, which is only meaningful when the rows are ordered along x. For
# showing how two variables move together, one marker per observation is the
# appropriate chart.
plt.xlabel("ENTRIESn (cumulative entries)")
plt.ylabel("ENTRIESn_hourly")
plt.scatter(cum_entries, entries)
Out[37]:
Negative correlation example¶
In [38]:
# Scatter rather than line plot: plt.plot joins consecutive rows with line
# segments, which is only meaningful when the rows are ordered along x. For
# showing how two variables move together, one marker per observation is the
# appropriate chart.
plt.xlabel("meantempi")
plt.ylabel("meanprecipi")
plt.scatter(temp, rain)
Out[38]:
applymap( ) function¶
In [40]:
#Example 1
df = pd.DataFrame({
'a': [1, 2, 3],
'b': [10, 20, 30],
'c': [5, 10, 15]
})
def add_one(x):
    """Return ``x`` increased by one (used as the element-wise callback below)."""
    incremented = x + 1
    return incremented
print(df.applymap(add_one))
In [42]:
#Example 2
grades_df = pd.DataFrame(
data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
def convert_grades(grades):
    """Convert one numerical grade to its letter grade.

    Despite the plural parameter name, this receives a single number: it is
    written to be applied element by element (for example through
    ``DataFrame.applymap``), not to be called on a whole DataFrame.

    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F

    Parameters
    ----------
    grades : int or float
        The numerical grade to convert.

    Returns
    -------
    str
        One of "A", "B", "C", "D" or "F". Any value below 60 maps to "F".
    """
    # Bands are tested from the highest down, so each check only needs its
    # lower bound: reaching a branch already implies the value is below the
    # previous threshold.
    if grades >= 90:
        return "A"
    if grades >= 80:
        return "B"
    if grades >= 70:
        return "C"
    if grades >= 60:
        return "D"
    return "F"
In [43]:
print(grades_df)
In [44]:
print(grades_df.applymap(convert_grades))
apply( ) function¶
In [45]:
df = pd.DataFrame({
'a': [4, 5, 3, 1, 2],
'b': [20, 10, 40, 50, 30],
'c': [25, 20, 5, 15, 10]
})
In [46]:
print(df)
In [51]:
print(df.apply(np.mean))
In [52]:
print(df.apply(np.max))
Now let's try to find the second-largest element in each column of a DataFrame using the apply function.
In [92]:
def second_max(data):
    """Return the second-largest value in ``data``.

    "Second largest" means the value in second place once the values are
    ordered from largest to smallest, so if the maximum occurs more than once
    the result equals the maximum.

    Parameters
    ----------
    data : pandas Series or other one-dimensional sequence
        The values to inspect, e.g. one column handed over by
        ``DataFrame.apply``.

    Returns
    -------
    scalar
        The second-largest element of ``data``.

    Raises
    ------
    ValueError
        If ``data`` holds fewer than two values.
    """
    values = pd.Series(data)
    if len(values) < 2:
        raise ValueError("second_max() needs at least two values")
    # Work on the sorted values only. Looking up the *position* of the maximum
    # and passing it to drop() would treat that position as an index *label*,
    # which is wrong for any non-default index and fails outright when the
    # maximum is duplicated.
    return values.sort_values(ascending=False).iloc[1]
In [93]:
# Step-by-step check of the "drop the maximum, then look at what is left"
# idea, tried on column "a" only.
check = df["a"]
print(check)
print(max(check))
# get_loc on an Index built from the column's *values* gives the position of
# the maximum within the column (a single integer here, since the values in
# "a" are all distinct).
pos = pd.Index(check).get_loc(max(check))
# drop() removes rows by index *label*, not by position. Passing `pos` works
# here only because df was created without an explicit index, so its labels
# are 0..4 and coincide with positions.
aa = check.drop(pos)
# Print the original again to show that drop() returned a new Series and left
# `check` itself unchanged.
print(check)
print(aa)
In [94]:
print(df.apply(second_max))
Adding a series to a dataframe¶
In [95]:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
0: [10, 20, 30, 40],
1: [50, 60, 70, 80],
2: [90, 100, 110, 120],
3: [130, 140, 150, 160]
})
In [96]:
print(df)
In [97]:
print(s)
In [98]:
print(df+s)
In [99]:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
In [102]:
print(df)
print(s)
In [101]:
print(df+s)
In [105]:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10, 20, 30, 40]})
print(df)
print(s)
In [104]:
print(df+s)
In [106]:
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
'a': [10, 20, 30, 40],
'b': [50, 60, 70, 80],
'c': [90, 100, 110, 120],
'd': [130, 140, 150, 160]
})
print(df)
print(s)
In [107]:
print(df+s)
groupby( ) function¶
In [108]:
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
'value': values,
'even': values % 2 == 0,
'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
In [109]:
print(example_df)
In [119]:
print(example_df.groupby("even"))
print(example_df.groupby("even").sum())
In [116]:
print(example_df.groupby("even").groups)
In [117]:
# groupby multiple columns
grouped_data = example_df.groupby(['even', 'above_three'])
print(grouped_data.groups)
Merging multiple pandas Dataframes¶
In [131]:
subway_df = pd.DataFrame({
'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
'R004', 'R004'],
'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
'05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'ENTRIESn': [ 4388333, 4388348, 4389885, 4391507, 4393043, 14656120,
14656174, 14660126, 14664247, 14668301],
'EXITSn': [ 2911002, 2911036, 2912127, 2913223, 2914284, 14451774,
14451851, 14454734, 14457780, 14460818],
'latitude': [ 40.689945, 40.689945, 40.689945, 40.689945, 40.689945,
40.69132 , 40.69132 , 40.69132 , 40.69132 , 40.69132 ],
'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
-73.867135, -73.867135, -73.867135, -73.867135, -73.867135]
})
weather_df = pd.DataFrame({
'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
'05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'latitude': [ 40.689945, 40.69132 , 40.689945, 40.69132 , 40.689945,
40.69132 , 40.689945, 40.69132 , 40.689945, 40.69132 ],
'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
-73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
'pressurei': [ 30.24, 30.24, 30.32, 30.32, 30.14, 30.14, 29.98, 29.98,
30.01, 30.01],
'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'tempi': [ 52. , 52. , 48.9, 48.9, 54. , 54. , 57.2, 57.2, 48.9, 48.9],
'wspdi': [ 8.1, 8.1, 6.9, 6.9, 3.5, 3.5, 15. , 15. , 15. , 15. ]
})
In [132]:
subway_df
Out[132]:
In [133]:
weather_df
Out[133]:
In [134]:
subway_df.merge(weather_df, on = ["DATEn","hour","latitude","longitude"],how = "inner")
Out[134]:
Plotting¶
In [136]:
subway_df = pd.read_csv("nyc_subway_weather.csv")
subway_df.head()
Out[136]:
In [139]:
# Average every numeric column per (latitude, longitude) pair.
# numeric_only=True is required on pandas >= 2.0, where .mean() no longer
# silently drops non-numeric columns and raises TypeError instead.
# NOTE(review): the CSV presumably contains text columns such as UNIT and
# DATEn (the toy subway_df in the merge example has them) — confirm against
# the file.
data_by_location = subway_df.groupby(
    ["latitude", "longitude"], as_index=False
).mean(numeric_only=True)
In [140]:
data_by_location.head()
Out[140]:
In [142]:
plt.scatter(data_by_location["latitude"],data_by_location["longitude"], s=data_by_location["ENTRIESn_hourly"])
Out[142]:
In [145]:
scaled_entries = (data_by_location["ENTRIESn_hourly"]/data_by_location["ENTRIESn_hourly"].std())
In [146]:
plt.scatter(data_by_location["latitude"],data_by_location["longitude"], s=scaled_entries)
Out[146]:
No comments :
Post a Comment