In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
In [2]:
data = pd.read_csv("melb_data.csv")
In [3]:
data.shape
Out[3]:
In [4]:
data.head()
Out[4]:
In [5]:
data.info()
In [6]:
to_drop = ["BuildingArea","YearBuilt","CouncilArea", "Suburb", "Address", "SellerG","Date" ]
In [7]:
X = data.drop(to_drop, axis=1)
In [8]:
X.info()
fastai
In [9]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
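Note that `fastai.structured` and `fastai.column_data` belong to the old fastai 0.7 API; in fastai v1 and later this functionality moved into `fastai.tabular`, so the notebook needs the 0.7 release to run as written.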
In [10]:
float_d = []
In [11]:
for c in X.columns[X.dtypes=='float']:
    print(c)
    float_d.append(c)
In [12]:
for c in X.columns[X.dtypes=='object']:
    print(c)
    print(X[c].value_counts())
In [13]:
cat_vars = ["Type","Method","Regionname"]
In [14]:
contin_vars = [
'Distance',
'Postcode',
'Bedroom2',
'Bathroom',
'Car',
'Landsize',
'Lattitude',
'Longtitude',
'Propertycount',
'Rooms']
In [15]:
for v in cat_vars:
    X[v] = X[v].astype('category').cat.as_ordered()
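`as_ordered()` fixes a stable ordering on each column's categories so that their integer codes are reproducible. As a quick sanity check (a minimal sketch; the exact category values depend on your copy of the dataset):

for v in cat_vars:
    print(v, X[v].cat.categories.tolist())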
In [16]:
# apply_cats(X_test, X_train)  # would copy X_train's category mappings onto a held-out X_test
In [70]:
df, y, nas, mapper = proc_df(X, 'Price', do_scale=True)
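`proc_df` splits the `Price` column off as `y`, replaces each categorical column with its integer codes (shifted by one so that 0 can stand for missing), median-fills continuous columns that contain NAs while recording them in `nas` and adding boolean `_na` indicator columns, and, because `do_scale=True`, standardizes the continuous columns, returning the fitted `mapper` so the identical scaling can later be applied to a test set. A rough hand-rolled equivalent, just to make the transformation concrete (a sketch, not fastai's actual implementation):

y_manual = X['Price'].values
df_manual = X.drop('Price', axis=1).copy()
for c in cat_vars:
    df_manual[c] = df_manual[c].cat.codes + 1               # 0 is reserved for missing
for c in contin_vars:
    if df_manual[c].isnull().any():
        df_manual[c + '_na'] = df_manual[c].isnull()        # NA indicator, as recorded in `nas`
        df_manual[c] = df_manual[c].fillna(df_manual[c].median())
    df_manual[c] = (df_manual[c] - df_manual[c].mean()) / df_manual[c].std()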
In [71]:
df.shape
Out[71]:
In [72]:
len(y)
Out[72]:
In [73]:
y
Out[73]:
In [74]:
nas
Out[74]:
In [75]:
mapper
Out[75]:
In [22]:
train_ratio = 0.75
samp_size = len(X)
train_size = int(samp_size * train_ratio); train_size
Out[22]:
In [23]:
val_idx = list(range(train_size, len(df)))   # contiguous holdout: the last 25% of rows
In [24]:
val_idx = get_cv_idxs(train_size)   # overwrites the contiguous holdout with a random sample
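`get_cv_idxs(n)` draws a seeded random 20% of the indices `0..n-1`; because `train_size` is passed rather than `len(df)`, the validation rows are sampled from the first 75% of the frame only. Roughly what it does internally (a sketch, assuming the fastai 0.7 defaults `val_pct=0.2` and `seed=42`):

np.random.seed(42)
val_idx_manual = np.random.permutation(train_size)[:int(0.2 * train_size)]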
In [25]:
val_idx
Out[25]:
In [26]:
PATH = "data"
In [27]:
# the target is logged so the network effectively optimizes RMSE of log(price)
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, np.log(y.astype(np.float32)), cat_flds=cat_vars, bs=128)
In [28]:
cat_sz = [(c, len(X[c].cat.categories)+1) for c in cat_vars]   # cardinality of each categorical field, +1 for a missing-value slot
In [29]:
cat_sz
Out[29]:
In [30]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]   # (cardinality, width): roughly half the cardinality, capped at 50
In [31]:
emb_szs
Out[31]:
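To make the heuristic concrete: in the standard Melbourne snapshot `Type` has 3 categories, so `cat_sz` records 4 (one slot reserved for missing) and the embedding gets `min(50, (4+1)//2) = 2` dimensions, while `Regionname` with 8 categories gets `min(50, (9+1)//2) = 5`. (Category counts may differ in other versions of the file.)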
In [32]:
len(df.columns)-len(cat_vars)
Out[32]:
In [33]:
len(contin_vars)
Out[33]:
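The two counts can differ because `proc_df` appends `_na` indicator columns for continuous fields with missing values; here that is presumably `Car_na`, since `Car` is the only remaining column with NAs in the standard snapshot. Those indicators are fed to the model as extra continuous inputs.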
In [35]:
m = md.get_learner(emb_szs,                        # embedding sizes for the categorical fields
                   len(df.columns)-len(cat_vars),  # number of continuous input columns
                   0.04,                           # dropout on the embedding layer
                   1,                              # output size: a single regression target
                   [16,8],                         # hidden layer sizes
                   [0.01,0.01])                    # dropout after each hidden layer
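Under the hood the learner wraps a network that embeds each categorical field, concatenates the embedding vectors with the scaled continuous inputs, and runs the result through the fully connected layers. A stripped-down sketch of that architecture in plain PyTorch (an illustration of the idea, not fastai's exact `MixedInputModel`, and omitting dropout and batch norm):

import torch
import torch.nn as nn

class TabularNet(nn.Module):
    def __init__(self, emb_szs, n_cont):
        super().__init__()
        # one embedding table per categorical field: (cardinality, width) pairs
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c, s in emb_szs])
        n_emb = sum(s for _, s in emb_szs)
        self.layers = nn.Sequential(
            nn.Linear(n_emb + n_cont, 16), nn.ReLU(),
            nn.Linear(16, 8), nn.ReLU(),
            nn.Linear(8, 1))                        # single regression output

    def forward(self, x_cat, x_cont):
        # look up one embedding per categorical column, then join with the continuous block
        emb = torch.cat([e(x_cat[:, i]) for i, e in enumerate(self.embs)], dim=1)
        return self.layers(torch.cat([emb, x_cont], dim=1))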
In [36]:
m.lr_find()
In [37]:
m.sched.plot()
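`lr_find` trains on a few mini-batches while exponentially increasing the learning rate and records the loss; the usual reading of the resulting plot is to pick a rate around an order of magnitude below the point where the loss bottoms out, which is presumably how the `lr = 0.1` below was chosen.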
In [38]:
lr = 0.1
In [39]:
m.fit(lr, 3)
Out[39]:
In [40]:
m.fit(lr, 3, cycle_len=3, cycle_mult=3)
Out[40]:
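With `cycle_len=3` and `cycle_mult=3`, this runs SGDR for three restart cycles of 3, 9, and 27 epochs (39 in total), cosine-annealing the learning rate from `lr` down toward zero within each cycle.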
In [41]:
y
Out[41]:
In [42]:
preds_val, targs_val = m.predict_with_targs()   # renamed from x, y so the full target vector y from proc_df is not clobbered
In [43]:
preds_val
Out[43]:
In [44]:
targs_val
Out[44]:
In [45]:
plt.scatter(preds_val, targs_val)
Out[45]:
In [46]:
from sklearn.metrics import r2_score
In [47]:
r2_score(targs_val, preds_val)   # y_true first, then y_pred
Out[47]:
In [48]:
np.max(np.exp(targs_val))   # undo the log transform: the most expensive validation target in dollars
Out[48]:
Random Forest
In [53]:
from sklearn.ensemble import RandomForestRegressor
In [58]:
val_idx
Out[58]:
In [67]:
from sklearn.model_selection import train_test_split
In [69]:
len(y)
Out[69]:
In [80]:
X_train, X_val, y_train, y_val = train_test_split(df, np.log(y))   # default 25% random holdout, matching train_ratio above
In [81]:
m = RandomForestRegressor(n_estimators=40, max_features=0.99, min_samples_leaf=2,
                          n_jobs=-1, oob_score=True)
m.fit(X_train, y_train);
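With `oob_score=True`, each tree is scored on the rows left out of its bootstrap sample, so `oob_score_` gives a validation-style R² without touching the holdout set; it should land close to the explicit validation score below.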
In [82]:
preds = m.predict(X_val)
m.score(X_train, y_train), m.score(X_val, y_val), m.oob_score_   # train R², validation R², out-of-bag R²
Out[82]:
In [83]:
from sklearn.metrics import mean_squared_error
In [85]:
np.sqrt(mean_squared_error(y_val, preds))   # RMSE in log space, i.e. RMSLE on raw prices
Out[85]: