Thursday, June 7, 2018

FAST AI Structured


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
In [2]:
data = pd.read_csv("melb_data.csv")
In [3]:
data.shape
Out[3]:
(13580, 21)
In [4]:
data.head()
Out[4]:
Suburb Address Rooms Type Price Method SellerG Date Distance Postcode ... Bathroom Car Landsize BuildingArea YearBuilt CouncilArea Lattitude Longtitude Regionname Propertycount
0 Abbotsford 85 Turner St 2 h 1480000.0 S Biggin 3/12/2016 2.5 3067.0 ... 1.0 1.0 202.0 NaN NaN Yarra -37.7996 144.9984 Northern Metropolitan 4019.0
1 Abbotsford 25 Bloomburg St 2 h 1035000.0 S Biggin 4/02/2016 2.5 3067.0 ... 1.0 0.0 156.0 79.0 1900.0 Yarra -37.8079 144.9934 Northern Metropolitan 4019.0
2 Abbotsford 5 Charles St 3 h 1465000.0 SP Biggin 4/03/2017 2.5 3067.0 ... 2.0 0.0 134.0 150.0 1900.0 Yarra -37.8093 144.9944 Northern Metropolitan 4019.0
3 Abbotsford 40 Federation La 3 h 850000.0 PI Biggin 4/03/2017 2.5 3067.0 ... 2.0 1.0 94.0 NaN NaN Yarra -37.7969 144.9969 Northern Metropolitan 4019.0
4 Abbotsford 55a Park St 4 h 1600000.0 VB Nelson 4/06/2016 2.5 3067.0 ... 1.0 2.0 120.0 142.0 2014.0 Yarra -37.8072 144.9941 Northern Metropolitan 4019.0
5 rows × 21 columns
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
Suburb           13580 non-null object
Address          13580 non-null object
Rooms            13580 non-null int64
Type             13580 non-null object
Price            13580 non-null float64
Method           13580 non-null object
SellerG          13580 non-null object
Date             13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null float64
Bedroom2         13580 non-null float64
Bathroom         13580 non-null float64
Car              13518 non-null float64
Landsize         13580 non-null float64
BuildingArea     7130 non-null float64
YearBuilt        8205 non-null float64
CouncilArea      12211 non-null object
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null float64
dtypes: float64(12), int64(1), object(8)
memory usage: 2.2+ MB
In [6]:
to_drop = ["BuildingArea","YearBuilt","CouncilArea", "Suburb", "Address", "SellerG","Date" ]
In [7]:
X = data.drop(to_drop, axis=1)
In [8]:
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 14 columns):
Rooms            13580 non-null int64
Type             13580 non-null object
Price            13580 non-null float64
Method           13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null float64
Bedroom2         13580 non-null float64
Bathroom         13580 non-null float64
Car              13518 non-null float64
Landsize         13580 non-null float64
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null float64
dtypes: float64(10), int64(1), object(3)
memory usage: 1.5+ MB

FAST AI

In [9]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
In [10]:
float_d = []
In [11]:
for c in X.columns[X.dtypes=='float']:
    print(c)
    float_d.append(c)
Price
Distance
Postcode
Bedroom2
Bathroom
Car
Landsize
Lattitude
Longtitude
Propertycount
In [12]:
for c in X.columns[X.dtypes=='object']:
    print(c)
    print(X[c].value_counts())
Type
h    9449
u    3017
t    1114
Name: Type, dtype: int64
Method
S     9022
SP    1703
PI    1564
VB    1199
SA      92
Name: Method, dtype: int64
Regionname
Southern Metropolitan         4695
Northern Metropolitan         3890
Western Metropolitan          2948
Eastern Metropolitan          1471
South-Eastern Metropolitan     450
Eastern Victoria                53
Northern Victoria               41
Western Victoria                32
Name: Regionname, dtype: int64
In [13]:
cat_vars = ["Type","Method","Regionname"]
In [14]:
contin_vars = [
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount',
 'Rooms']
In [15]:
for v in cat_vars:
    X[v] = X[v].astype('category').cat.as_ordered()
In [16]:
#apply_cats(X_test,X_train)
In [70]:
df, y, nas, mapper = proc_df(X, 'Price', do_scale=True)
In [71]:
df.shape
Out[71]:
(13580, 14)
In [72]:
len(y)
Out[72]:
13580
In [73]:
y
Out[73]:
array([ 1480000.,  1035000.,  1465000.,   850000.,  1600000.,   941000.,  1876000.,  1636000.,   300000.,
        1097000.,   700000.,  1350000.,   750000.,  1172500.,   441000.,  1310000.,  1200000.,  1176500.,
         955000.,   890000., ...,  1400000.,  1288000.,  1450000.,  1271000.,   540000.,  1263500.,  1250000.,
        1316000.,   951000.,  1323000.,   970000.,  1330000.,   650000.,   635000.,   582000.,  1245000.,
        1031000.,  1170000.,  2500000.,  1285000.])
In [74]:
nas
Out[74]:
{'Car': 2.0}
In [75]:
mapper
Out[75]:
DataFrameMapper(default=False, df_out=False,
        features=[(['Rooms'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['Distance'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['Postcode'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['Bedroom2'], StandardScaler(copy=True, with_mean=True, with_std=True)..._mean=True, with_std=True)), (['Car_na'], StandardScaler(copy=True, with_mean=True, with_std=True))],
        input_df=False, sparse=False)
In [22]:
train_ratio = 0.75
samp_size = len(X)
train_size = int(samp_size * train_ratio); train_size
Out[22]:
10185
In [23]:
val_idx = list(range(train_size, len(df)))
In [24]:
val_idx = get_cv_idxs(train_size)
In [25]:
val_idx
Out[25]:
array([ 3898,  6111,  4123,   718,  3132,  6206,  4374,  8185,  1817,   582,  7717,  3445,  1175,  8849,
          33,   568,  9455,  3007,  4352,  7770, ...,  9458,  4786,  1400,  4224,  8535,  7772,  9950,  7800,
        2377,  8995,  2088,  7382, 10112,  7615,  9406,  3078, 10159,  5736,  4191,  3717])
In [26]:
PATH = "data"
In [27]:
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, np.log(y.astype(np.float32)), cat_flds=cat_vars, bs=128)
In [28]:
cat_sz = [(c, len(X[c].cat.categories)+1) for c in cat_vars]
In [29]:
cat_sz
Out[29]:
[('Type', 4), ('Method', 6), ('Regionname', 9)]
In [30]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
In [31]:
emb_szs
Out[31]:
[(4, 2), (6, 3), (9, 5)]
In [32]:
len(df.columns)-len(cat_vars)
Out[32]:
11
In [33]:
len(contin_vars)
Out[33]:
10
In [34]:
emd_szs = []
In [35]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars), 0.04, 1, [16,8], [0.01,0.01])
In [36]:
m.lr_find()
epoch      trn_loss   val_loss                             
    0      85.509138  51.535043 

In [37]:
m.sched.plot()
In [38]:
lr = 0.1
In [39]:
m.fit(lr, 3)
epoch      trn_loss   val_loss                             
    0      4.97803    0.222929  
    1      1.536503   0.172182                             
    2      0.983754   0.170886                              

Out[39]:
[array([ 0.17089])]
In [40]:
m.fit(lr, 3, cycle_len=3, cycle_mult=3)
epoch      trn_loss   val_loss                              
    0      0.873391   0.152719  
    1      0.701459   0.106976                              
    2      0.507177   0.084831                              
    3      0.42357    0.238662                              
    4      0.440074   0.09528                               
    5      0.288478   0.078033                              
    6      0.225179   0.074947                              
    7      0.200226   0.07474                               
    8      0.188656   0.07726                               
    9      0.182667   0.070005                              
    10     0.180889   0.070441                              
    11     0.166218   0.070353                              
    12     0.153288   0.077664                              
    13     0.137262   0.081601                              
    14     0.115143   0.070598                              
    15     0.09773    0.089384                               
    16     0.088211   0.079999                               
    17     0.080388   0.069134                               
    18     0.07391    0.069565                               
    19     0.07321    0.069503                               
    20     0.068977   0.072013                               
    21     0.068454   0.065636                               
    22     0.066511   0.066473                               
    23     0.065992   0.067145                               
    24     0.066536   0.064824                               
    25     0.066065   0.066115                               
    26     0.066642   0.065295                               
    27     0.067432   0.066224                               
    28     0.066695   0.065526                               
    29     0.065052   0.064522                               
    30     0.064709   0.065304                               
    31     0.063949   0.063796                               
    32     0.065162   0.065178                               
    33     0.063537   0.064171                               
    34     0.066115   0.063689                               
    35     0.064096   0.063703                               
    36     0.063267   0.063604                               
    37     0.063445   0.063637                               
    38     0.06427    0.063566                               

Out[40]:
[array([ 0.06357])]
In [41]:
y
Out[41]:
array([ 1480000.,  1035000.,  1465000.,   850000.,  1600000.,   941000.,  1876000.,  1636000.,   300000.,
        1097000.,   700000.,  1350000.,   750000.,  1172500.,   441000.,  1310000.,  1200000.,  1176500.,
         955000.,   890000., ...,  1400000.,  1288000.,  1450000.,  1271000.,   540000.,  1263500.,  1250000.,
        1316000.,   951000.,  1323000.,   970000.,  1330000.,   650000.,   635000.,   582000.,  1245000.,
        1031000.,  1170000.,  2500000.,  1285000.])
In [42]:
x, y = m.predict_with_targs()
In [43]:
x
Out[43]:
array([[ 14.08092],
       [ 14.22979],
       [ 13.03424],
       [ 13.36845],
       [ 13.52134],
       [ 12.97773],
       [ 14.08708],
       [ 14.45369],
       [ 13.36494],
       [ 14.20508],
       [ 14.38188],
       [ 13.04502],
       [ 14.30678],
       [ 14.03919],
       [ 14.10494],
       [ 13.45855],
       [ 13.96307],
       [ 12.9465 ],
       [ 12.95961],
       [ 13.5396 ],
       ..., 
       [ 13.20871],
       [ 14.47329],
       [ 13.94839],
       [ 14.15746],
       [ 14.14345],
       [ 13.91946],
       [ 13.40983],
       [ 14.1548 ],
       [ 14.02417],
       [ 14.34953],
       [ 13.73072],
       [ 13.3561 ],
       [ 14.10077],
       [ 14.66769],
       [ 14.63131],
       [ 14.54948],
       [ 13.27073],
       [ 14.19503],
       [ 13.59988],
       [ 14.37383]], dtype=float32)
In [44]:
y
Out[44]:
array([[ 14.20755],
       [ 13.65299],
       [ 12.61154],
       [ 13.45884],
       [ 13.52783],
       [ 12.9968 ],
       [ 13.69898],
       [ 14.10069],
       [ 13.12236],
       [ 13.94214],
       [ 14.07015],
       [ 13.06049],
       [ 13.99366],
       [ 13.82793],
       [ 13.84507],
       [ 13.5008 ],
       [ 13.49393],
       [ 13.18063],
       [ 13.0444 ],
       [ 13.41503],
       ..., 
       [ 13.13231],
       [ 14.4307 ],
       [ 13.67048],
       [ 14.13032],
       [ 13.89432],
       [ 13.95527],
       [ 13.30801],
       [ 14.27924],
       [ 14.67713],
       [ 14.60397],
       [ 13.56705],
       [ 12.94801],
       [ 13.997  ],
       [ 14.37558],
       [ 14.64842],
       [ 14.74964],
       [ 13.29295],
       [ 14.36479],
       [ 13.45884],
       [ 14.47303]], dtype=float32)
In [45]:
plt.scatter(x,y)
Out[45]:
<matplotlib.collections.PathCollection at 0x1212ee320>
In [46]:
from sklearn.metrics import r2_score
In [47]:
r2_score(x,y)
Out[47]:
0.71178782442414246
In [48]:
np.max(np.exp(y))
Out[48]:
7999998.0

Random Forest

In [53]:
from sklearn.ensemble import RandomForestRegressor
In [58]:
val_idx
Out[58]:
array([ 3898,  6111,  4123,   718,  3132,  6206,  4374,  8185,  1817,   582,  7717,  3445,  1175,  8849,
          33,   568,  9455,  3007,  4352,  7770, ...,  9458,  4786,  1400,  4224,  8535,  7772,  9950,  7800,
        2377,  8995,  2088,  7382, 10112,  7615,  9406,  3078, 10159,  5736,  4191,  3717])
In [67]:
from sklearn.model_selection import train_test_split
In [69]:
len(y)
Out[69]:
2037
In [80]:
X_train, X_val, y_train, y_val = train_test_split(df,np.log(y))
In [81]:
m = RandomForestRegressor(n_estimators=40, max_features=0.99, min_samples_leaf=2,
                          n_jobs=-1, oob_score=True)
m.fit(X_train, y_train);
In [82]:
preds = m.predict(X_val)
m.score(X_train, y_train), m.score(X_val, y_val), m.oob_score_
Out[82]:
(0.96383356731045911, 0.85797784333100791, 0.85581215788657428)
In [83]:
from sklearn.metrics import mean_squared_error
In [85]:
np.sqrt(mean_squared_error(y_val,preds))
Out[85]:
0.19796133899122037

No comments :

Post a Comment