In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
The following code demonstrates some of the time series functionality of the pandas library. I learned this from the book Python for Data Analysis (2nd ed.): https://www.amazon.com/Python-Data-Analysis-Wrangling-IPython/dp/1491957662
1 Date and Time Data Types¶
In [3]:
from datetime import datetime
In [4]:
datetime.now()
Out[4]:
In [5]:
now = datetime.now()
now.year, now.month, now.day
Out[5]:
In [6]:
# Elapsed time between 1 July 1992 and the moment this cell runs.
delta = datetime.now() - datetime(year=1992, month=7, day=1)
In [7]:
delta
Out[7]:
In [8]:
delta.days, delta.seconds, delta.microseconds
Out[8]:
In [9]:
from datetime import timedelta
In [10]:
# The first positional argument of timedelta is `days`; spell it out for readers.
datetime.now() + timedelta(days=20)
Out[10]:
2 Converting between String and Datetime¶
In [11]:
# A fixed timestamp (1 July 2019, 08:30:00) shown next to its default string form.
t = datetime(year=2019, month=7, day=1, hour=8, minute=30, second=0)
(t, str(t))
Out[11]:
In [12]:
t.strftime("%Y-%m-%d")
Out[12]:
In [13]:
t.strftime("%Y-%m-%d:%H:%M:%S")
Out[13]:
Now, let's convert the date string '01/07/1992' to a datetime object
In [14]:
d = '01/07/1992'
datetime.strptime(d, '%d/%m/%Y')
Out[14]:
Another easy way is to use parser.parse from the dateutil library
In [15]:
from dateutil.parser import parse
In [16]:
parse('July 1 1992 8:30 PM')
Out[16]:
In [17]:
parse('01/07/1992', dayfirst=True)
Out[17]:
We can also use the pandas to_datetime function
In [18]:
test_dates =['July 1 1992 8:30 pm', '31/7/1992 8:30:00 pm']
In [19]:
# The strings in `test_dates` use different layouts (month-name-first and
# day/month/year). Parsing each one on its own with dateutil avoids depending
# on pandas inferring one format for the whole list; dayfirst=True makes the
# day/month order of the slash-separated date explicit.
pd.to_datetime([parse(s, dayfirst=True) for s in test_dates])
Out[19]:
If some dates are missing, pandas represents each missing value as 'NaT' (Not a Time)
In [20]:
test_dates =['July 1 1992 8:30 pm', '31/7/1992 8:30:00 pm', None]
In [21]:
# The strings in `test_dates` use different layouts, so each one is parsed on
# its own with dateutil rather than relying on a single inferred format.
# None entries are passed through untouched so pandas still reports them as NaT.
pd.to_datetime(
    [parse(s, dayfirst=True) if s is not None else None for s in test_dates]
)
Out[21]:
3 Timeseries basics¶
In [22]:
from datetime import datetime
In [23]:
# Daily timestamps from 1 Jan 2019 through 15 Jan 2019 (both ends included).
dates = pd.date_range(start="2019-01-01", end="2019-01-15")
In [24]:
dates
Out[24]:
In [25]:
ts = pd.Series(np.random.randn(len(dates)), index = dates)
In [26]:
ts
Out[26]:
In [27]:
ts + ts[::2]
Out[27]:
Indexing, selecting, subsetting¶
In [28]:
# First observation by position. `ts` is indexed by dates, so an integer key
# passed to [] is not a label; .iloc makes the positional lookup explicit
# instead of relying on the positional fallback of Series.__getitem__.
ts.iloc[0]
Out[28]:
In [29]:
ts['Jan 10 2019']
Out[29]:
In [30]:
# First four observations by position; .iloc states that the slice bounds are
# positions, not date labels.
ts.iloc[:4]
Out[30]:
In [31]:
ts['Jan 1 2019':'Jan 10 2019']
Out[31]:
In [32]:
ts['Jan 2019']
Out[32]:
In [33]:
ts.truncate(before='Jan 10 2019')
Out[33]:
Timeseries with duplicate indices¶
In [34]:
# A date index in which several days appear more than once.
d = pd.DatetimeIndex(
    [
        '1/1/2019', '1/1/2019',
        '1/2/2019', '1/2/2019',
        '1/3/2019',
        '1/4/2019', '1/4/2019',
    ]
)
In [35]:
ts = pd.Series(np.arange(len(d)), d)
In [36]:
ts
Out[36]:
In [37]:
ts['1/1/2019']
Out[37]:
In [38]:
ts['1/3/2019']
Out[38]:
In [39]:
ts.groupby(level=0).mean()
Out[39]:
4 Date Ranges, Frequencies and Shifting¶
In [40]:
pd.date_range('1 Jan 2019', '1/31/2019')
Out[40]:
In [41]:
pd.date_range(start = '1 Jan 2019', periods = 10)
Out[41]:
In [42]:
pd.date_range(end = '31 Jan 2019', periods = 10)
Out[42]:
Frequency and date offsets¶
In [43]:
pd.date_range('Jan 1 2019','Jan 3 2019', freq='4h')
Out[43]:
In [44]:
pd.date_range('Jan 1 2019','Jan 3 2019', freq='4h30min')
Out[44]:
Let's try to get the date of the second Friday of every month in 2019
In [45]:
pd.date_range('Jan 1 2019','Dec 31 2019', freq='WOM-2FRI')
Out[45]:
Shifting data¶
In [46]:
# Five random values indexed by the first day of Jan-May 2019 ('MS' = month start).
ts = pd.Series(
    data=np.random.randn(5),
    index=pd.date_range(start="Jan 1 2019", periods=5, freq="MS"),
)
ts
Out[46]:
In [47]:
ts.shift(2)
Out[47]:
In [48]:
ts.shift(-2)
Out[48]:
In [49]:
# Move every timestamp forward by two month-ends, leaving the values untouched.
# An offset object is used instead of the 'M' string alias so the cell does not
# depend on which spelling of the month-end alias the installed pandas accepts.
ts.shift(2, freq=pd.offsets.MonthEnd())
Out[49]:
In [50]:
# Two independent random series that share the same month-start index (Jan-May 2019).
month_starts = pd.date_range(start="Jan 1 2019", periods=5, freq="MS")
ts = pd.Series(data=np.random.randn(5), index=month_starts)
ts2 = pd.Series(data=np.random.randn(5), index=month_starts)
In [51]:
t = pd.DataFrame({"d":ts, "e":ts2})
In [52]:
t
Out[52]:
In [53]:
# Shift the dates of the original series by one '2MS' step (two month starts)
# and write it into column "d"; assignment aligns on t's index, so rows with
# no shifted counterpart become NaN.
# The shift is computed from `ts` (the series column "d" was built from), not
# from t["d"] itself, so re-running this cell does not shift the column again.
t["d"] = ts.shift(1, freq="2MS")
In [54]:
t
Out[54]:
Shifting dates with offsets¶
In [55]:
from pandas.tseries.offsets import Day, MonthEnd
In [56]:
now = datetime.now()
now
Out[56]:
In [57]:
now + 3* Day()
Out[57]:
In [58]:
offset = MonthEnd()
In [59]:
offset.rollforward(now)
Out[59]:
In [60]:
offset.rollback(now)
Out[60]:
5 Time Zone Handling¶
In [61]:
import pytz
In [62]:
pytz.common_timezones[-5:]
Out[62]:
In [63]:
tz = pytz.timezone('America/New_York')
tz
Out[63]:
In pandas, the default time zone is None (the timestamps are time zone-naive)
In [64]:
# Six consecutive days starting 1 Jan 2019, each paired with a random value.
dates = pd.date_range(start="Jan 1 2019", periods=6, freq="D")
ts = pd.Series(data=np.random.randn(dates.size), index=dates)
ts
Out[64]:
In [65]:
print(ts.index.tz)
In [66]:
pd.date_range('Jan 1 2019 8:30 pm', periods=6, freq='D', tz = 'UTC')
Out[66]:
In [67]:
ts_utc = ts.tz_localize('UTC')
ts_utc
Out[67]:
Once the time series is localized to a particular time zone, we can convert it to another time zone using tz_convert
In [68]:
ts_utc.tz_convert('America/New_York')
Out[68]:
6 Periods and Period Arithmetic¶
In [69]:
# Build a single annual Period for 2019 using the 'A-JAN' frequency string
# (annual, anchored on January), then display it.
# NOTE(review): the 'A' annual alias is reported as deprecated in recent pandas
# releases in favour of 'Y' (e.g. 'Y-JAN') — confirm against the pandas version in use.
p = pd.Period(2019, freq = 'A-JAN')
p
Out[69]:
In [70]:
p+10
Out[70]:
In [71]:
# Monthly periods covering calendar year 2019.
rng = pd.period_range(start="Jan 1 2019", end="Dec 31 2019", freq="M")
In [72]:
rng
Out[72]:
In [73]:
pd.Series(np.random.randn(len(rng)), index=rng)
Out[73]:
Period conversion¶
In [74]:
# Annual Period for 2019 with the 'A-Dec' frequency string (annual, anchored on
# December); the following cells convert it to monthly frequency with asfreq.
# NOTE(review): the 'A' annual alias is reported as deprecated in recent pandas
# releases in favour of 'Y' (e.g. 'Y-DEC') — confirm against the pandas version in use.
p = pd.Period('2019', freq = 'A-Dec')
p
Out[74]:
In [75]:
p.asfreq('M')
Out[75]:
In [76]:
p.asfreq('M', how='start')
Out[76]:
7 Resampling and Frequency Conversion¶
In [77]:
rng = pd.date_range('Jan 1 2019', periods = 100, freq = 'D')
In [78]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.head()
Out[78]:
In [79]:
# Monthly mean of the daily series, labelled at each month end. The rule is
# given as an offset object instead of the 'M' string alias so the cell does not
# depend on which spelling of the month-end alias the installed pandas accepts.
ts.resample(pd.offsets.MonthEnd()).mean()
Out[79]:
In [80]:
# Monthly mean reported on a PeriodIndex. The `kind=` argument of resample is
# avoided: the series is resampled on timestamps first and the resulting
# month-end labels are then converted to monthly periods explicitly.
ts.resample(pd.offsets.MonthEnd()).mean().to_period("M")
Out[80]:
Down sampling¶
In [81]:
# Twenty one-minute observations holding the values 0..19.
# 'min' is used for the minute frequency instead of the legacy 'T' alias;
# 'min' is accepted by both older and current pandas releases.
rng = pd.date_range("Jan 1 2019", periods=20, freq="min")
ts = pd.Series(np.arange(len(rng)), index=rng)
ts
Out[81]:
In [82]:
# Downsample to 5-minute bins, keeping the last value in each bin.
# "5min" replaces the legacy "5T" spelling of the same frequency.
ts.resample("5min").last()
Out[82]:
In [83]:
# Same 5-minute bins (closed on the left edge), but each bin is labelled with
# its right edge. "5min" replaces the legacy "5T" spelling of the frequency.
ts.resample("5min", closed="left", label="right").last()
Out[83]:
Open-High-Low-Close (OHLC resampling)¶
In [84]:
ts
Out[84]:
In [85]:
# Open / high / low / close of each 5-minute bin.
# "5min" replaces the legacy "5T" spelling of the same frequency.
ts.resample("5min").ohlc()
Out[85]:
Upsampling and Interpolation¶
In [86]:
# Two weekly rows (frequency 'W-WED') of random values in columns A-D.
frame = pd.DataFrame(
    data=np.random.randn(2, 4),
    index=pd.date_range(start="1 Jan 2019", periods=2, freq="W-WED"),
    columns=list("ABCD"),
)
frame
Out[86]:
In [87]:
df_daily = frame.resample('D').asfreq()
df_daily
Out[87]:
In [88]:
frame.resample('D').ffill()
Out[88]:
ffill with limits¶
In [89]:
frame.resample('D').ffill(limit = 2)
Out[89]:
In [90]:
frame.resample('W-THU').ffill()
Out[90]:
In [91]:
# Resample column A onto a Thursday-anchored weekly index and show the result.
# The result is kept in its own variable: its labels are Thursdays while
# `frame` is indexed on 'W-WED' dates, so assigning it back into frame["A"]
# would align on no common dates and replace the whole column with NaN.
a_weekly_thu = frame["A"].resample("W-THU").ffill()
a_weekly_thu
Out[91]:
8 Moving Window Functions¶
In [92]:
# Fifty daily rows in a single column "a": the sequence 0..49 plus random
# noise scaled by 10.
rng = pd.date_range(start="Jan 1 2019", periods=50, freq="D")
ts = pd.DataFrame(
    {"a": np.arange(rng.size) + 10 * np.random.randn(rng.size)},
    index=rng,
)
ts.head()
Out[92]:
In [93]:
ts.shape
Out[93]:
In [94]:
ts["a"].plot()
Out[94]:
In [95]:
ts.tail()
Out[95]:
In [96]:
ts["a"].rolling(window=10).mean()
Out[96]:
In [97]:
# Raw series overlaid with its rolling median over a 5-observation window.
# The legend entry names the statistic that is actually plotted (a median);
# the previous label called it an average.
ax = ts["a"].plot(label="Normal")
ts["a"].rolling(window=5).median().plot(ax=ax, label="Rolling 5-day median")
ax.set_title("Series a with 5-day rolling median")
ax.legend();
Out[97]:
In [98]:
# Raw series overlaid with its expanding median (each point uses all data up
# to that date). The legend entry now says so; the previous label described a
# 5-day rolling average, which is not what this cell plots.
ax = ts["a"].plot(label="Normal")
ts["a"].expanding().median().plot(ax=ax, label="Expanding median")
ax.set_title("Series a with expanding median")
ax.legend();
Out[98]:
In [99]:
# Raw series overlaid with its rolling median over a 7-calendar-day window
# ('7D' is a time-based window, not a fixed number of rows).
# The legend entry names the statistic that is actually plotted (a median).
ax = ts["a"].plot(label="Normal")
ts["a"].rolling(window='7D').median().plot(ax=ax, label="Rolling 7-day median")
ax.set_title("Series a with 7-day rolling median")
ax.legend();
Out[99]:
No comments :
Post a Comment