# 基于ARMA-EGARCH模型的比特币波动率建模和分析

Author: congcong009, Created: 2020-07-09 21:38:59, Updated: 2020-07-10 10:24:40

最近对比特币的波动率做了一些分析，中间啰啰嗦嗦有些心血来潮的分析，索性将一些理解和代码分享如下，本人能力有限，代码写的脏乱差，如有错误，还请社区大佬多多指（Du）正（Da）。
1、金融的时间序列简述


1-1、准备工作，导入库，封装函数

腐妹子的研究环境配置很齐全，这里就导入后续计算所需要的库，因为是断断续续写完，所以可能导入上有些冗余，请自行cleanup

'''
start: 2020-02-01 00:00:00
end: 2020-03-01 00:00:00
period: 1h
exchanges: [{"eid":"Huobi","currency":"BTC_USDT","stocks":0}]
'''
from __future__ import absolute_import, division, print_function
from fmz import * # 导入所有FMZ函数
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
from statsmodels.graphics.api import qqplot
from statsmodels.stats.diagnostic import acorr_ljungbox as lb_test
from scipy import stats
from arch import arch_model
from datetime import timedelta
from itertools import product
from math import sqrt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print(exchange.GetAccount())
{'Balance': 10000.0, 'FrozenBalance': 0.0, 'Stocks': 0.0, 'FrozenStocks': 0.0}

封装部分函数，后面会用，如有来源见注释

# 绘图函数
def tsplot(y, y_2, lags=None, title='', figsize=(18, 8)):  # 源码: https://tomaugspurger.github.io/modern-7-timeseries.html
fig = plt.figure(figsize=figsize)
layout = (2, 2)
ts_ax   = plt.subplot2grid(layout, (0, 0))
ts2_ax = plt.subplot2grid(layout, (0, 1))
acf_ax  = plt.subplot2grid(layout, (1, 0))
pacf_ax = plt.subplot2grid(layout, (1, 1))
y.plot(ax=ts_ax)
ts_ax.set_title(title)
y_2.plot(ax=ts2_ax)
smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
[ax.set_xlim(0) for ax in [acf_ax, pacf_ax]]
sns.despine()
plt.tight_layout()
return ts_ax, ts2_ax, acf_ax, pacf_ax

# 性能评价
# Performance metrics
def get_rmse(y, y_hat):
    """Root mean squared error between actuals `y` and predictions `y_hat`."""
    return np.sqrt(np.mean(np.square(y - y_hat)))

def get_mape(y, y_hat):
    """Mean absolute percentage error, in percent.

    NOTE: undefined when any element of `y` is zero.
    """
    pct_err = 100 * (y - y_hat) / y
    return np.mean(abs(pct_err))

def get_mase(y, y_hat):
    """Mean Absolute Scaled Error (Hyndman & Koehler, 2006).

    Scales the forecast MAE by the in-sample MAE of the one-step naive
    (random-walk) forecast.

    Bug fix: the original divided by the mean absolute *forecast* error over
    y[1:], which always yields a value near 1 and is not MASE; the correct
    denominator comes from the naive forecast y_t = y_{t-1}.
    """
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    abs_err = np.abs(y - y_hat)
    # Naive one-step forecast error |y_t - y_{t-1}|, averaged over t-1 terms.
    denom = np.mean(np.abs(np.diff(y)))
    return np.mean(abs_err / denom)

def mae(observation, forecast):
    """Print and return the mean absolute error of `forecast` vs `observation`."""
    score = mean_absolute_error(observation, forecast)
    print('Mean Absolute Error (MAE): {:.3g}'.format(score))
    return score

def mape(observation, forecast):
    """Print and return the mean absolute percentage error, in percent."""
    observation = np.array(observation)
    forecast = np.array(forecast)
    # Might encounter division by zero error when observation is zero
    ratios = np.abs((observation - forecast) / observation)
    error = np.mean(ratios) * 100
    print('Mean Absolute Percentage Error (MAPE): {:.3g}'.format(error))
    return error

def rmse(observation, forecast):
    """Print and return the root mean squared error."""
    error = mean_squared_error(observation, forecast) ** 0.5
    print('Root Mean Square Error (RMSE): {:.3g}'.format(error))
    return error

def evaluate(pd_dataframe, observation, forecast):
    """Score column `forecast` against column `observation` (MAE/MAPE/RMSE,
    printed by the helpers) from the forecast's first valid row, then plot both."""
    start = pd_dataframe[forecast].first_valid_index()
    actual = pd_dataframe[observation].loc[start:, ]
    predicted = pd_dataframe[forecast].loc[start:, ]
    mae(actual, predicted)
    mape(actual, predicted)
    rmse(actual, predicted)

    ax = pd_dataframe.loc[:, [observation, forecast]].plot(figsize=(18,5))
    ax.xaxis.label.set_visible(False)
    return
1-2、先粗略了解比特币的历史数据

$$r_{t}=\ln \frac{P_{t}}{P_{t-1}}$$

# Pull daily BTC/USDT bars and build log-price / log-return columns.
df = get_bars('huobi.btc_usdt', '1d', count=10000, start='2019-01-01')
btc_year = pd.DataFrame(df['close'],dtype=np.float)
btc_year.index.name = 'date'
btc_year.index = pd.to_datetime(btc_year.index)
btc_year['log_price'] = np.log(btc_year['close'])
# Log return: r_t = ln(P_t) - ln(P_{t-1}).
btc_year['log_return'] = btc_year['log_price'] - btc_year['log_price'].shift(1)
btc_year['log_return_100x'] = np.multiply(btc_year['log_return'], 100)  # scaled x100

# Descriptive statistics of the daily log-return series.
btc_year_test = pd.DataFrame(btc_year['log_return'].dropna(), dtype=np.float)
btc_year_test.index.name = 'date'
mean = btc_year_test.mean()
std = btc_year_test.std()
normal_result = pd.DataFrame(index=['Mean Value', 'Std Value', 'Skewness Value','Kurtosis Value'], columns=['model value'])
normal_result['model value']['Mean Value'] = ('%.4f'% mean[0])
normal_result['model value']['Std Value'] = ('%.4f'% std[0])
normal_result['model value']['Skewness Value'] = ('%.4f'% btc_year_test.skew())
normal_result['model value']['Kurtosis Value'] = ('%.4f'% btc_year_test.kurt())
normal_result
               model value
Mean Value          0.0016
Std Value           0.0341
Skewness Value     -0.6806
Kurtosis Value      7.2376
尖峰厚尾（Thick fat tails）特征是时间标度越短特征越显著，峰度会随着数据频率的增加而增加，在高频数据下特征会十分明显。


# QQ plot of daily log returns against the normal distribution.
# Bug fix: the original passed an undefined name `ax` to qqplot; create the
# axes on the figure explicitly.
fig = plt.figure(figsize=(4,4))
ax = fig.add_subplot(111)
fig = qqplot(btc_year_test['log_return'], line='q', ax=ax, fit=True)
可以看到，QQ图很漂亮，从结果上看，比特币的对数收益率序列不符合正态分布，并且具有明显 “尖峰厚尾”的特征。


# Full daily history (start date predates listing; the API returns what exists).
df = get_bars('huobi.btc_usdt', '1d', count=50000, start='2006-01-01')
btc_year = pd.DataFrame(df['close'],dtype=np.float)
btc_year.index.name = 'date'
btc_year.index = pd.to_datetime(btc_year.index)
btc_year['log_price'] = np.log(btc_year['close'])
btc_year['log_return'] = btc_year['log_price'] - btc_year['log_price'].shift(1)
btc_year['log_return_100x'] = np.multiply(btc_year['log_return'], 100)  # scaled x100

# Plot the log-return series to eyeball volatility clustering.
btc_year_test = pd.DataFrame(btc_year['log_return'].dropna(), dtype=np.float)
btc_year_test.index.name = 'date'
sns.mpl.rcParams['figure.figsize'] = (18, 4)  # volatility plot size
ax1 = btc_year_test['log_return'].plot()
ax1.xaxis.label.set_visible(False)
取最近3年比特币的日对数收益率并绘制出来，可以明显地看到波动集聚现象。2018年比特币经历了一波大牛市之后，大部分时间都处于平稳姿态。图中最右侧也能看到，2020年3月由于全球金融市场重挫，比特币的流动性也出现挤兑，收益率1天之内暴跌近40%，出现了剧烈的负向波动。


1-3、数据准备

$$RV_{t}=\sum_{i=1}^{60} r_{t,i}^{2}$$（其中 $r_{t,i}$ 为第 $t$ 小时内第 $i$ 个1分钟对数收益率）

# NOTE(review): count_num / start_date are set here but the get_bars calls
# below use hard-coded literals instead — confirm which is intended.
count_num = 100000
start_date = '2020-03-01'

df = get_bars('huobi.btc_usdt', '1m', count=50000, start='2020-02-13')  # minute bars
kline_1m = pd.DataFrame(df['close'], dtype=np.float)
kline_1m.index.name = 'date'
kline_1m['log_price'] = np.log(kline_1m['close'])
kline_1m['return'] = kline_1m['close'].pct_change().dropna()
kline_1m['log_return'] = kline_1m['log_price'] - kline_1m['log_price'].shift(1)

kline_1m['squared_log_return'] = np.power(kline_1m['log_return'], 2)

kline_1m['return_100x'] = np.multiply(kline_1m['return'], 100)
kline_1m['log_return_100x'] = np.multiply(kline_1m['log_return'], 100)  # scaled x100

df = get_bars('huobi.btc_usdt', '1h', count=860, start='2020-02-13')  # hourly bars
kline_all = pd.DataFrame(df['close'], dtype=np.float)
kline_all.index.name = 'date'
kline_all['log_price'] = np.log(kline_all['close'])
# NOTE(review): this takes pct_change of the *log price*, unlike the minute
# block above which uses the close price — presumably a slip; confirm.
kline_all['return'] = kline_all['log_price'].pct_change().dropna()
kline_all['log_return'] = kline_all['log_price'] - kline_all['log_price'].shift(1)  # log return
kline_all['squared_log_return'] = np.power(kline_all['log_return'], 2)  # squared log return

kline_all['return_100x'] = np.multiply(kline_all['return'], 100)
kline_all['log_return_100x'] = np.multiply(kline_all['log_return'], 100)  # scaled x100

# Hourly realized variance = sum of squared 1-minute log returns within the hour.
kline_all['realized_variance_1_hour'] = kline_1m.loc[:, 'squared_log_return'].resample('h', closed='left', label='left').sum().copy()
kline_all['realized_volatility_1_hour'] = np.sqrt(kline_all['realized_variance_1_hour'])  # volatility = sqrt(variance)

kline_all = kline_all[4:-29]  # trim warm-up rows and the incomplete tail
kline_all.head(3)
                             close  log_price    return  log_return  \
date
2020-02-13 12:00:00+08:00  10397.1   9.249282  0.000152    0.001405
2020-02-13 13:00:00+08:00  10438.0   9.253208  0.000424    0.003926
2020-02-13 14:00:00+08:00  10449.0   9.254262  0.000114    0.001053

squared_log_return  return_100x  log_return_100x  \
date
2020-02-13 12:00:00+08:00            0.000002     0.015195         0.140522
2020-02-13 13:00:00+08:00            0.000015     0.042447         0.392607
2020-02-13 14:00:00+08:00            0.000001     0.011383         0.105329

realized_variance_1_hour  \
date
2020-02-13 12:00:00+08:00                  0.000016
2020-02-13 13:00:00+08:00                  0.000007
2020-02-13 14:00:00+08:00                  0.000025

realized_volatility_1_hour
date
2020-02-13 12:00:00+08:00                    0.003962
2020-02-13 13:00:00+08:00                    0.002581
2020-02-13 14:00:00+08:00                    0.004995  
顺便准备下样本外数据，方式雷同

# Out-of-sample data, with hourly realized volatility, built the same way.
df = get_bars('huobi.btc_usdt', '1m', count=50000, start='2020-02-13')  # minute bars
kline_1m = pd.DataFrame(df['close'], dtype=np.float)
kline_1m.index.name = 'date'
kline_1m['log_price'] = np.log(kline_1m['close'])
kline_1m['log_return'] = kline_1m['log_price'] - kline_1m['log_price'].shift(1)
kline_1m['log_return_100x'] = np.multiply(kline_1m['log_return'], 100)  # scaled x100
kline_1m['squared_log_return'] = np.power(kline_1m['log_return_100x'], 2)
kline_1m#.tail()
df = get_bars('huobi.btc_usdt', '1h', count=860, start='2020-02-13')  # hourly bars
kline_test = pd.DataFrame(df['close'], dtype=np.float)
kline_test.index.name = 'date'
kline_test['log_price'] = np.log(kline_test['close'])
kline_test['log_return'] = kline_test['log_price'] - kline_test['log_price'].shift(1)  # log return
kline_test['log_return_100x'] = np.multiply(kline_test['log_return'], 100)  # scaled x100
kline_test['squared_log_return'] = np.power(kline_test['log_return_100x'], 2)  # squared log return
# Hourly realized variance = sum of squared 1-minute log returns within the hour.
kline_test['realized_variance_1_hour'] = kline_1m.loc[:, 'squared_log_return'].resample('h', closed='left', label='left').sum()
kline_test['realized_volatility_1_hour'] = np.sqrt(kline_test['realized_variance_1_hour'])  # volatility = sqrt(variance)
kline_test = kline_test[4:-2]  # trim warm-up rows and the incomplete tail
为了理解样本的数据基础情况，做一个简单的描述性分析，如下：

# NOTE(review): `kline_train` is only assigned later in the article
# (kline_train = kline_all); this cell assumes it already exists.
line_test = pd.DataFrame(kline_train['log_return'].dropna(), dtype=np.float)
line_test.index.name = 'date'

mean = line_test.mean()  # mean and standard deviation
std = line_test.std()

line_test.sort_values(by = 'log_return', inplace = True)  # sort ascending
s_r = line_test.reset_index(drop = False)  # rebuild the index after sorting
s_r['p'] = (s_r.index - 0.5) / len(s_r)  # percentile p(i)
s_r['q'] = (s_r['log_return'] - mean) / std  # standardized value q

st = line_test.describe()
x1 ,y1 = 0.25, st['log_return']['25%']
x2 ,y2 = 0.75, st['log_return']['75%']

fig = plt.figure(figsize = (18,8))
layout = (2, 2)
ax1 = plt.subplot2grid(layout, (0, 0), colspan=2)  # data distribution
ax2 = plt.subplot2grid(layout, (1, 0))  # histogram + KDE
ax3 = plt.subplot2grid(layout, (1, 1))  # QQ plot; red line joins the 1st and 3rd quartiles

ax1.scatter(line_test.index, line_test.values)
line_test.hist(bins=30,alpha = 0.5,ax = ax2)
line_test.plot(kind = 'kde', secondary_y=True,ax = ax2)
ax3.plot(s_r['p'],s_r['log_return'],'k.',alpha = 0.1)
ax3.plot([x1,x2],[y1,y2],'-r')
sns.despine()
plt.tight_layout()
从结果上看，对数收益率的时序图形中，具有明显的波动聚集和杠杆效应。


# Descriptive statistics plus KS and Jarque-Bera normality tests on hourly log returns.
line_test = pd.DataFrame(kline_all['log_return'].dropna(), dtype=np.float)
line_test.index.name = 'date'
mean = line_test.mean()
std = line_test.std()
normal_result = pd.DataFrame(index=['Mean Value', 'Std Value', 'Skewness Value','Kurtosis Value',
'Ks Test Value','Ks Test P-value',
'Jarque Bera Test','Jarque Bera Test P-value'],
columns=['model value'])
normal_result['model value']['Mean Value'] = ('%.4f'% mean[0])
normal_result['model value']['Std Value'] = ('%.4f'% std[0])
normal_result['model value']['Skewness Value'] = ('%.4f'% line_test.skew())
normal_result['model value']['Kurtosis Value'] = ('%.4f'% line_test.kurt())
# NOTE(review): kstest/jarque_bera are fed a one-column DataFrame (and a
# Series for the KS location/scale) — relies on implicit coercion; confirm.
normal_result['model value']['Ks Test Value'] = stats.kstest(line_test, 'norm', (mean, std))[0]
normal_result['model value']['Ks Test P-value'] = stats.kstest(line_test, 'norm', (mean, std))[1]
normal_result['model value']['Jarque Bera Test'] = stats.jarque_bera(line_test)[0]
normal_result['model value']['Jarque Bera Test P-value'] = stats.jarque_bera(line_test)[1]
normal_result
                         model value
Mean Value                   -0.0008
Std Value                     0.0167
Skewness Value               -2.4206
Kurtosis Value               57.1471
Ks Test Value                      1
Ks Test P-value                    0
Jarque Bera Test              111956
Jarque Bera Test P-value           0
分别采用Kolmogorov - Smirnov和Jarque - Bera检验统计量，原假设分别是具备显著性差异和正态分布特性，如果P值小于0.05%置信水平临界值，则拒绝原假设。


1-4、已实现波动率和观测波动率的比较


# Compare squared hourly log returns with the hourly realized variance.
fig, ax = plt.subplots(figsize=(18, 6))
start = '2020-03-08 00:00:00+08:00'
end = '2020-03-20 00:00:00+08:00'

np.abs(kline_all['squared_log_return']).loc[start:end].plot(ax=ax,label='squared log return')
kline_all['realized_variance_1_hour'].loc[start:end].plot(ax=ax,label='realized variance')
plt.legend(loc='best')
可以看到已实现方差幅度更大的时候，收益率的幅度也较高，已实现收益率更平滑，两者都容易观察到明显的聚集效应。


2、时间序列的平稳性

$$AIC = -2 \ln L + 2k, \qquad BIC = -2 \ln L + k \ln n$$

# ADF unit-root test on the log returns, once with AIC and once with BIC lag
# selection.  (The original cell was truncated during extraction — only the
# first line and two stray fragments survived; reconstructed here from the
# printed output table that follows.)
stable_test = kline_all['log_return']
output = pd.DataFrame(index=["ADF Statistic Test Value", "Lags", "Number of Observations",
                             "Critical Value(1%)", "Critical Value(5%)", "Critical Value(10%)"],
                      columns=['AIC','BIC'])
for criterion in ['AIC', 'BIC']:
    adf_stat, pvalue, lags, nobs, crit, _ = sm.tsa.stattools.adfuller(
        stable_test.dropna(), autolag=criterion)
    output[criterion] = [adf_stat, lags, nobs, crit['1%'], crit['5%'], crit['10%']]
output
                                  AIC      BIC
ADF Statistic Test Value     -5.66129 -31.7186
Lags                               19        0
Number of Observations            807      826
Critical Value(1%)           -3.43848 -3.43829
Critical Value(5%)           -2.86513 -2.86505
Critical Value(10%)          -2.56868 -2.56864
原假设为序列不存在单位根，即非平稳性，备择假设序列是平稳的。检验P值远小于0.05%的置信水平临界值，拒绝原假设，所以对数收益率为平稳序列，可以使用统计时序模型进行建模。

3、模型识别和定阶


# Plot the log-return series (twice) with its ACF/PACF up to 100 lags.
tsplot(kline_all['log_return'], kline_all['log_return'], title='Log Return', lags=100)
可以看到截尾的效果很漂亮，有那么一瞬间这图给了我一个启发，那就是市场真是无效的么？为了验证，接着对收益率序列做自相关分析，并且还要确定模型的滞后阶数。

$$\rho_l = \frac{Cov(r_t, r_{t-l})}{\sqrt{Var(r_t)Var(r_{t-l})}} = \frac{Cov(r_t, r_{t-l})}{Var(r_t)}$$

$$Q(m) = T(T+2)\sum_{l=1}^{m} \frac{\hat{\rho}_l^2}{T-l}$$


acf,q,p = sm.tsa.acf(kline_all['log_return'], nlags=15,unbiased=True,qstat = True, fft=False)  # first 15 autocorrelations with Ljung-Box Q statistics
output = pd.DataFrame(np.c_[range(1,16), acf[1:], q, p], columns=['lag', 'ACF', 'Q', 'P-value'])
output = output.set_index('lag')
output
           ACF          Q   P-value
lag
1.0  -0.099665   8.244485  0.004088
2.0  -0.015185   8.436110  0.014727
3.0   0.005439   8.460726  0.037390
4.0  -0.007983   8.513819  0.074469
5.0   0.015917   8.725129  0.120543
6.0  -0.102288  17.462134  0.007727
7.0   0.032860  18.364914  0.010428
8.0  -0.090556  25.229481  0.001421
9.0  -0.035358  26.277304  0.001840
10.0  0.045424  28.008737  0.001799
11.0 -0.014142  28.176779  0.003041
12.0  0.036273  29.283577  0.003575
13.0  0.074563  33.966146  0.001218
14.0  0.069355  38.022428  0.000516
15.0  0.129860  52.260608  0.000005
根据检验统计量Q和P值可以看到，自相关函数ACF在0阶后逐渐为0，Q检验统计量的P值都足够小，可以拒绝原假设，故序列存在自相关性。
4、ARMA建模
AR和MA模型挺简单的，简单说下，Markdown写公式太累了，有兴趣的大家自己查吧。AR（Autoregression，自回归）模型主要是对时间序列进行建模，如果序列已经通过了ACF检验，也就是间隔为1的自相关系数显著，即在$t-1$时刻的数据$r_(t-1)$，对预测t时刻$r_t$时可能是有用的。
MA（Moving Average，滑动平均）模型是使用过去q个时期的随机干扰或预测误差，来线性表达当前的预测值。

4-1、定阶


def select_best_params():
    """Grid-search SARIMAX(p, d, q) over p,q in 0..3 with d=1 on the log price.

    Records AIC/BIC for every successful fit, prints the best order under
    AIC, BIC and HQIC, and returns the order chosen by at least two of the
    three criteria (None if no order wins twice).

    Fixes vs the original:
    - `import warnings` was missing (the file never imports it);
    - the aic/bic/hqic appends ran *outside* the try block, so a failed fit
      raised NameError (first iteration) or re-appended the previous model's
      stale scores; they now run only when the fit succeeds.
    """
    import warnings

    ps = range(0, 4)
    ds = range(1, 2)
    qs = range(0, 4)
    parameters_list = list(product(ps, ds, qs))

    p_min, p_max = 0, 3
    q_min, q_max = 0, 3

    results_aic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max+1)],
                               columns=['MA{}'.format(i) for i in range(q_min, q_max+1)])
    results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max+1)],
                               columns=['MA{}'.format(i) for i in range(q_min, q_max+1)])

    best_params = []
    aic_results = []
    bic_results = []
    hqic_results = []
    warnings.filterwarnings('ignore')
    for param in parameters_list:
        try:
            model = sm.tsa.SARIMAX(kline_all['log_price'], order=(param[0], param[1], param[2])).fit(disp=-1)
        except ValueError:
            continue  # this (p,d,q) combination failed to fit; skip it
        results_aic.loc['AR{}'.format(param[0]), 'MA{}'.format(param[2])] = model.aic
        results_bic.loc['AR{}'.format(param[0]), 'MA{}'.format(param[2])] = model.bic
        aic_results.append([param, model.aic])
        bic_results.append([param, model.bic])
        hqic_results.append([param, model.hqic])

    results_aic = results_aic[results_aic.columns].astype(float)
    results_bic = results_bic[results_bic.columns].astype(float)

    # Panels for the AIC and BIC scores (titles only, as in the original).
    fig = plt.figure(figsize=(18, 9))
    layout = (2, 2)
    aic_ax = plt.subplot2grid(layout, (0, 0))
    bic_ax = plt.subplot2grid(layout, (0, 1))
    aic_ax.set_title('AIC')
    bic_ax.set_title('BIC')

    aic_df = pd.DataFrame(aic_results)
    aic_df.columns = ['params', 'aic']
    best_params.append(aic_df.params[aic_df.aic.idxmin()])
    print('AIC best param: {}'.format(aic_df.params[aic_df.aic.idxmin()]))

    bic_df = pd.DataFrame(bic_results)
    bic_df.columns = ['params', 'bic']
    best_params.append(bic_df.params[bic_df.bic.idxmin()])
    print('BIC best param: {}'.format(bic_df.params[bic_df.bic.idxmin()]))

    hqic_df = pd.DataFrame(hqic_results)
    hqic_df.columns = ['params', 'hqic']
    best_params.append(hqic_df.params[hqic_df.hqic.idxmin()])
    print('HQIC best param: {}'.format(hqic_df.params[hqic_df.hqic.idxmin()]))

    # Return the order that wins under at least two of the three criteria.
    for best_param in best_params:
        if best_params.count(best_param)>=2:
            print('Best Param Selected: {}'.format(best_param))
            return best_param

best_param = select_best_params()
AIC best param: (0, 1, 1)
BIC best param: (0, 1, 1)
HQIC best param: (0, 1, 1)
Best Param Selected: (0, 1, 1)

<Figure size 1296x648 with 4 Axes>
可以明显看到，对对数价格的最优1阶参数组合为(0,1,1)，简单粗暴。对log_return（对数收益率）执行同样运算，AIC最优为(4,3)，BIC最优为(0,1)。所以对log_return（对数收益率）取最优的参数组合为(0,1)。
4-2、ARMA建模拟合


# Fit MA(1) with constant (ARIMA order (0,0,1)) on the log returns.
params = (0, 0, 1)
training_model = smt.SARIMAX(endog=kline_all['log_return'], trend='c', order=params, seasonal_order=(0, 0, 0, 0))
model_results = training_model.fit(disp=False)
model_results.summary()
<class 'statsmodels.iolib.summary.Summary'>
"""
Statespace Model Results
==============================================================================
Dep. Variable:             log_return   No. Observations:                  827
Model:               SARIMAX(0, 0, 1)   Log Likelihood                2215.932
Date:                Fri, 10 Jul 2020   AIC                          -4425.864
Time:                        02:08:38   BIC                          -4411.710
Sample:                    02-13-2020   HQIC                         -4420.435
- 03-18-2020
Covariance Type:                  opg
==============================================================================
coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept     -0.0008      0.001     -1.399      0.162      -0.002       0.000
ma.L1         -0.1021      0.010    -10.163      0.000      -0.122      -0.082
sigma2         0.0003   2.82e-06     97.828      0.000       0.000       0.000
===================================================================================
Ljung-Box (Q):                      104.77   Jarque-Bera (JB):            106398.74
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):              24.60   Skew:                            -2.68
Prob(H) (two-sided):                  0.00   Kurtosis:                        58.31
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
"""
model_results.plot_diagnostics(figsize=(18, 10));
直方图中概率密度KDE和正态分布N(0,1)比较远，表明残差不是正态分布。QQ分位图中，从标准正态分布抽取的样本中，残差没有完全服从线性趋势，所以再次确认残差不是正态分布，更接近白噪声。

4-3、模型检验


# Residual diagnostics: heteroskedasticity, normality, serial correlation,
# Durbin-Watson, and AR/MA root checks.
het_method='breakvar'
norm_method='jarquebera'
sercor_method='ljungbox'

(het_stat, het_p) = model_results.test_heteroskedasticity(het_method)[0]
norm_stat, norm_p, skew, kurtosis = model_results.test_normality(norm_method)[0]

sercor_stat, sercor_p = model_results.test_serial_correlation(method=sercor_method)[0]
sercor_stat = sercor_stat[-1] # last value, i.e. the largest tested lag
sercor_p = sercor_p[-1]

dw = sm.stats.stattools.durbin_watson(model_results.filter_results.standardized_forecasts_error[0, model_results.loglikelihood_burn:])

# Stationarity/invertibility: all AR and MA roots should lie outside the unit circle.
arroots_outside_unit_circle = np.all(np.abs(model_results.arroots) > 1)
maroots_outside_unit_circle = np.all(np.abs(model_results.maroots) > 1)

print('Test heteroskedasticity of residuals ({}): stat={:.3f}, p={:.3f}'.format(het_method, het_stat, het_p));
print('\nTest normality of residuals ({}): stat={:.3f}, p={:.3f}'.format(norm_method, norm_stat, norm_p));
print('\nTest serial correlation of residuals ({}): stat={:.3f}, p={:.3f}'.format(sercor_method, sercor_stat, sercor_p));
print('\nDurbin-Watson test on residuals: d={:.2f}\n\t(NB: 2 means no serial correlation, 0=pos, 4=neg)'.format(dw))
print('\nTest for all AR roots outside unit circle (>1): {}'.format(arroots_outside_unit_circle))
print('\nTest for all MA roots outside unit circle (>1): {}'.format(maroots_outside_unit_circle))

root_test=pd.DataFrame(index=['Durbin-Watson test on residuals','Test for all AR roots outside unit circle (>1)','Test for all MA roots outside unit circle (>1)'],columns=['c'])
root_test['c']['Durbin-Watson test on residuals']=dw
root_test['c']['Test for all AR roots outside unit circle (>1)']=arroots_outside_unit_circle
root_test['c']['Test for all MA roots outside unit circle (>1)']=maroots_outside_unit_circle
root_test
Test heteroskedasticity of residuals (breakvar): stat=24.598, p=0.000

Test normality of residuals (jarquebera): stat=106398.739, p=0.000

Test serial correlation of residuals (ljungbox): stat=104.767, p=0.000

Durbin-Watson test on residuals: d=2.00
(NB: 2 means no serial correlation, 0=pos, 4=neg)

Test for all AR roots outside unit circle (>1): True

Test for all MA roots outside unit circle (>1): True

c
Durbin-Watson test on residuals                 1.99996
Test for all AR roots outside unit circle (>1)     True
Test for all MA roots outside unit circle (>1)     True
# First-difference the log price, keep the training frame, and compute the
# adjusted R^2 of the ARMA fit.  (The original cell displayed `adjR_test`
# without ever constructing it — presumably lost in extraction; reconstructed
# here from the displayed output table.)
kline_all['log_price_dif1'] = kline_all['log_price'].diff(1)
kline_all = kline_all[1:]
kline_train = kline_all

training_label = 'log_return'
training_ts = pd.DataFrame(kline_train[training_label], dtype=np.float)

delta = model_results.fittedvalues - training_ts[training_label]
ss_res = np.sum(np.square(delta))
ss_tot = np.sum(np.square(training_ts[training_label] - training_ts[training_label].mean()))
r2 = 1 - ss_res / ss_tot
n_obs = len(training_ts)
k_params = len(model_results.params)
adjR2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - k_params - 1)
adjR_test = pd.DataFrame(index=['adjR2'], columns=['Value'])
adjR_test['Value']['adjR2'] = adjR2
adjR_test
             Value
adjR2  0.000106429
Durbin-Watson测试统计量如果等于2，确认序列没有相关性，其统计值分布在(0,4)之间，接近0意味着正相关性较高，反之接近4意味着负相关性较高。这里约等于2。其他检验的P值足够小，单位特征根在单位圆之外，修正$adjR²$的值越大越好，整体看来，计量结果并不理想。

model_results.params
intercept   -0.000817
ma.L1       -0.102102
sigma2       0.000275
dtype: float64
综上所述，该定阶参数可以基本上满足对时间序列的建模，支持后续波动率建模的需求，但拟合效果一般。模型表达式如下：
$$r_t = -0.000817 - 0.102102\,\varepsilon_{t-1} + \varepsilon_t, \qquad \hat{\sigma}^2 = 0.000275$$
4-4、模型预测


# In-sample one-step-ahead (static) prediction with 95% confidence band.
start_date = '2020-02-28 12:00:00+08:00'
end_date = start_date

pred_dy = model_results.get_prediction(start=start_date, dynamic=False)
pred_dy_ci = pred_dy.conf_int()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
ax.plot(kline_all['log_return'].loc[start_date:], label='Log Return', linestyle='-')
ax.plot(pred_dy.predicted_mean.loc[start_date:], label='Forecast Log Return', linestyle='--')
ax.fill_between(pred_dy_ci.index,pred_dy_ci.iloc[:, 0],pred_dy_ci.iloc[:, 1], color='g', alpha=.25)

plt.ylabel("BTC Log Return")
plt.legend(loc='best')
plt.tight_layout()
sns.despine()
可以看到，静态模式对样本内的拟合效果很出色，样本数据几乎可以被95%的置信区间覆盖，动态模式有点放飞。


# Same plot in dynamic mode: forecasts are fed back in as inputs.
start_date = '2020-02-28 12:00:00+08:00'
end_date = start_date

pred_dy = model_results.get_prediction(start=start_date, dynamic=True)
pred_dy_ci = pred_dy.conf_int()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
ax.plot(kline_all['log_return'].loc[start_date:], label='Log Return', linestyle='-')
ax.plot(pred_dy.predicted_mean.loc[start_date:], label='Forecast Log Return', linestyle='--')
ax.fill_between(pred_dy_ci.index,pred_dy_ci.iloc[:, 0],pred_dy_ci.iloc[:, 1], color='g', alpha=.25)

plt.ylabel("BTC Log Return")
plt.legend(loc='best')
plt.tight_layout()
sns.despine()
可以看到，两种模式对样本内的拟合效果都很出色，均值几乎都可以被95%的置信区间覆盖，但静态模式明显更贴合。接下来看对样本外的50步，即向前50小时的预测效果：

# Out-of-sample: in-sample static prediction plus a 50-step-ahead forecast.
start_date = '2020-03-01 12:00:00+08:00'
end_date = '2020-03-20 23:00:00+08:00'
model = False  # dynamic flag for get_prediction/get_forecast

predict_step = 50
predicts_ARIMA_normal = model_results.get_prediction(start=start_date, dynamic=model, full_reports=True)
ci_normal = predicts_ARIMA_normal.conf_int().loc[start_date:]

predicts_ARIMA_normal_out = model_results.get_forecast(steps=predict_step, dynamic=model)
ci_normal_out = predicts_ARIMA_normal_out.conf_int().loc[start_date:end_date]

fig, ax = plt.subplots(figsize=(18,8))
kline_test.loc[start_date:end_date, 'log_return'].plot(ax=ax, label='Benchmark Log Return')

predicts_ARIMA_normal.predicted_mean.plot(ax=ax, style='g', label='In Sample Forecast')
ax.fill_between(ci_normal.index, ci_normal.iloc[:,0], ci_normal.iloc[:,1], color='g', alpha=0.1)

predicts_ARIMA_normal_out.predicted_mean.loc[:end_date].plot(ax=ax, style='r--', label='Out of Sample Forecast')
ax.fill_between(ci_normal_out.index, ci_normal_out.iloc[:,0], ci_normal_out.iloc[:,1], color='r', alpha=0.1)

plt.tight_layout()
plt.legend(loc='best')
sns.despine()
由于样本内数据的拟合是滚动向前一步预测，所以在样本内信息量充足时，静态模式反而容易出现过度拟合，而动态模式缺少可靠的因变量，迭代多步之后效果越来越差。在对样本外数据进行预测时，模型等同于样本内的动态模式，所以长期预测的误差项准确度必然会较低。


# ARIMA(0,1,1) on the log *price*: static vs dynamic in-sample prediction.
params = (0, 1, 1)
mod = smt.SARIMAX(endog=kline_all['log_price'], trend='c', order=params, seasonal_order=(0, 0, 0, 0))
res = mod.fit(disp=False)

start_date = '2020-03-01 12:00:00+08:00'
end_date = '2020-03-15 23:00:00+08:00'

predicts_ARIMA_normal = res.get_prediction(start=start_date, dynamic=False, full_results=False)
predicts_ARIMA_dynamic = res.get_prediction(start=start_date, dynamic=True, full_results=False)

fig, ax = plt.subplots(figsize=(18,6))
kline_test.loc[start_date:end_date, 'log_price'].plot(ax=ax, label='Log Price')

predicts_ARIMA_normal.predicted_mean.loc[start_date:end_date].plot(ax=ax, style='r--', label='Normal Prediction')
ci_normal = predicts_ARIMA_normal.conf_int().loc[start_date:end_date]
ax.fill_between(ci_normal.index, ci_normal.iloc[:,0], ci_normal.iloc[:,1], color='r', alpha=0.1)

predicts_ARIMA_dynamic.predicted_mean.loc[start_date:end_date].plot(ax=ax, style='g', label='Dynamic Prediction')
ci_dynamic = predicts_ARIMA_dynamic.conf_int().loc[start_date:end_date]
ax.fill_between(ci_dynamic.index, ci_dynamic.iloc[:,0], ci_dynamic.iloc[:,1], color='g', alpha=0.1)

plt.tight_layout()
plt.legend(loc='best')
sns.despine()
图中很容易看出静态模式下的拟合优势，和动态模式在长期预测上的极差表现。这红色的虚线，粉红的区间····你不能说这模型的预测有误吧，毕竟完全覆盖了均线的走势，但····有意义吗？


# Compare static vs dynamic in-sample predictions by RMSE and MAPE.
start = '2020-02-14 00:00:00+08:00'
predicts_ARIMA_normal = model_results.get_prediction(dynamic=False)
predicts_ARIMA_dynamic = model_results.get_prediction(dynamic=True)
training_label = 'log_return'

# NOTE(review): the slice lengths ([1:]/[:826], [:50]) hard-code the sample
# size; confirm they still line up if the data window changes.
compare_ARCH_X = pd.DataFrame()
compare_ARCH_X['Model'] = ['NORMAL','DYNAMIC']
compare_ARCH_X['RMSE'] = [rmse(predicts_ARIMA_normal.predicted_mean[1:], kline_test[training_label][:826]),
rmse(predicts_ARIMA_dynamic.predicted_mean[1:], kline_test[training_label][:826])]
compare_ARCH_X['MAPE'] = [mape(predicts_ARIMA_normal.predicted_mean[:50], kline_test[training_label][:50]),
mape(predicts_ARIMA_dynamic.predicted_mean[:50], kline_test[training_label][:50])]
compare_ARCH_X
Root Mean Square Error (RMSE): 0.0184
Root Mean Square Error (RMSE): 0.0167
Mean Absolute Percentage Error (MAPE): 2.25e+03
Mean Absolute Percentage Error (MAPE): 395

Model      RMSE         MAPE
0   NORMAL  0.018382  2249.756029
1  DYNAMIC  0.016694   395.233251
可以看到，静态模型在预测值与真实值的误差吻合度上，略优于动态模式，较好地拟合了比特币的对数收益率，基本符合预期。动态预测缺少更准确的变量信息，误差也被迭代放大，所以预测效果较差。MAPE都大于100%，所以两个模型的实际拟合品质都不理想。

# Score the 50-step out-of-sample forecasts (static vs dynamic).
predict_step = 50
predicts_ARIMA_normal_out = model_results.get_forecast(steps=predict_step, dynamic=False)
predicts_ARIMA_dynamic_out = model_results.get_forecast(steps=predict_step, dynamic=True)
testing_ts = kline_test
training_label = 'log_return'

compare_ARCH_X = pd.DataFrame()
compare_ARCH_X['Model'] = ['NORMAL','DYNAMIC']
compare_ARCH_X['RMSE'] = [get_rmse(predicts_ARIMA_normal_out.predicted_mean, testing_ts[training_label]),
get_rmse(predicts_ARIMA_dynamic_out.predicted_mean, testing_ts[training_label])]
compare_ARCH_X['MAPE'] = [get_mape(predicts_ARIMA_normal_out.predicted_mean, testing_ts[training_label]),
get_mape(predicts_ARIMA_dynamic_out.predicted_mean, testing_ts[training_label])]
compare_ARCH_X
     Model     RMSE         MAPE
0   NORMAL  0.01664  1399.597537
1  DYNAMIC  0.01664  1399.597537
由于样本外的下一步预测依赖上一步的结果，所以只有动态模式有效，而动态模式本身的长期误差缺陷，导致整体模型的预测能力不足，所以顶多也就预测下一步。


5、ARCH效应
ARCH模型效应，也就是条件异方差序列的序列相关性。利用混成检验Ljung-Box检验残差平方序列的相关性，来判断是否具有ARCH效应，如果ARCH效应检验通过，即序列具备异方差特性，就可以进行下一步的GARCH建模，对均值方程和波动率方程进行联合估计。否则需要对模型进行优化重调，例如再做差分处理或取倒数序列。


# Rebuild minute/hourly frames from 2020-03-01 for the ARCH-effect section.
count_num = 100000
start_date = '2020-03-01'

df = get_bars('huobi.btc_usdt', '1m', count=count_num, start=start_date)  # minute bars
kline_1m = pd.DataFrame(df['close'], dtype=np.float)
kline_1m.index.name = 'date'
kline_1m['log_price'] = np.log(kline_1m['close'])
kline_1m['return'] = kline_1m['close'].pct_change().dropna()
kline_1m['log_return'] = kline_1m['log_price'] - kline_1m['log_price'].shift(1)

kline_1m['squared_log_return'] = np.power(kline_1m['log_return'], 2)

kline_1m['return_100x'] = np.multiply(kline_1m['return'], 100)
kline_1m['log_return_100x'] = np.multiply(kline_1m['log_return'], 100)  # scaled x100

df = get_bars('huobi.btc_usdt', '1h', count=count_num, start=start_date)  # hourly bars
kline_test = pd.DataFrame(df['close'], dtype=np.float)
kline_test.index.name = 'date'
kline_test['log_price'] = np.log(kline_test['close'])
# NOTE(review): pct_change of the *log price* again, unlike the minute block
# above which uses the close price — presumably a slip; confirm.
kline_test['return'] = kline_test['log_price'].pct_change().dropna()
kline_test['log_return'] = kline_test['log_price'] - kline_test['log_price'].shift(1)  # log return
kline_test['squared_log_return'] = np.power(kline_test['log_return'], 2)  # squared log return

kline_test['return_100x'] = np.multiply(kline_test['return'], 100)
kline_test['log_return_100x'] = np.multiply(kline_test['log_return'], 100)  # scaled x100

# Hourly realized variance = sum of squared 1-minute log returns within the hour.
kline_test['realized_variance_1_hour'] = kline_1m.loc[:, 'squared_log_return'].resample('h', closed='left', label='left').sum().copy()
kline_test['realized_volatility_1_hour'] = np.sqrt(kline_test['realized_variance_1_hour'])  # volatility = sqrt(variance)

kline_test = kline_test[4:-2500]  # trim warm-up rows and the far tail
kline_test.head(3)
                             close  log_price    return  log_return  \
date
2020-03-01 12:00:00+08:00  8579.52   9.057133 -0.000518   -0.004694
2020-03-01 13:00:00+08:00  8586.87   9.057990  0.000095    0.000856
2020-03-01 14:00:00+08:00  8541.21   9.052658 -0.000589   -0.005332

squared_log_return  return_100x  log_return_100x  \
date
2020-03-01 12:00:00+08:00        2.203698e-05    -0.051804        -0.469436
2020-03-01 13:00:00+08:00        7.332917e-07     0.009455         0.085632
2020-03-01 14:00:00+08:00        2.842605e-05    -0.058861        -0.533161

realized_variance_1_hour  \
date
2020-03-01 12:00:00+08:00                  0.000030
2020-03-01 13:00:00+08:00                  0.000025
2020-03-01 14:00:00+08:00                  0.000029

realized_volatility_1_hour
date
2020-03-01 12:00:00+08:00                    0.005503
2020-03-01 13:00:00+08:00                    0.004993
2020-03-01 14:00:00+08:00                    0.005347  
# Shared settings and per-model training frames (all the same log-return series).
cc = 3
model_p = 1
predict_lag = 30  # rolling-forecast window used by recursive_forecast below
label = 'log_return'

training_label = label
training_ts = pd.DataFrame(kline_test[training_label], dtype=np.float)

training_arch_label = label
training_arch = pd.DataFrame(kline_test[training_arch_label], dtype=np.float)

training_garch_label = label
training_garch = pd.DataFrame(kline_test[training_garch_label], dtype=np.float)

training_egarch_label = label
training_egarch = pd.DataFrame(kline_test[training_egarch_label], dtype=np.float)

training_arch.plot(figsize = (18,4))
<matplotlib.axes._subplots.AxesSubplot at 0x7f5246829208>
<Figure size 1296x288 with 1 Axes>
对数收益率如上所示，接下来先需要检验样本的ARCH效应，基于ARMA建立样本内的残差序列，部分序列以及残差和残差的平方序列先算出来：

# Fit the MA(1) mean model and build residual / squared-residual columns.
training_arma_model = smt.SARIMAX(endog=training_ts, trend='c', order=(0, 0, 1), seasonal_order=(0, 0, 0, 0))
arma_model_results = training_arma_model.fit(disp=False)
arma_model_results.summary()
training_arma_fitvalue = pd.DataFrame(arma_model_results.fittedvalues,dtype=np.float)
at = pd.merge(training_ts, training_arma_fitvalue, on='date')
at.columns = ['log_return', 'model_fit']
at['res'] = at['log_return'] - at['model_fit']  # residual
at['res2'] = np.square(at['res'])  # squared residual
at.head()
                           log_return  model_fit       res      res2
date
2020-02-13 13:00:00+08:00    0.003926  -0.000820  0.004747  0.000023
2020-02-13 14:00:00+08:00    0.001053  -0.001300  0.002353  0.000006
2020-02-13 15:00:00+08:00   -0.010148  -0.001060 -0.009088  0.000083
2020-02-13 16:00:00+08:00   -0.007774   0.000107 -0.007881  0.000062
2020-02-13 17:00:00+08:00   -0.009289  -0.000017 -0.009273  0.000086
然后绘制出样本的残差序列图

# Plot the residual and squared-residual series.
# Bug fix: the original created a single-axes figure but plotted to the
# undefined names `ax1`/`ax2`; create the two axes explicitly.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18, 6))
at['res'][1:].plot(ax=ax1,label='resid')
ax1.legend(loc='best')

at['res2'][1:].plot(ax=ax2,label='resid2')
ax2.legend(loc='best')
plt.tight_layout()
sns.despine()
可以看到，残差序列有明显的聚集特性，可以初步判断该序列具备ARCH效应，同样采取ACF来检验残差平方的自相关性，结果如下。

# ACF of the squared residuals (ARCH-effect check).
# Bug fix: `ax1` was undefined in the original; create the axes on the figure.
figure = plt.figure(figsize=(18,4))
ax1 = figure.add_subplot(111)
fig = sm.graphics.tsa.plot_acf(at['res2'],lags = 40, ax=ax1)
对序列进行混成检验的原假设是序列没有相关性，可以看到前20阶的数据对应P值均小于0.05%置信水平临界值，所以拒绝原假设，即序列的残差具备ARCH效应，可以通过ARCH类模型进行方差模型的建立，用于拟合残差序列的异方差特性，并进一步预测波动率。
6、GARCH建模


# AR(3) mean + GARCH(1,1) volatility with GED-distributed errors.
# NOTE(review): ftol=1e-01 is a very loose convergence tolerance — the log
# below shows the optimizer stopping after one iteration; confirm intended.
am_GARCH = arch_model(training_garch, mean='AR', vol='GARCH',
p=1, q=1, lags=3, dist='ged')
res_GARCH = am_GARCH.fit(disp=False, options={'ftol': 1e-01})
res_GARCH.summary()
Iteration:      1,   Func. Count:     10,   Neg. LLF: -1917.4262154917305

<class 'statsmodels.iolib.summary.Summary'>
"""
AR - GARCH Model Results
==========================================================================================
Dep. Variable:                         log_return   R-squared:                       0.008
Mean Model:                                    AR   Adj. R-squared:                  0.003
Vol Model:                                  GARCH   Log-Likelihood:                1919.74
Distribution:      Generalized Error Distribution   AIC:                          -3823.47
Method:                        Maximum Likelihood   BIC:                          -3787.78
No. Observations:                  640
Date:                            Fri, Jul 10 2020   Df Residuals:                      632
Time:                                    02:12:46   Df Model:                            8
Mean Model
=================================================================================
coef    std err          t      P>|t|       95.0% Conf. Int.
---------------------------------------------------------------------------------
Const          3.1597e-04  3.114e-04      1.015      0.310 [-2.943e-04,9.262e-04]
log_return[1]     -0.0948  5.384e-02     -1.760  7.835e-02    [ -0.200,1.075e-02]
log_return[2]     -0.0250  4.659e-02     -0.537      0.591    [ -0.116,6.629e-02]
log_return[3] -6.9184e-03  3.512e-02     -0.197      0.844 [-7.575e-02,6.192e-02]
Volatility Model
============================================================================
coef    std err          t      P>|t|      95.0% Conf. Int.
----------------------------------------------------------------------------
omega      7.9633e-06  6.395e-11  1.245e+05      0.000 [7.963e-06,7.963e-06]
alpha[1]       0.1000  5.421e-02      1.845  6.508e-02  [-6.246e-03,  0.206]
beta[1]        0.8800  1.727e-02     50.945      0.000     [  0.846,  0.914]
Distribution
========================================================================
coef    std err          t      P>|t|  95.0% Conf. Int.
------------------------------------------------------------------------
nu             1.5000  1.184e-03   1267.280      0.000 [  1.498,  1.502]
========================================================================

Covariance estimator: robust
"""
根据ARCH库对GARCH波动率方程描述：
$$\sigma_{t}^{2}=\omega_{0}+\sum_{i=1}^{p} a_{1} \epsilon_{t-i}^{2}+\sum_{k=1}^{q} \beta_{1} \sigma_{t-k}^{2}$$

$$\sigma_{t}^{2}=0.0000079633+0.1 \epsilon_{t-1}^{2}+0.88 \sigma_{t-1}^{2}$$


def recursive_forecast(pd_dataframe):
    """Rolling one-step-ahead GARCH(1,1) volatility forecast.

    Repeatedly re-fits an AR-mean GARCH(1,1) model with GED errors on an
    expanding window, collects each h=1 variance forecast, stores them in
    a new 'recursive_GARCH' column of *pd_dataframe*, and prints
    MAE/MAPE/RMSE against the realized 1-hour volatility via evaluate().

    pd_dataframe: DataFrame with a 'log_return' column and a
    'realized_volatility_1_hour' column (mutated in place).
    """
    window = predict_lag  # observations held out for recursive forecasting
    model = 'GARCH'
    # BUG FIX: the original body indexed the global kline_test instead of
    # the pd_dataframe argument; it only worked because the caller happened
    # to pass kline_test. Use the parameter consistently.
    returns = pd_dataframe['log_return'][1:]
    index = pd_dataframe[1:].index
    end_loc = np.where(index >= pd_dataframe.index[window])[0].min()
    forecasts = {}
    for i in range(len(pd_dataframe[1:]) - window + 2):
        mod = arch_model(returns, mean='AR', vol=model, dist='ged', p=1, q=1)
        # NOTE(review): 'ftol': 1e03 looks like a typo for 1e-03, but it is
        # kept as-is to reproduce the article's published numbers (a huge
        # ftol makes the optimizer stop early, trading accuracy for speed).
        res = mod.fit(last_obs=i + end_loc, disp='off', options={'ftol': 1e03})
        temp = res.forecast().variance
        fcast = temp.iloc[i + end_loc - 1]
        forecasts[fcast.name] = fcast

    forecasts = pd.DataFrame(forecasts).T
    pd_dataframe['recursive_{}'.format(model)] = forecasts['h.1']
    evaluate(pd_dataframe, 'realized_volatility_1_hour', 'recursive_{}'.format(model))

recursive_forecast(kline_test)
Mean Absolute Error (MAE): 0.0128
Mean Absolute Percentage Error (MAPE): 95.6
Root Mean Square Error (RMSE): 0.018

<Figure size 1296x360 with 1 Axes>
为了比较，顺手做个ARCH的，如下：

def recursive_forecast(pd_dataframe):
    """Rolling one-step-ahead ARCH(1) volatility forecast.

    Same recursive scheme as the GARCH variant: re-fit an AR-mean ARCH(1)
    model (GED errors) on an expanding window, store the h=1 variance
    forecasts in a new 'recursive_ARCH' column of *pd_dataframe*, and
    print MAE/MAPE/RMSE against realized 1-hour volatility via evaluate().
    """
    window = predict_lag  # observations held out for recursive forecasting
    model = 'ARCH'
    # BUG FIX: index the pd_dataframe argument, not the global kline_test
    # (the original only worked because the caller passed kline_test).
    returns = pd_dataframe['log_return'][1:]
    index = pd_dataframe[1:].index
    end_loc = np.where(index >= pd_dataframe.index[window])[0].min()
    forecasts = {}
    for i in range(len(pd_dataframe[1:]) - window + 2):
        mod = arch_model(returns, mean='AR', vol=model, dist='ged', p=1)
        # NOTE(review): 'ftol': 1e03 looks like a typo for 1e-03; kept as-is
        # so the published error metrics remain reproducible.
        res = mod.fit(last_obs=i + end_loc, disp='off', options={'ftol': 1e03})
        temp = res.forecast().variance
        fcast = temp.iloc[i + end_loc - 1]
        forecasts[fcast.name] = fcast

    forecasts = pd.DataFrame(forecasts).T
    pd_dataframe['recursive_{}'.format(model)] = forecasts['h.1']
    evaluate(pd_dataframe, 'realized_volatility_1_hour', 'recursive_{}'.format(model))

recursive_forecast(kline_test)
Mean Absolute Error (MAE): 0.0136
Mean Absolute Percentage Error (MAPE): 98.1
Root Mean Square Error (RMSE): 0.02

<Figure size 1296x360 with 1 Axes>
7、EGARCH建模


# Fit an AR(3) mean / EGARCH(1,1,1) volatility model with GED innovations
# on the training sample, then display the fit summary.
am_EGARCH = arch_model(
    training_egarch,
    mean='AR',
    lags=3,
    vol='EGARCH',
    p=1,
    o=1,
    q=1,
    dist='ged',
)
res_EGARCH = am_EGARCH.fit(disp=False, options={'ftol': 1e-01})
res_EGARCH.summary()
Iteration:      1,   Func. Count:     11,   Neg. LLF: -1966.610328148909

<class 'statsmodels.iolib.summary.Summary'>
"""
AR - EGARCH Model Results
==========================================================================================
Dep. Variable:                         log_return   R-squared:                       0.009
Mean Model:                                    AR   Adj. R-squared:                  0.004
Vol Model:                                 EGARCH   Log-Likelihood:                1967.53
Distribution:      Generalized Error Distribution   AIC:                          -3917.07
Method:                        Maximum Likelihood   BIC:                          -3876.91
No. Observations:                  640
Date:                            Fri, Jul 10 2020   Df Residuals:                      631
Time:                                    02:12:54   Df Model:                            9
Mean Model
=================================================================================
coef    std err          t      P>|t|       95.0% Conf. Int.
---------------------------------------------------------------------------------
Const         -7.6687e-05  3.108e-04     -0.247      0.805 [-6.858e-04,5.324e-04]
log_return[1]     -0.0948  6.888e-02     -1.376      0.169    [ -0.230,4.022e-02]
log_return[2]     -0.0250  7.064e-02     -0.354      0.723      [ -0.163,  0.113]
log_return[3] -6.9185e-03  5.236e-02     -0.132      0.895    [ -0.110,9.571e-02]
Volatility Model
==========================================================================
coef    std err          t      P>|t|    95.0% Conf. Int.
--------------------------------------------------------------------------
omega         -0.1566      0.341     -0.459      0.646   [ -0.825,  0.512]
alpha[1]       0.2000      0.266      0.752      0.452   [ -0.321,  0.721]
gamma[1]      -0.1000  8.486e-02     -1.178      0.239 [ -0.266,6.631e-02]
beta[1]        0.9800  4.218e-02     23.232 2.175e-119   [  0.897,  1.063]
Distribution
========================================================================
coef    std err          t      P>|t|  95.0% Conf. Int.
------------------------------------------------------------------------
nu             1.5000      0.353      4.244  2.200e-05 [  0.807,  2.193]
========================================================================

Covariance estimator: robust
"""
ARCH库提供的EGARCH波动率方程描述如下：
$$\ln \sigma_{t}^{2}=\omega+\sum_{i=1}^{p} \alpha_{i}\left(\left|e_{t-i}\right|-\sqrt{2 / \pi}\right)+\sum_{j=1}^{o} \gamma_{j} e_{t-j}+\sum_{k=1}^{q} \beta_{k} \ln \sigma_{t-k}^{2}$$

$$\ln \sigma_{t}^{2}=-0.159799+0.100011\left(\left|\epsilon_{t-1} / \sigma_{t-1}\right|-\sqrt{2 / \pi}\right)-0.099991 \,\epsilon_{t-1} / \sigma_{t-1}+0.98 \ln \sigma_{t-1}^{2}$$


def recursive_forecast(pd_dataframe, window=280):
    """Rolling one-step-ahead EGARCH volatility forecast.

    Re-fits an AR(3)-mean EGARCH(2,0,1) model with GED errors on an
    expanding window, stores the h=1 variance forecasts in a new
    'recursive_EGARCH' column of *pd_dataframe*, prints MAE/MAPE/RMSE via
    evaluate(), and returns the forecast column.

    window: number of observations held out for recursive forecasting
    (parameterized; defaults to the original hard-coded 280).
    """
    model = 'EGARCH'
    # BUG FIX: index the pd_dataframe argument, not the global kline_test
    # (the original only worked because the caller passed kline_test).
    returns = pd_dataframe['log_return'][1:]
    index = pd_dataframe[1:].index
    end_loc = np.where(index >= pd_dataframe.index[window])[0].min()
    forecasts = {}
    for i in range(len(pd_dataframe[1:]) - window + 2):
        mod = arch_model(returns, mean='AR', vol=model,
                         lags=3, p=2, o=0, q=1, dist='ged')
        # NOTE(review): 'ftol': 1e03 looks like a typo for 1e-03; kept as-is
        # so the published error metrics remain reproducible.
        res = mod.fit(last_obs=i + end_loc, disp='off', options={'ftol': 1e03})
        temp = res.forecast().variance
        fcast = temp.iloc[i + end_loc - 1]
        forecasts[fcast.name] = fcast

    forecasts = pd.DataFrame(forecasts).T
    pd_dataframe['recursive_{}'.format(model)] = forecasts['h.1']
    evaluate(pd_dataframe, 'realized_volatility_1_hour', 'recursive_{}'.format(model))
    # The original ended with a bare (no-op) expression; return it instead.
    return pd_dataframe['recursive_{}'.format(model)]

recursive_forecast(kline_test)
Mean Absolute Error (MAE): 0.0201
Mean Absolute Percentage Error (MAPE): 122
Root Mean Square Error (RMSE): 0.0279

<Figure size 1296x360 with 1 Axes>
可以看到，EGARCH相比较ARCH和GARCH对波动更敏感，对波动率的拟合更出色。
8、波动率预测的评价


# Collect realized volatility and the three recursive forecasts side by
# side, plus each model's signed error (realized minus |forecast|).
compare_ARCH_X = pd.DataFrame()
compare_ARCH_X['original'] = kline_test['realized_volatility_1_hour']

# Same column order as before: arch, arch_diff, garch, garch_diff, ...
for model_name in ['ARCH', 'GARCH', 'EGARCH']:
    col = model_name.lower()
    compare_ARCH_X[col] = kline_test['recursive_{}'.format(model_name)]
    compare_ARCH_X['{}_diff'.format(col)] = (
        compare_ARCH_X['original'] - np.abs(compare_ARCH_X[col])
    )

# Drop the in-sample portion; keep only the forecasting window.
compare_ARCH_X = compare_ARCH_X[280:]
compare_ARCH_X.head(10)
                           original      arch  arch_diff     garch  \
date
2020-03-13 04:00:00+08:00  0.039796  0.000410   0.039386  0.019046
2020-03-13 05:00:00+08:00  0.037765  0.000173   0.037591  0.003011
2020-03-13 06:00:00+08:00  0.034429  0.000156   0.034273  0.002956
2020-03-13 07:00:00+08:00  0.133740  0.004234   0.129506  0.034791
2020-03-13 08:00:00+08:00  0.070220  0.000394   0.069825  0.190576
2020-03-13 09:00:00+08:00  0.064249  0.006472   0.057777  0.066352
2020-03-13 10:00:00+08:00  0.177593  0.005037   0.172556  0.102359
2020-03-13 11:00:00+08:00  0.081091  0.001535   0.079556  0.022981
2020-03-13 12:00:00+08:00  0.059117  0.005636   0.053481  0.008225
2020-03-13 13:00:00+08:00  0.046963  0.000966   0.045997  0.047651

garch_diff    egarch  egarch_diff
date
2020-03-13 04:00:00+08:00    0.020750  0.000354     0.039442
2020-03-13 05:00:00+08:00    0.034754  0.000401     0.037364
2020-03-13 06:00:00+08:00    0.031473  0.000340     0.034089
2020-03-13 07:00:00+08:00    0.098949  0.091072     0.042668
2020-03-13 08:00:00+08:00   -0.120357  0.069111     0.001109
2020-03-13 09:00:00+08:00   -0.002103  0.011460     0.052789
2020-03-13 10:00:00+08:00    0.075234  0.130532     0.047061
2020-03-13 11:00:00+08:00    0.058110  0.074742     0.006349
2020-03-13 12:00:00+08:00    0.050892  0.212941    -0.153824
2020-03-13 13:00:00+08:00   -0.000689  0.176024    -0.129062  
# Per-model error at selected horizons (only the 1-step cell is filled here).
compare_ARCH_X_diff = pd.DataFrame(index=['ARCH', 'GARCH', 'EGARCH'],
                                   columns=['head 1 step', 'head 10 steps', 'head 100 steps'])
# BUG FIX: chained indexing (df[col][row] = ...) assigns to a temporary
# copy and triggers pandas' SettingWithCopyWarning; use .loc instead.
first_ts = '2020-03-13 04:00:00+08:00'
compare_ARCH_X_diff.loc['ARCH', 'head 1 step'] = compare_ARCH_X['arch_diff'][first_ts]
compare_ARCH_X_diff.loc['GARCH', 'head 1 step'] = compare_ARCH_X['garch_diff'][first_ts]
compare_ARCH_X_diff.loc['EGARCH', 'head 1 step'] = compare_ARCH_X['egarch_diff'][first_ts]
compare_ARCH_X_diff
       head 1 step head 10 steps head 100 steps
ARCH     0.0393856     0.0719949       0.028944
GARCH      0.02075     0.0247013      0.0218469
EGARCH   0.0394424   -0.00220146     0.00703015
做了几次试验，在前1个小时预测结果里，EGARCH的误差最小的几率比较大，但整体差异不是特别明显；短期预测效果里有一定明显区别；在长期预测里， EGARCH的预测能力最突出

# Summary metrics over the first 40 forecast steps for each model.
compare_ARCH_X = pd.DataFrame()
compare_ARCH_X['Model'] = ['ARCH', 'GARCH', 'EGARCH']

# Hoist the repeated slices so each metric is computed from the same data.
actual = kline_test['realized_volatility_1_hour'][280:320]
preds = [kline_test['recursive_{}'.format(m)][280:320]
         for m in ['ARCH', 'GARCH', 'EGARCH']]

compare_ARCH_X['RMSE'] = [get_rmse(actual, p) for p in preds]
compare_ARCH_X['MAPE'] = [get_mape(actual, p) for p in preds]
# BUG FIX: the MASE column previously called get_mape again (the printed
# table shows MASE identical to MAPE); use get_mase as intended.
compare_ARCH_X['MASE'] = [get_mase(actual, p) for p in preds]

compare_ARCH_X
    Model      RMSE       MAPE       MASE
0    ARCH  0.051855  97.772640  97.772640
1   GARCH  0.041761  82.338067  82.338067
2  EGARCH  0.044885  82.598753  82.598753
从指标上看，GARCH和EGARCH相对ARCH来说有一定提升，但差距也不是特别明显。在多样本区间选择和验证后，EGARCH的性能会更优一些，这也是主要因为EGARCH很好地解释了样本所呈现出的异方差特性。
9、结论



More

syue 妹子是不是娶了你就可以不用学量化了 啊哈哈啊

kevin1111 66666666666666

YingHN 厉害了

dsaidasi 好专业的分析。。。