本文共 5265 字,大约阅读时间需要 17 分钟。
R语言选模型/用AIC BIC adjustRsq 十折交叉验证 LOOCV等验证/择参 以fama三因子模型和CAMP模型为例@
install.packages("leaps") install.packages("car") install.packages("caret") library("car")library("leaps") library("heavy") library("caret")mt=read.csv("XIXI-Data.csv")mt$X=as.Date(mt$X,"%Y/%m/%d") #修改日期格式并赋值给X列ff=read.csv("F-F_Research_Data_Factors_daily.csv",header=T,skip=4+ )#引入三因子信息dat=merge(mt,ff,by= "X") #和另一个数据集按照日期进行拼接,如果本来就是一整个数据集,这两部可以忽略attach(dat)#方便后面写语句
fama三因子模型的公式是
下面是基本介绍: Here, Rt, RM,t and µt are the net returns of the asset, the market portfolio and the risk-free asset from t−1 to t, respectively. The market excess return is the first risk factor, while the second and the third ones are SMB (small minus big) and HML (high minus low). To be more specific, SMB is the difference in returns on a portfolio of small stocks and a portfolio of large stocks. “Small” and “big” refer to the size of the market value of a stock. HML is the difference in returns on a portfolio of high book-to-market value stocks and a portfolio of low book-tomarket value stocks. SMBt and HMLt are the differences in returns in the t-th period given by (t−1,t].> n=dim(dat)[1] #计算一列有多少数> R_g=Adj.Close[-1]/Adj.Close[-n] #后一项除以前一项算回报率> R_n=R_g-1 #净回报率> lef_term=(R_n-RF[-1]) #三因子模型的左边> reg1=lm(lef_term~Mkt.RF[-1]+SMB[-1]+HML[-1])#回归> vif(reg1)#计算vif > #下面跟一段VIF的解释> # The VIF for Mkt.RF is 1.056693. It means its squared standard error is 1.056693 times larger than it would be if the other predictors are deleted or were not correlated with it. The same as the other 2 elements. The VIF of these 3 elements proved that they estimate their coefficient doesn’t have large errors which means there is not a collinearity problem.> summary(reg1)Call:lm(formula = lef_term ~ Mkt.RF[-1] + SMB[-1] + HML[-1])Residuals: Min 1Q Median 3Q Max -0.049696 -0.005057 0.000259 0.005244 0.042138 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.0003031 0.0003389 0.894 0.371 Mkt.RF[-1] 0.7387009 0.0417026 17.714 < 2e-16 ***SMB[-1] -0.4195117 0.0697805 -6.012 2.87e-09 ***HML[-1] 0.0645699 0.0617378 1.046 0.296 ---Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1Residual standard error: 0.009249 on 748 degrees of freedomMultiple R-squared: 0.3052, Adjusted R-squared: 0.3024 F-statistic: 109.5 on 3 and 748 DF, p-value: < 2.2e-16
> CAPM_reg1=lm(lef_term~Mkt.RF[-1])> summary(CAPM_reg1)Call:lm(formula = lef_term ~ Mkt.RF[-1])Residuals: Min 1Q Median 3Q Max -0.049022 -0.004929 0.000278 0.005352 0.045007 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.0003638 0.0003458 1.052 0.293 Mkt.RF[-1] 0.6924356 0.0415120 16.680 <2e-16 ***---Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1Residual standard error: 0.009464 on 750 degrees of freedomMultiple R-squared: 0.2706, Adjusted R-squared: 0.2696 F-statistic: 278.2 on 1 and 750 DF, p-value: < 2.2e-16> BIC(reg1)[1] -4880.385> BIC(CAPM_reg1)[1] -4857.045#BIC越小越好,summary里有调整的R方信息,R方越接近1越好#十折交叉验证> reg_dif =data.frame(lef_term,Mkt.RF[-1],SMB[-1],HML[-1])> CAPM_dif=data.frame(lef_term,Mkt.RF[-1])> train.control=trainControl(method="cv",number=10)> set.seed(1)> reg_cv=train(lef_term~.,data=reg_dif,method="lm", trControl=train.control)> set.seed(1)> CAPM_reg_cv=train(lef_term~., data = CAPM_dif,method="lm", trControl=train.control)> TestMSE_FF=as.numeric(reg_cv$results[2])^2> TestMSE_CAPM=as.numeric(CAPM_reg_cv$results[2])^2> TestMSE_FF[1] 8.486705e-05> TestMSE_CAPM[1] 8.867714e-05#test error 越小越好#LOOCV交叉验证> train.control=trainControl(method="LOOCV")> set.seed(1)> reg_cv2=train(lef_term~.,data=reg_dif,method="lm", trControl=train.control) > set.seed(1)> CAPM_reg_cv2=train(lef_term~., data = CAPM_dif,method="lm", trControl=train.control)> TestMSE_FF2=as.numeric(reg_cv2$results[2])^2> TestMSE_CAPM2=as.numeric(CAPM_reg_cv2$results[2])^2> TestMSE_FF2[1] 8.617911e-05> TestMSE_CAPM2[1] 8.984655e-05#test error 越小越好#上面都证明了要选fama三因子,拟合好
> regsub=regsubsets(lef_term~.,data=reg_dif) #择参函数> regsub_sum=summary(regsub)> regsub_sum$which #可以看出穷举的留多少参数时选哪个参 (Intercept) Mkt.RF..1. SMB..1. HML..1.1 TRUE TRUE FALSE FALSE2 TRUE TRUE TRUE FALSE3 TRUE TRUE TRUE TRUE> plot(regsub_sum$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type = "l")#看参数数量如何影响调节R方> plot(regsub_sum$cp,xlab="Number of Variables",ylab="Cp",type = "l")> plot(regsub_sum$bic,xlab="Number of Variables",ylab="BIC",type = "l")> #十折交叉验证和LOOCV验证> train.control=trainControl(method="cv",number=10)> set.seed(1)> dif1=data.frame(lef_term,Mkt.RF[-1])> sub_reg_cv1=train(lef_term~., data = dif1,method="lm", trControl=train.control)> M1=as.numeric(sub_reg_cv1$results[2])^2> set.seed(1)> dif2=data.frame(lef_term,Mkt.RF[-1],SMB[-1])> sub_reg_cv2=train(lef_term~., data = dif2,method="lm", trControl=train.control)> M2=as.numeric(sub_reg_cv2$results[2])^2> set.seed(1)> dif3=data.frame(lef_term,Mkt.RF[-1],SMB[-1],HML[-1])> sub_reg_cv3=train(lef_term~., data = dif3,method="lm", trControl=train.control)> M3=as.numeric(sub_reg_cv3$results[2])^2> M1[1] 8.867714e-05> M2[1] 8.451847e-05> M3[1] 8.486705e-05
转载地址:http://ypoen.baihongyu.com/