library(tidyverse)
library(caret)
library(MASS)
library(glmnet)
source("./fonctionsTP2.R")
claims = read_csv("../Data Claims/freMTPL2freq.csv",progress = FALSE)
Parsed with column specification:
cols(
IDpol = col_double(),
ClaimNb = col_double(),
Exposure = col_double(),
Area = col_character(),
VehPower = col_double(),
VehAge = col_double(),
DrivAge = col_double(),
BonusMalus = col_double(),
VehBrand = col_character(),
VehGas = col_character(),
Density = col_double(),
Region = col_character()
)
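To avoid re-guessing column types on each run, the specification above can be passed back to read_csv explicitly, which also silences the parsing message (a sketch, with the spec copied from the message above):
claims = read_csv("../Data Claims/freMTPL2freq.csv", progress = FALSE,
                  col_types = cols(IDpol = col_double(), ClaimNb = col_double(),
                                   Exposure = col_double(), Area = col_character(),
                                   VehPower = col_double(), VehAge = col_double(),
                                   DrivAge = col_double(), BonusMalus = col_double(),
                                   VehBrand = col_character(), VehGas = col_character(),
                                   Density = col_double(), Region = col_character()))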
glimpse(claims)
Observations: 678,013
Variables: 12
$ IDpol <dbl> 1, 3, 5, 10, 11, 13, 15, 17, 18, 21, 25, 27, 30, 32, 35, 36, 38, 42, 44, 45, 47, 49, 50, 52, 53...
$ ClaimNb <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,...
$ Exposure <dbl> 0.10, 0.77, 0.75, 0.09, 0.84, 0.52, 0.45, 0.27, 0.71, 0.15, 0.75, 0.87, 0.81, 0.05, 0.76, 0.34,...
$ Area <chr> "'D'", "'D'", "'B'", "'B'", "'B'", "'E'", "'E'", "'C'", "'C'", "'B'", "'B'", "'C'", "'D'", "'D'...
$ VehPower <dbl> 5, 5, 6, 7, 7, 6, 6, 7, 7, 7, 7, 7, 4, 4, 4, 9, 6, 6, 6, 6, 6, 7, 7, 6, 5, 5, 5, 5, 5, 5, 5, 15...
$ VehAge <dbl> 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 9, 0, 2, 2, 2, 2, 2, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,...
$ DrivAge <dbl> 55, 55, 52, 46, 46, 38, 38, 33, 33, 41, 41, 56, 27, 27, 23, 44, 32, 32, 55, 55, 55, 73, 73, 27,...
$ BonusMalus <dbl> 50, 50, 50, 50, 50, 50, 50, 68, 68, 50, 50, 50, 90, 90, 100, 76, 56, 56, 50, 50, 50, 50, 50, 76...
$ VehBrand <chr> "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12...
$ VehGas <chr> "'Regular'", "'Regular'", "'Diesel'", "'Diesel'", "'Diesel'", "'Regular'", "'Regular'", "'Diese...
$ Density <dbl> 1217, 1217, 54, 76, 76, 3003, 3003, 137, 137, 60, 60, 173, 695, 695, 7887, 27000, 23, 23, 37, 3...
$ Region <chr> "'R82'", "'R82'", "'R22'", "'R72'", "'R72'", "'R31'", "'R31'", "'R91'", "'R91'", "'R52'", "'R52...
For now we leave the VehPower variable unchanged.
claims = claims %>%
  mutate(VehBrand = factor(VehBrand),
         VehGas = factor(VehGas),
         Region = factor(Region),
         Frequency = ClaimNb / Exposure)
set.seed(123)
nombre_small = 100000
smallIndex = sample(claims$IDpol, size = nombre_small)
claims_small = claims %>% filter(IDpol %in% smallIndex)
rm(claims)
Missing values
claims_small %>% summarise(across(everything(), ~ sum(is.na(.x))))
Distribution of ClaimNb
ggplot(claims_small,aes(ClaimNb)) + geom_bar() + scale_y_log10()
Distribution of Exposure
ggplot(claims_small, aes(Exposure, after_stat(density))) + geom_histogram()
Distribution of Frequency
ggplot(claims_small,aes(Frequency)) + geom_histogram() + scale_y_log10()
print(paste0("Proportion d'observations sans claim= ",round(mean(claims_small$ClaimNb==0),digits = 4)))
[1] "Proportion d'observations sans claim= 0.9496"
print(paste0("Moyenne du nombre de claim= ",round(mean(claims_small$ClaimNb,na.rm = T),digits = 4)))
[1] "Moyenne du nombre de claim= 0.0538"
print(paste(c("Probabilite pour une loi de Poisson de moyenne ",
round(mean(claims_small$ClaimNb,na.rm = T),digits = 3),
" d'observer 0 est ",
round(exp(-mean(claims_small$ClaimNb,na.rm = T)),digits =3)),
collapse = ""))
[1] "Probabilite pour une loi de Poisson de moyenne 0.054 d'observer 0 est 0.948"
Relationship between Frequency and the other variables
ggplot(claims_small,aes(Area,Frequency)) + geom_boxplot() + scale_y_log10()
ggplot(claims_small,aes(VehPower,Frequency)) + geom_point() + scale_y_log10()
ggplot(claims_small,aes(VehAge,Frequency)) + geom_point() + scale_y_log10()
ggplot(claims_small,aes(DrivAge,Frequency)) + geom_point() + scale_y_log10()
ggplot(claims_small,aes(BonusMalus,Frequency)) + geom_point() + scale_y_log10()
ggplot(claims_small,aes(VehBrand,Frequency)) + geom_boxplot() + scale_y_log10()
ggplot(claims_small,aes(VehGas,Frequency)) + geom_boxplot() + scale_y_log10()
ggplot(claims_small,aes(Density,Frequency)) + geom_point() + scale_y_log10()
ggplot(claims_small,aes(Density,Frequency)) + geom_point() + scale_y_log10() + scale_x_log10()
claims_small = claims_small %>% mutate(LogDensity = log(Density))
ggplot(claims_small,aes(Region,Frequency)) + geom_boxplot() + scale_y_log10()
We take the log of the Exposure variable.
claims_small = claims_small %>% mutate(LogExposure = log(Exposure))
claims_small = claims_small %>% mutate(Claim_binaire = ClaimNb>=1)
set.seed(432)
trainIndex = createDataPartition(claims_small$IDpol, p = 0.8, list = FALSE)
length(trainIndex)
[1] 80000
claims_small_train = claims_small[trainIndex,]
claims_small_test = claims_small[-trainIndex,]
glimpse(claims_small_train)
Observations: 80,000
Variables: 16
$ IDpol <dbl> 59, 67, 68, 78, 92, 108, 121, 145, 167, 209, 212, 246, 256, 286, 295, 299, 307, 311, 325, 33...
$ ClaimNb <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,...
$ Exposure <dbl> 0.79, 0.80, 0.07, 0.16, 0.41, 0.02, 0.05, 0.81, 0.77, 0.34, 0.14, 0.08, 0.85, 0.75, 0.04, 0....
$ Area <chr> "'C'", "'D'", "'D'", "'A'", "'B'", "'C'", "'C'", "'A'", "'E'", "'D'", "'E'", "'A'", "'E'", "...
$ VehPower <dbl> 5, 5, 5, 6, 4, 9, 6, 6, 4, 4, 10, 7, 6, 5, 5, 4, 6, 6, 5, 4, 4, 7, 9, 9, 9, 5, 4, 7, 10, 11,...
$ VehAge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 4, 0, 3, 6, 0, 0, 0, 0, 0, 0, 1, 9, 8, 0, 0, 0, 7, 4, 10...
$ DrivAge <dbl> 59, 69, 69, 60, 30, 45, 37, 62, 55, 48, 32, 38, 35, 41, 29, 26, 50, 58, 35, 52, 29, 32, 46, ...
$ BonusMalus <dbl> 50, 52, 52, 51, 80, 50, 55, 90, 50, 50, 50, 60, 68, 90, 60, 100, 50, 50, 50, 90, 76, 58, 50,...
$ VehBrand <fct> 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B5', 'B...
$ VehGas <fct> 'Regular', 'Regular', 'Regular', 'Diesel', 'Regular', 'Regular', 'Diesel', 'Regular', 'Regul...
$ Density <dbl> 455, 1376, 1376, 12, 77, 120, 303, 49, 2715, 1622, 2757, 27, 9307, 27000, 63, 2308, 8880, 23...
$ Region <fct> 'R91', 'R11', 'R11', 'R83', 'R93', 'R83', 'R11', 'R53', 'R93', 'R91', 'R22', 'R11', 'R82', '...
$ Frequency <dbl> 1.265823, 1.250000, 14.285714, 6.250000, 2.439024, 50.000000, 20.000000, 1.234568, 1.298701,...
$ LogDensity <dbl> 6.120297, 7.226936, 7.226936, 2.484907, 4.343805, 4.787492, 5.713733, 3.891820, 7.906547, 7....
$ LogExposure <dbl> -0.2357223, -0.2231436, -2.6592600, -1.8325815, -0.8915981, -3.9120230, -2.9957323, -0.21072...
$ Claim_binaire <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR...
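We first fit a Poisson GLM with log(Exposure) as an offset, i.e. log E[ClaimNb] = log(Exposure) + X*beta: the expected claim count is assumed proportional to the exposure, the coefficient of log(Exposure) being fixed at 1.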
model_full_offset = glm(ClaimNb ~ offset(LogExposure) + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                        + VehGas + LogDensity, data = claims_small_train, family = poisson())
summary(model_full_offset)
Call:
glm(formula = ClaimNb ~ offset(LogExposure) + Area + VehPower +
VehAge + DrivAge + BonusMalus + VehBrand + VehGas + LogDensity,
family = poisson(), data = claims_small_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.2187 -0.3888 -0.2974 -0.1626 13.3317
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.9446153 0.1625753 -24.263 < 2e-16 ***
Area'B' 0.0165116 0.0738016 0.224 0.822968
Area'C' 0.1205516 0.0946131 1.274 0.202609
Area'D' 0.2774197 0.1433999 1.935 0.053041 .
Area'E' 0.3271010 0.1907226 1.715 0.086334 .
Area'F' 0.3693692 0.2609347 1.416 0.156904
VehPower 0.0318559 0.0078226 4.072 4.66e-05 ***
VehAge -0.0381663 0.0033650 -11.342 < 2e-16 ***
DrivAge 0.0051649 0.0011433 4.518 6.25e-06 ***
BonusMalus 0.0226753 0.0008881 25.532 < 2e-16 ***
VehBrand'B10' 0.1029516 0.0995720 1.034 0.301163
VehBrand'B11' -0.1712715 0.1248635 -1.372 0.170166
VehBrand'B12' 0.1649006 0.0481364 3.426 0.000613 ***
VehBrand'B13' -0.0106295 0.1210972 -0.088 0.930054
VehBrand'B14' -0.5372948 0.2692375 -1.996 0.045976 *
VehBrand'B2' 0.0387875 0.0442934 0.876 0.381196
VehBrand'B3' 0.0024469 0.0626301 0.039 0.968835
VehBrand'B4' -0.1354487 0.0901096 -1.503 0.132799
VehBrand'B5' 0.1144411 0.0707999 1.616 0.106007
VehBrand'B6' -0.0495408 0.0818699 -0.605 0.545102
VehGas'Regular' 0.0715168 0.0315835 2.264 0.023551 *
LogDensity -0.0219868 0.0358149 -0.614 0.539280
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 26657 on 79999 degrees of freedom
Residual deviance: 25727 on 79978 degrees of freedom
AIC: 34069
Number of Fisher Scoring iterations: 6
y_pred_train_offset = predict(model_full_offset, type = "response")
y_true_train = claims_small_train$ClaimNb
y_pred_test_offset = predict(model_full_offset, newdata = claims_small_test, type = "response")
y_true_test = claims_small_test$ClaimNb
errors_full_offset = errors(y_pred_train_offset, y_pred_test_offset, y_true_train, y_true_test)
print(errors_full_offset)
$logMSE_train
[1] 0.02530934
$logMSE_test
[1] 0.02477317
$MAPE_train
[1] 0.0748177
$MAPE_test
[1] 0.07426603
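The errors() helper comes from fonctionsTP2.R, which is not shown here. A plausible reconstruction, assuming logMSE is the mean squared error on the log1p scale and "MAPE" here is the mean absolute prediction error (both definitions are guesses, consistent in magnitude with the values reported):
# Hypothetical sketch of errors() (the real one is sourced from fonctionsTP2.R):
# errors = function(pred_train, pred_test, true_train, true_test) {
#   list(logMSE_train = mean((log1p(pred_train) - log1p(true_train))^2),
#        logMSE_test  = mean((log1p(pred_test)  - log1p(true_test))^2),
#        MAPE_train   = mean(abs(pred_train - true_train)),
#        MAPE_test    = mean(abs(pred_test  - true_test)))
# }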
model_full = glm(ClaimNb ~ LogExposure + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                 + VehGas + LogDensity, data = claims_small_train, family = poisson())
summary(model_full)
Call:
glm(formula = ClaimNb ~ LogExposure + Area + VehPower + VehAge +
DrivAge + BonusMalus + VehBrand + VehGas + LogDensity, family = poisson(),
data = claims_small_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9435 -0.3637 -0.3131 -0.2478 12.7784
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.1728794 0.1639109 -25.458 < 2e-16 ***
LogExposure 0.4222318 0.0180774 23.357 < 2e-16 ***
Area'B' 0.0092183 0.0738301 0.125 0.90064
Area'C' 0.0838926 0.0946919 0.886 0.37564
Area'D' 0.2162977 0.1435014 1.507 0.13174
Area'E' 0.2392389 0.1908972 1.253 0.21012
Area'F' 0.2928437 0.2610425 1.122 0.26194
VehPower 0.0279580 0.0077867 3.590 0.00033 ***
VehAge -0.0339298 0.0033316 -10.184 < 2e-16 ***
DrivAge 0.0079835 0.0011560 6.906 4.97e-12 ***
BonusMalus 0.0201948 0.0009105 22.180 < 2e-16 ***
VehBrand'B10' 0.0843090 0.0994848 0.847 0.39674
VehBrand'B11' -0.2077645 0.1248665 -1.664 0.09613 .
VehBrand'B12' -0.0093186 0.0485502 -0.192 0.84779
VehBrand'B13' -0.0188085 0.1210689 -0.155 0.87654
VehBrand'B14' -0.5587437 0.2692469 -2.075 0.03797 *
VehBrand'B2' 0.0417314 0.0442846 0.942 0.34602
VehBrand'B3' -0.0071772 0.0626378 -0.115 0.90878
VehBrand'B4' -0.1372335 0.0901053 -1.523 0.12775
VehBrand'B5' 0.1143009 0.0707897 1.615 0.10639
VehBrand'B6' -0.0543124 0.0818610 -0.663 0.50703
VehGas'Regular' 0.0992080 0.0316000 3.139 0.00169 **
LogDensity -0.0196519 0.0358497 -0.548 0.58357
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 26053 on 79999 degrees of freedom
Residual deviance: 24928 on 79977 degrees of freedom
AIC: 33272
Number of Fisher Scoring iterations: 6
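Letting LogExposure enter as a free covariate instead of an offset estimates its coefficient at about 0.42 rather than fixing it at 1, and lowers the AIC from 34069 to 33272.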
y_pred_train = predict(model_full, type = "response")
y_pred_test = predict(model_full, newdata = claims_small_test, type = "response")
errors_full = errors(y_pred_train,y_pred_test,y_true_train,y_true_test)
errors_all = rbind(errors_full_offset,errors_full)
print(errors_all)
logMSE_train logMSE_test MAPE_train MAPE_test
errors_full_offset 0.02530934 0.02477317 0.0748177 0.07426603
errors_full 0.02530934 0.02477317 0.0748177 0.07426603
model_full_aic = stepAIC(model_full,trace = F)
The AIC model should be saved to disk and then reloaded, since stepAIC is slow to rerun …
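For example (hypothetical file name):
saveRDS(model_full_aic, "model_full_aic.rds")   # cache the slow stepAIC fit
# In a later session: model_full_aic = readRDS("model_full_aic.rds")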
y_pred_train_aic = predict(model_full_aic, type = "response")
y_pred_test_aic = predict(model_full_aic, newdata = claims_small_test, type = "response")
errors_aic = errors(y_pred_train_aic,y_pred_test_aic,y_true_train,y_true_test)
errors_all = rbind(errors_all,errors_aic)
print(errors_all)
logMSE_train logMSE_test MAPE_train MAPE_test
errors_full_offset 0.02530934 0.02477317 0.0748177 0.07426603
errors_full 0.02530934 0.02477317 0.0748177 0.07426603
errors_aic 0.02531901 0.0247661 0.0748456 0.07424057
model_full_classif = glm(Claim_binaire ~ Exposure + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                         + VehGas + LogDensity, data = claims_small_train, family = binomial())
summary(model_full_classif)
Call:
glm(formula = Claim_binaire ~ Exposure + Area + VehPower + VehAge +
DrivAge + BonusMalus + VehBrand + VehGas + LogDensity, family = binomial(),
data = claims_small_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9274 -0.3609 -0.2990 -0.2404 3.0512
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.373227 0.180486 -29.771 < 2e-16 ***
Exposure 1.250664 0.050168 24.929 < 2e-16 ***
Area'B' 0.004060 0.078189 0.052 0.958586
Area'C' 0.100161 0.100610 0.996 0.319479
Area'D' 0.247304 0.152701 1.620 0.105332
Area'E' 0.293158 0.203247 1.442 0.149198
Area'F' 0.373741 0.278195 1.343 0.179126
VehPower 0.030679 0.008327 3.684 0.000229 ***
VehAge -0.036066 0.003547 -10.169 < 2e-16 ***
DrivAge 0.007982 0.001248 6.397 1.59e-10 ***
BonusMalus 0.021859 0.001028 21.262 < 2e-16 ***
VehBrand'B10' 0.139344 0.105170 1.325 0.185189
VehBrand'B11' -0.183027 0.131488 -1.392 0.163933
VehBrand'B12' 0.051637 0.052504 0.983 0.325367
VehBrand'B13' 0.017586 0.127955 0.137 0.890682
VehBrand'B14' -0.503501 0.274834 -1.832 0.066949 .
VehBrand'B2' 0.052407 0.047265 1.109 0.267525
VehBrand'B3' 0.019282 0.066669 0.289 0.772413
VehBrand'B4' -0.124121 0.095369 -1.301 0.193094
VehBrand'B5' 0.144247 0.075604 1.908 0.056398 .
VehBrand'B6' -0.027815 0.086742 -0.321 0.748469
VehGas'Regular' 0.097741 0.033720 2.899 0.003749 **
LogDensity -0.029704 0.038172 -0.778 0.436467
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 32191 on 79999 degrees of freedom
Residual deviance: 31127 on 79977 degrees of freedom
AIC: 31173
Number of Fisher Scoring iterations: 6
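Note that the classification layer uses Exposure as an ordinary covariate. An alternative, not used here, is a cloglog link with log(Exposure) as an offset, which matches the zero-probability of a Poisson model with exposure; a sketch:
# Sketch of an alternative classification layer (not the model used above):
# with a cloglog link, P(ClaimNb >= 1) = 1 - exp(-Exposure * exp(X beta)),
# so log(Exposure) enters naturally as an offset.
model_classif_cloglog = glm(Claim_binaire ~ offset(LogExposure) + Area + VehPower + VehAge + DrivAge + BonusMalus
                            + VehBrand + VehGas + LogDensity,
                            data = claims_small_train, family = binomial(link = "cloglog"))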
print(dim(claims_small_train))
[1] 80000 16
claims_small_train_hurdle = claims_small_train %>% filter(ClaimNb>=1)
print(dim(claims_small_train_hurdle))
[1] 4073 16
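The hurdle model combines the classification layer above (is there at least one claim?) with a count layer fitted only on the policies that have at least one claim; modelling ClaimNb - 1 makes the count layer predict the number of claims beyond the first.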
model_full_hurdle = glm(ClaimNb - 1 ~ LogExposure + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                        + VehGas + LogDensity, data = claims_small_train_hurdle, family = poisson())
summary(model_full_hurdle)
Call:
glm(formula = ClaimNb - 1 ~ LogExposure + Area + VehPower + VehAge +
DrivAge + BonusMalus + VehBrand + VehGas + LogDensity, family = poisson(),
data = claims_small_train_hurdle)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.7510 -0.3827 -0.3366 -0.2934 11.7475
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.050e+00 6.663e-01 -6.079 1.21e-09 ***
LogExposure 1.520e-01 7.692e-02 1.975 0.0482 *
Area'B' 1.524e-01 3.255e-01 0.468 0.6397
Area'C' 9.649e-02 4.046e-01 0.238 0.8115
Area'D' 1.080e-01 5.967e-01 0.181 0.8564
Area'E' -9.329e-02 7.889e-01 -0.118 0.9059
Area'F' -3.278e-01 1.076e+00 -0.305 0.7607
VehPower 2.209e-03 3.285e-02 0.067 0.9464
VehAge -4.139e-03 1.319e-02 -0.314 0.7536
DrivAge -7.087e-04 4.706e-03 -0.151 0.8803
BonusMalus 1.352e-02 3.080e-03 4.390 1.13e-05 ***
VehBrand'B10' -6.769e-01 5.246e-01 -1.290 0.1969
VehBrand'B11' -4.792e-01 5.956e-01 -0.805 0.4210
VehBrand'B12' 1.960e-01 1.958e-01 1.001 0.3166
VehBrand'B13' -5.101e-01 5.939e-01 -0.859 0.3904
VehBrand'B14' -1.255e+01 3.351e+02 -0.037 0.9701
VehBrand'B2' -1.405e-01 1.791e-01 -0.785 0.4326
VehBrand'B3' -3.053e-01 2.671e-01 -1.143 0.2531
VehBrand'B4' -3.240e-01 3.981e-01 -0.814 0.4158
VehBrand'B5' -2.791e-01 3.037e-01 -0.919 0.3580
VehBrand'B6' -4.392e-01 3.755e-01 -1.170 0.2421
VehGas'Regular' -3.683e-02 1.294e-01 -0.285 0.7759
LogDensity 9.837e-02 1.482e-01 0.664 0.5068
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1543.2 on 4072 degrees of freedom
Residual deviance: 1499.5 on 4050 degrees of freedom
AIC: 2028.2
Number of Fisher Scoring iterations: 13
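The two layers are combined by predictions_scores_hurdle() from fonctionsTP2.R. A minimal sketch of one plausible combination (hypothetical names; the actual helper may differ):
# E[N] = P(N >= 1) * (1 + E[N - 1 | N >= 1]); "hurdle01" binarizes the first layer.
p_test = predict(model_full_classif, newdata = claims_small_test, type = "response")
extra_test = predict(model_full_hurdle, newdata = claims_small_test, type = "response")
y_sketch_hurdle = p_test * (1 + extra_test)              # soft classification layer
y_sketch_hurdle01 = (p_test >= 0.5) * (1 + extra_test)   # binarized layer, default threshold 0.5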
pred = predictions_scores_hurdle(model_full_classif,model_full_hurdle,claims_small_train,claims_small_test)
errors_hurdle = pred$errors_hurdle
errors_hurdle01 = pred$errors_hurdle01
errors_all = rbind(errors_all,errors_hurdle)
errors_all = rbind(errors_all,errors_hurdle01)
print(errors_all)
                   logMSE_train logMSE_test MAPE_train MAPE_test
errors_full_offset 0.02530934   0.02477317  0.0748177  0.07426603
errors_full        0.02530934   0.02477317  0.0748177  0.07426603
errors_aic         0.02531901   0.0247661   0.0748456  0.07424057
errors_hurdle      0.02530432   0.02475668  0.07476975 0.07414931
errors_hurdle01    0.02676755   0.02596599  0.0259533  0.02484583
TODO: to be rewritten. The y_pred_full_hurdle_* objects used below are not defined in the code shown here (they come from an earlier version of the notebook).
data_plot_train = tibble(predictions = y_pred_full_hurdle_train ,truth = claims_small_train$ClaimNb)
data_plot_train = gather(data_plot_train)
ggplot(data_plot_train,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
data_plot_test = tibble(predictions = y_pred_full_hurdle_test,truth = claims_small_test$ClaimNb)
data_plot_test = gather(data_plot_test)
ggplot(data_plot_test,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
data_plot_train = tibble(predictions = y_pred_full_hurdle_train2 ,truth = claims_small_train$ClaimNb)
data_plot_train = gather(data_plot_train)
ggplot(data_plot_train,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
#print(c(mean(y_pred_full_hurdle_train>0),mean(y_pred_full_hurdle_test>0)))
print(c(mean(y_pred_full_hurdle_train2>0),mean(y_pred_full_hurdle_test2>0)))
data_plot_test = tibble(predictions = y_pred_full_hurdle_test2,truth = claims_small_test$ClaimNb)
data_plot_test = gather(data_plot_test)
ggplot(data_plot_test,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
END of the block to be rewritten (see the TODO above).
One avenue for improvement is to change the threshold used when binarizing the predictions of the classification layer. Be careful to search for this threshold on the training set only.
In fact, this search should ideally be done by cross-validation on the training set (see the sketch after the results below).
propositions_seuil = seq(0,1,0.05)
errors_recherche_seuil = rep(0,5)
for (seuil in propositions_seuil) {
  pred = predictions_scores_hurdle(model_full_classif, model_full_hurdle,
                                   claims_small_train, claims_small_test, seuil)
  errors_recherche_seuil = rbind(errors_recherche_seuil, c(seuil, pred$errors_hurdle01))
}
errors_recherche_seuil = errors_recherche_seuil[-1,]
print(errors_recherche_seuil)
seuil logMSE_train logMSE_test MAPE_train MAPE_test
0 0.4938123 0.4950772 1.004958 1.007423
0.05 0.2270736 0.230549 0.4502807 0.4576359
0.1 0.04395753 0.0440701 0.06443343 0.06431263
0.15 0.02883019 0.02837623 0.0311437 0.0301805
0.2 0.02700346 0.02638802 0.02685265 0.02593927
0.25 0.02677354 0.02586918 0.02617944 0.02491637
0.3 0.02673074 0.02592902 0.02595924 0.0248458
0.35 0.02672835 0.02594246 0.02593319 0.02482605
0.4 0.02674193 0.02596599 0.02593785 0.02484583
0.45 0.02676755 0.02596599 0.0259533 0.02484583
0.5 0.02676755 0.02596599 0.0259533 0.02484583
0.55 0.02676755 0.02596599 0.0259533 0.02484583
0.6 0.02677333 0.02596599 0.02595779 0.02484583
0.65 0.02677333 0.02596599 0.02595779 0.02484583
0.7 0.02678792 0.02596599 0.02596385 0.02484583
0.75 0.02678792 0.02596599 0.02596385 0.02484583
0.8 0.02678792 0.02596599 0.02596385 0.02484583
0.85 0.02678792 0.02596599 0.02596385 0.02484583
0.9 0.02678792 0.02596599 0.02596385 0.02484583
0.95 0.02678792 0.02596599 0.02596385 0.02484583
1 0.02678792 0.02596599 0.02596385 0.02484583
plot(propositions_seuil, errors_recherche_seuil[,2], type = "l")
propositions_seuil[which.min(errors_recherche_seuil[,2])]
[1] 0.35
errors_recherche_seuil[which.min(errors_recherche_seuil[,2]),]
[[1]]
[1] 0.35
$logMSE_train
[1] 0.02672835
$logMSE_test
[1] 0.02594246
$MAPE_train
[1] 0.02593319
$MAPE_test
[1] 0.02482605
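As announced above, the threshold should ideally be chosen by cross-validation on the training set. A minimal 5-fold sketch (illustrative only: it refits both layers on each fold with update() and scores thresholds on the held-out part, using an assumed logMSE definition):
# 5-fold CV for the threshold; createFolds() returns the held-out indices.
set.seed(321)
folds = createFolds(claims_small_train$ClaimNb, k = 5)
cv_logMSE = sapply(folds, function(idx) {
  fit_classif = update(model_full_classif, data = claims_small_train[-idx, ])
  fit_count = update(model_full_hurdle,
                     data = claims_small_train[-idx, ] %>% filter(ClaimNb >= 1))
  p = predict(fit_classif, newdata = claims_small_train[idx, ], type = "response")
  extra = predict(fit_count, newdata = claims_small_train[idx, ], type = "response")
  y_true = claims_small_train$ClaimNb[idx]
  sapply(propositions_seuil, function(seuil) {
    y_hat = (p >= seuil) * (1 + extra)
    mean((log1p(y_hat) - log1p(y_true))^2)  # assumed logMSE definition
  })
})
# Rows of cv_logMSE correspond to thresholds, columns to folds.
propositions_seuil[which.min(rowMeans(cv_logMSE))]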
errors_all
                   logMSE_train logMSE_test MAPE_train MAPE_test
errors_full_offset 0.02530934   0.02477317  0.0748177  0.07426603
errors_full        0.02530934   0.02477317  0.0748177  0.07426603
errors_aic         0.02531901   0.0247661   0.0748456  0.07424057
errors_hurdle      0.02530432   0.02475668  0.07476975 0.07414931
errors_hurdle01    0.02676755   0.02596599  0.0259533  0.02484583
pred_035 = predictions_scores_hurdle(model_full_classif,model_full_hurdle,claims_small_train,claims_small_test,
seuil=0.35)
data_plot_train = tibble(predictions = pred_035$prediction_train01 ,truth = claims_small_train$ClaimNb)
data_plot_train = gather(data_plot_train)
ggplot(data_plot_train,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
print(c(mean(pred_035$prediction_train01>0),mean(pred_035$prediction_test01>0)))
[1] 7.5e-05 5.0e-05
Other avenues:
perform AIC selection in both layers of the hurdle model
do some feature engineering: group certain factor levels ("statistical" grouping), bin the continuous features, consider interactions, …
then consider penalized regressions (ridge, lasso, elastic net), as sketched below
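For the last point, glmnet (loaded at the top but unused so far) fits penalized Poisson regressions directly; a minimal sketch with the exposure offset (alpha = 1 gives the lasso, alpha = 0 the ridge, values in between the elastic net):
# Design matrices; model.matrix expands the factors into dummies.
X_train = model.matrix(~ Area + VehPower + VehAge + DrivAge + BonusMalus
                       + VehBrand + VehGas + LogDensity, data = claims_small_train)[, -1]
X_test = model.matrix(~ Area + VehPower + VehAge + DrivAge + BonusMalus
                      + VehBrand + VehGas + LogDensity, data = claims_small_test)[, -1]
# Cross-validated lasso with the exposure offset.
cv_lasso = cv.glmnet(X_train, claims_small_train$ClaimNb, family = "poisson",
                     offset = claims_small_train$LogExposure, alpha = 1)
y_pred_lasso = predict(cv_lasso, newx = X_test, newoffset = claims_small_test$LogExposure,
                       s = "lambda.min", type = "response")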