TD TP 2: Poisson regression and zero inflation (solutions)

library(tidyverse)
library(caret)
library(MASS)
library(glmnet)
source("./fonctionsTP2.R")

Exercise 1: importing the data and creating a subsample

Loading the data

claims = read_csv("../Data Claims/freMTPL2freq.csv",progress = FALSE)
Parsed with column specification:
cols(
  IDpol = col_double(),
  ClaimNb = col_double(),
  Exposure = col_double(),
  Area = col_character(),
  VehPower = col_double(),
  VehAge = col_double(),
  DrivAge = col_double(),
  BonusMalus = col_double(),
  VehBrand = col_character(),
  VehGas = col_character(),
  Density = col_double(),
  Region = col_character()
)
glimpse(claims)
Observations: 678,013
Variables: 12
$ IDpol      <dbl> 1, 3, 5, 10, 11, 13, 15, 17, 18, 21, 25, 27, 30, 32, 35, 36, 38, 42, 44, 45, 47, 49, 50, 52, 53...
$ ClaimNb    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,...
$ Exposure   <dbl> 0.10, 0.77, 0.75, 0.09, 0.84, 0.52, 0.45, 0.27, 0.71, 0.15, 0.75, 0.87, 0.81, 0.05, 0.76, 0.34,...
$ Area       <chr> "'D'", "'D'", "'B'", "'B'", "'B'", "'E'", "'E'", "'C'", "'C'", "'B'", "'B'", "'C'", "'D'", "'D'...
$ VehPower   <dbl> 5, 5, 6, 7, 7, 6, 6, 7, 7, 7, 7, 7, 4, 4, 4, 9, 6, 6, 6, 6, 6, 7, 7, 6, 5, 5, 5, 5, 5, 5, 5, 15...
$ VehAge     <dbl> 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 9, 0, 2, 2, 2, 2, 2, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,...
$ DrivAge    <dbl> 55, 55, 52, 46, 46, 38, 38, 33, 33, 41, 41, 56, 27, 27, 23, 44, 32, 32, 55, 55, 55, 73, 73, 27,...
$ BonusMalus <dbl> 50, 50, 50, 50, 50, 50, 50, 68, 68, 50, 50, 50, 90, 90, 100, 76, 56, 56, 50, 50, 50, 50, 50, 76...
$ VehBrand   <chr> "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12'", "'B12...
$ VehGas     <chr> "'Regular'", "'Regular'", "'Diesel'", "'Diesel'", "'Diesel'", "'Regular'", "'Regular'", "'Diese...
$ Density    <dbl> 1217, 1217, 54, 76, 76, 3003, 3003, 137, 137, 60, 60, 173, 695, 695, 7887, 27000, 23, 23, 37, 3...
$ Region     <chr> "'R82'", "'R82'", "'R22'", "'R72'", "'R72'", "'R31'", "'R31'", "'R91'", "'R91'", "'R52'", "'R52...

Converting the mis-parsed column types

For now, we leave the variable VehPower unchanged.

claims = claims %>%
        mutate(VehBrand  = factor(VehBrand),
               VehGas    = factor(VehGas),
               Region    = factor(Region),
               Frequency = ClaimNb / Exposure)
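
If we later want to treat VehPower as categorical rather than numeric, the conversion is one more mutate (the new column name VehPowerF is illustrative):

# illustrative sketch: a factor version of VehPower, kept alongside the numeric one
claims = claims %>% mutate(VehPowerF = factor(VehPower))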

Creating a subsample

set.seed(123)
nombre_small = 100000
smallIndex = sample(claims$IDpol, size = nombre_small)
claims_small = claims %>% filter(IDpol %in% smallIndex)
rm(claims)

Missing values

claims_small %>% summarise_all(~ sum(is.na(.)))
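
summarise_all() is superseded in recent dplyr; the same check is now usually written with across():

# count missing values per column with the current dplyr idiom
claims_small %>% summarise(across(everything(), ~ sum(is.na(.x))))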

Exercise 2: graphical representations

Distribution of ClaimNb

ggplot(claims_small,aes(ClaimNb)) + geom_bar() + scale_y_log10()

Distribution of Exposure

ggplot(claims_small,aes(Exposure,after_stat(density))) + geom_histogram()

Distribution of Frequency

ggplot(claims_small,aes(Frequency)) + geom_histogram() + scale_y_log10()

print(paste0("Proportion d'observations sans claim= ",round(mean(claims_small$ClaimNb==0),digits = 4)))
[1] "Proportion d'observations sans claim= 0.9496"
print(paste0("Moyenne du nombre de claim= ",round(mean(claims_small$ClaimNb,na.rm = T),digits = 4)))
[1] "Moyenne du nombre de claim= 0.0538"
print(paste(c("Probabilite pour une loi de Poisson de moyenne ",
              round(mean(claims_small$ClaimNb,na.rm = T),digits = 3),
              " d'observer 0 est ",
              round(exp(-mean(claims_small$ClaimNb,na.rm = T)),digits =3)),
            collapse = ""))
[1] "Probabilite pour une loi de Poisson de moyenne 0.054 d'observer 0 est 0.948"

Relationship between Frequency and the other variables

ggplot(claims_small,aes(Area,Frequency)) + geom_boxplot() + scale_y_log10()

ggplot(claims_small,aes(VehPower,Frequency)) + geom_point() + scale_y_log10()

ggplot(claims_small,aes(VehAge,Frequency)) + geom_point() + scale_y_log10()

ggplot(claims_small,aes(DrivAge,Frequency)) + geom_point() + scale_y_log10()

ggplot(claims_small,aes(BonusMalus,Frequency)) + geom_point() + scale_y_log10()

ggplot(claims_small,aes(VehBrand,Frequency)) + geom_boxplot()  + scale_y_log10()

ggplot(claims_small,aes(VehGas,Frequency)) + geom_boxplot()  + scale_y_log10()

ggplot(claims_small,aes(Density,Frequency)) + geom_point()  + scale_y_log10() 

ggplot(claims_small,aes(Density,Frequency)) + geom_point()  + scale_y_log10() + scale_x_log10()

claims_small = claims_small %>% mutate(LogDensity = log(Density))
ggplot(claims_small,aes(Region,Frequency)) + geom_boxplot()  + scale_y_log10()

Exercise 3

We take the log of the Exposure variable. With the log link of the Poisson GLM, claim counts proportional to exposure correspond to including LogExposure as an offset whose coefficient is fixed to 1: log E[ClaimNb] = LogExposure + linear predictor.

claims_small = claims_small %>% mutate(LogExposure = log(Exposure))
claims_small = claims_small %>% mutate(Claim_binaire = ClaimNb>=1)
set.seed(432)
trainIndex <- createDataPartition(claims_small$IDpol, p = 0.8, list = FALSE)
length(trainIndex)
[1] 80000
claims_small_train = claims_small[trainIndex,]
claims_small_test = claims_small[-trainIndex,]
glimpse(claims_small_train)
Observations: 80,000
Variables: 16
$ IDpol         <dbl> 59, 67, 68, 78, 92, 108, 121, 145, 167, 209, 212, 246, 256, 286, 295, 299, 307, 311, 325, 33...
$ ClaimNb       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,...
$ Exposure      <dbl> 0.79, 0.80, 0.07, 0.16, 0.41, 0.02, 0.05, 0.81, 0.77, 0.34, 0.14, 0.08, 0.85, 0.75, 0.04, 0....
$ Area          <chr> "'C'", "'D'", "'D'", "'A'", "'B'", "'C'", "'C'", "'A'", "'E'", "'D'", "'E'", "'A'", "'E'", "...
$ VehPower      <dbl> 5, 5, 5, 6, 4, 9, 6, 6, 4, 4, 10, 7, 6, 5, 5, 4, 6, 6, 5, 4, 4, 7, 9, 9, 9, 5, 4, 7, 10, 11,...
$ VehAge        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 4, 0, 3, 6, 0, 0, 0, 0, 0, 0, 1, 9, 8, 0, 0, 0, 7, 4, 10...
$ DrivAge       <dbl> 59, 69, 69, 60, 30, 45, 37, 62, 55, 48, 32, 38, 35, 41, 29, 26, 50, 58, 35, 52, 29, 32, 46, ...
$ BonusMalus    <dbl> 50, 52, 52, 51, 80, 50, 55, 90, 50, 50, 50, 60, 68, 90, 60, 100, 50, 50, 50, 90, 76, 58, 50,...
$ VehBrand      <fct> 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B12', 'B5', 'B...
$ VehGas        <fct> 'Regular', 'Regular', 'Regular', 'Diesel', 'Regular', 'Regular', 'Diesel', 'Regular', 'Regul...
$ Density       <dbl> 455, 1376, 1376, 12, 77, 120, 303, 49, 2715, 1622, 2757, 27, 9307, 27000, 63, 2308, 8880, 23...
$ Region        <fct> 'R91', 'R11', 'R11', 'R83', 'R93', 'R83', 'R11', 'R53', 'R93', 'R91', 'R22', 'R11', 'R82', '...
$ Frequency     <dbl> 1.265823, 1.250000, 14.285714, 6.250000, 2.439024, 50.000000, 20.000000, 1.234568, 1.298701,...
$ LogDensity    <dbl> 6.120297, 7.226936, 7.226936, 2.484907, 4.343805, 4.787492, 5.713733, 3.891820, 7.906547, 7....
$ LogExposure   <dbl> -0.2357223, -0.2231436, -2.6592600, -1.8325815, -0.8915981, -3.9120230, -2.9957323, -0.21072...
$ Claim_binaire <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR...

Full model

model_full_offset = glm(ClaimNb ~ offset(LogExposure) + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                          + VehGas + LogDensity, data = claims_small_train, family = poisson())
summary(model_full_offset)

Call:
glm(formula = ClaimNb ~ offset(LogExposure) + Area + VehPower + 
    VehAge + DrivAge + BonusMalus + VehBrand + VehGas + LogDensity, 
    family = poisson(), data = claims_small_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2187  -0.3888  -0.2974  -0.1626  13.3317  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.9446153  0.1625753 -24.263  < 2e-16 ***
Area'B'          0.0165116  0.0738016   0.224 0.822968    
Area'C'          0.1205516  0.0946131   1.274 0.202609    
Area'D'          0.2774197  0.1433999   1.935 0.053041 .  
Area'E'          0.3271010  0.1907226   1.715 0.086334 .  
Area'F'          0.3693692  0.2609347   1.416 0.156904    
VehPower         0.0318559  0.0078226   4.072 4.66e-05 ***
VehAge          -0.0381663  0.0033650 -11.342  < 2e-16 ***
DrivAge          0.0051649  0.0011433   4.518 6.25e-06 ***
BonusMalus       0.0226753  0.0008881  25.532  < 2e-16 ***
VehBrand'B10'    0.1029516  0.0995720   1.034 0.301163    
VehBrand'B11'   -0.1712715  0.1248635  -1.372 0.170166    
VehBrand'B12'    0.1649006  0.0481364   3.426 0.000613 ***
VehBrand'B13'   -0.0106295  0.1210972  -0.088 0.930054    
VehBrand'B14'   -0.5372948  0.2692375  -1.996 0.045976 *  
VehBrand'B2'     0.0387875  0.0442934   0.876 0.381196    
VehBrand'B3'     0.0024469  0.0626301   0.039 0.968835    
VehBrand'B4'    -0.1354487  0.0901096  -1.503 0.132799    
VehBrand'B5'     0.1144411  0.0707999   1.616 0.106007    
VehBrand'B6'    -0.0495408  0.0818699  -0.605 0.545102    
VehGas'Regular'  0.0715168  0.0315835   2.264 0.023551 *  
LogDensity      -0.0219868  0.0358149  -0.614 0.539280    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 26657  on 79999  degrees of freedom
Residual deviance: 25727  on 79978  degrees of freedom
AIC: 34069

Number of Fisher Scoring iterations: 6
y_pred_train_offset = predict(model_full_offset,type="response")
y_true_train = claims_small_train$ClaimNb
y_pred_test_offset = predict(model_full_offset,type="response",newdata = claims_small_test)
y_true_test = claims_small_test$ClaimNb
# score the offset model on its own predictions (not on y_pred_train/y_pred_test,
# which belong to model_full below)
errors_full_offset = errors(y_pred_train_offset,y_pred_test_offset,y_true_train,y_true_test)
print(errors_full_offset)
$logMSE_train
[1] 0.02530934

$logMSE_test
[1] 0.02477317

$MAPE_train
[1] 0.0748177

$MAPE_test
[1] 0.07426603
model_full = glm(ClaimNb ~ LogExposure + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                          + VehGas + LogDensity, data = claims_small_train, family = poisson())
summary(model_full)

Call:
glm(formula = ClaimNb ~ LogExposure + Area + VehPower + VehAge + 
    DrivAge + BonusMalus + VehBrand + VehGas + LogDensity, family = poisson(), 
    data = claims_small_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9435  -0.3637  -0.3131  -0.2478  12.7784  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.1728794  0.1639109 -25.458  < 2e-16 ***
LogExposure      0.4222318  0.0180774  23.357  < 2e-16 ***
Area'B'          0.0092183  0.0738301   0.125  0.90064    
Area'C'          0.0838926  0.0946919   0.886  0.37564    
Area'D'          0.2162977  0.1435014   1.507  0.13174    
Area'E'          0.2392389  0.1908972   1.253  0.21012    
Area'F'          0.2928437  0.2610425   1.122  0.26194    
VehPower         0.0279580  0.0077867   3.590  0.00033 ***
VehAge          -0.0339298  0.0033316 -10.184  < 2e-16 ***
DrivAge          0.0079835  0.0011560   6.906 4.97e-12 ***
BonusMalus       0.0201948  0.0009105  22.180  < 2e-16 ***
VehBrand'B10'    0.0843090  0.0994848   0.847  0.39674    
VehBrand'B11'   -0.2077645  0.1248665  -1.664  0.09613 .  
VehBrand'B12'   -0.0093186  0.0485502  -0.192  0.84779    
VehBrand'B13'   -0.0188085  0.1210689  -0.155  0.87654    
VehBrand'B14'   -0.5587437  0.2692469  -2.075  0.03797 *  
VehBrand'B2'     0.0417314  0.0442846   0.942  0.34602    
VehBrand'B3'    -0.0071772  0.0626378  -0.115  0.90878    
VehBrand'B4'    -0.1372335  0.0901053  -1.523  0.12775    
VehBrand'B5'     0.1143009  0.0707897   1.615  0.10639    
VehBrand'B6'    -0.0543124  0.0818610  -0.663  0.50703    
VehGas'Regular'  0.0992080  0.0316000   3.139  0.00169 ** 
LogDensity      -0.0196519  0.0358497  -0.548  0.58357    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 26053  on 79999  degrees of freedom
Residual deviance: 24928  on 79977  degrees of freedom
AIC: 33272

Number of Fisher Scoring iterations: 6
Note that the freely estimated coefficient on LogExposure (about 0.42) is well below the value 1 imposed by the offset model.

y_pred_train = predict(model_full,type="response")
y_pred_test = predict(model_full,type="response",newdata = claims_small_test)
errors_full = errors(y_pred_train,y_pred_test,y_true_train,y_true_test)
errors_all = rbind(errors_full_offset,errors_full)
print(errors_all)
                   logMSE_train logMSE_test MAPE_train MAPE_test 
errors_full_offset 0.02530934   0.02477317  0.0748177  0.07426603
errors_full        0.02530934   0.02477317  0.0748177  0.07426603

AIC-selected model from the full model

model_full_aic = stepAIC(model_full,trace = F)

To avoid re-running the stepwise search, the AIC model should be saved and then reloaded …
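
A minimal sketch of that caching step (the file name is illustrative):

# cache the stepwise result so the search does not have to be re-run each time
saveRDS(model_full_aic, "model_full_aic.rds")
model_full_aic = readRDS("model_full_aic.rds")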

y_pred_train_aic = predict(model_full_aic,type="response")
y_pred_test_aic = predict(model_full_aic,type="response",newdata = claims_small_test)
errors_aic = errors(y_pred_train_aic,y_pred_test_aic,y_true_train,y_true_test)
errors_all = rbind(errors_all,errors_aic)
print(errors_all)
                   logMSE_train logMSE_test MAPE_train MAPE_test 
errors_full_offset 0.02530934   0.02477317  0.0748177  0.07426603
errors_full        0.02530934   0.02477317  0.0748177  0.07426603
errors_aic         0.02531901   0.0247661   0.0748456  0.07424057

Exercise 4: the hurdle model

Classification layer

model_full_classif = glm(Claim_binaire ~ Exposure + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                          + VehGas + LogDensity, data = claims_small_train, family = binomial())
summary(model_full_classif)

Call:
glm(formula = Claim_binaire ~ Exposure + Area + VehPower + VehAge + 
    DrivAge + BonusMalus + VehBrand + VehGas + LogDensity, family = binomial(), 
    data = claims_small_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9274  -0.3609  -0.2990  -0.2404   3.0512  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -5.373227   0.180486 -29.771  < 2e-16 ***
Exposure         1.250664   0.050168  24.929  < 2e-16 ***
Area'B'          0.004060   0.078189   0.052 0.958586    
Area'C'          0.100161   0.100610   0.996 0.319479    
Area'D'          0.247304   0.152701   1.620 0.105332    
Area'E'          0.293158   0.203247   1.442 0.149198    
Area'F'          0.373741   0.278195   1.343 0.179126    
VehPower         0.030679   0.008327   3.684 0.000229 ***
VehAge          -0.036066   0.003547 -10.169  < 2e-16 ***
DrivAge          0.007982   0.001248   6.397 1.59e-10 ***
BonusMalus       0.021859   0.001028  21.262  < 2e-16 ***
VehBrand'B10'    0.139344   0.105170   1.325 0.185189    
VehBrand'B11'   -0.183027   0.131488  -1.392 0.163933    
VehBrand'B12'    0.051637   0.052504   0.983 0.325367    
VehBrand'B13'    0.017586   0.127955   0.137 0.890682    
VehBrand'B14'   -0.503501   0.274834  -1.832 0.066949 .  
VehBrand'B2'     0.052407   0.047265   1.109 0.267525    
VehBrand'B3'     0.019282   0.066669   0.289 0.772413    
VehBrand'B4'    -0.124121   0.095369  -1.301 0.193094    
VehBrand'B5'     0.144247   0.075604   1.908 0.056398 .  
VehBrand'B6'    -0.027815   0.086742  -0.321 0.748469    
VehGas'Regular'  0.097741   0.033720   2.899 0.003749 ** 
LogDensity      -0.029704   0.038172  -0.778 0.436467    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 32191  on 79999  degrees of freedom
Residual deviance: 31127  on 79977  degrees of freedom
AIC: 31173

Number of Fisher Scoring iterations: 6

Poisson layer

print(dim(claims_small_train))
[1] 80000    16
claims_small_train_hurdle = claims_small_train %>% filter(ClaimNb>=1)
print(dim(claims_small_train_hurdle))
[1] 4073   16
As an approximation of a zero-truncated Poisson layer, we fit an ordinary Poisson regression to the shifted counts ClaimNb - 1 on the policies with at least one claim.

model_full_hurdle = glm(ClaimNb-1 ~ LogExposure + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand
                          + VehGas + LogDensity, data = claims_small_train_hurdle, family = poisson())
summary(model_full_hurdle)

Call:
glm(formula = ClaimNb - 1 ~ LogExposure + Area + VehPower + VehAge + 
    DrivAge + BonusMalus + VehBrand + VehGas + LogDensity, family = poisson(), 
    data = claims_small_train_hurdle)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.7510  -0.3827  -0.3366  -0.2934  11.7475  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.050e+00  6.663e-01  -6.079 1.21e-09 ***
LogExposure      1.520e-01  7.692e-02   1.975   0.0482 *  
Area'B'          1.524e-01  3.255e-01   0.468   0.6397    
Area'C'          9.649e-02  4.046e-01   0.238   0.8115    
Area'D'          1.080e-01  5.967e-01   0.181   0.8564    
Area'E'         -9.329e-02  7.889e-01  -0.118   0.9059    
Area'F'         -3.278e-01  1.076e+00  -0.305   0.7607    
VehPower         2.209e-03  3.285e-02   0.067   0.9464    
VehAge          -4.139e-03  1.319e-02  -0.314   0.7536    
DrivAge         -7.087e-04  4.706e-03  -0.151   0.8803    
BonusMalus       1.352e-02  3.080e-03   4.390 1.13e-05 ***
VehBrand'B10'   -6.769e-01  5.246e-01  -1.290   0.1969    
VehBrand'B11'   -4.792e-01  5.956e-01  -0.805   0.4210    
VehBrand'B12'    1.960e-01  1.958e-01   1.001   0.3166    
VehBrand'B13'   -5.101e-01  5.939e-01  -0.859   0.3904    
VehBrand'B14'   -1.255e+01  3.351e+02  -0.037   0.9701    
VehBrand'B2'    -1.405e-01  1.791e-01  -0.785   0.4326    
VehBrand'B3'    -3.053e-01  2.671e-01  -1.143   0.2531    
VehBrand'B4'    -3.240e-01  3.981e-01  -0.814   0.4158    
VehBrand'B5'    -2.791e-01  3.037e-01  -0.919   0.3580    
VehBrand'B6'    -4.392e-01  3.755e-01  -1.170   0.2421    
VehGas'Regular' -3.683e-02  1.294e-01  -0.285   0.7759    
LogDensity       9.837e-02  1.482e-01   0.664   0.5068    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1543.2  on 4072  degrees of freedom
Residual deviance: 1499.5  on 4050  degrees of freedom
AIC: 2028.2

Number of Fisher Scoring iterations: 13
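
A hurdle model combines the two layers through E[N] = P(N >= 1) × E[N | N >= 1], and with the shifted-Poisson layer above E[N | N >= 1] = 1 + E[N - 1 | N >= 1]. The probability-weighted combination can be sketched by hand as below; this is what predictions_scores_hurdle (from fonctionsTP2.R) is assumed to compute, with errors_hurdle01 presumably binarizing the classification layer at a threshold first.

# hand-rolled combination of the two layers (a sketch of the assumed behaviour
# of predictions_scores_hurdle, not its actual implementation)
p_test  = predict(model_full_classif, newdata = claims_small_test, type = "response")  # P(N >= 1)
mu_test = predict(model_full_hurdle,  newdata = claims_small_test, type = "response")  # E[N - 1 | N >= 1]
y_pred_hurdle_test_manual = p_test * (1 + mu_test)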
pred = predictions_scores_hurdle(model_full_classif,model_full_hurdle,claims_small_train,claims_small_test)
errors_hurdle = pred$errors_hurdle
errors_hurdle01 = pred$errors_hurdle01
errors_all = rbind(errors_all,errors_hurdle)
errors_all = rbind(errors_all,errors_hurdle01)
print(errors_all)
                    logMSE_train logMSE_test MAPE_train MAPE_test 
errors_full_offset  0.02530934   0.02477317  0.0748177  0.07426603
errors_full         0.02530934   0.02477317  0.0748177  0.07426603
errors_aic          0.02531901   0.0247661   0.0748456  0.07424057
errors_hurdle       0.02530432   0.02475668  0.07476975 0.07414931
errors_hurdle01     0.02676755   0.02596599  0.0259533  0.02484583

The histograms below compare the hurdle predictions with the observed counts. They reuse the object pred returned above by predictions_scores_hurdle; we assume fonctionsTP2.R exposes the raw and binarized predictions as prediction_train / prediction_test and prediction_train01 / prediction_test01 (only the latter two are confirmed further below).

data_plot_train = tibble(predictions = pred$prediction_train, truth = claims_small_train$ClaimNb)
data_plot_train = gather(data_plot_train)
ggplot(data_plot_train,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
data_plot_test = tibble(predictions = pred$prediction_test, truth = claims_small_test$ClaimNb)
data_plot_test = gather(data_plot_test)
ggplot(data_plot_test,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
data_plot_train = tibble(predictions = pred$prediction_train01, truth = claims_small_train$ClaimNb)
data_plot_train = gather(data_plot_train)
ggplot(data_plot_train,aes(value,fill=key)) + geom_histogram() + scale_y_log10()
print(c(mean(pred$prediction_train01>0),mean(pred$prediction_test01>0)))
data_plot_test = tibble(predictions = pred$prediction_test01, truth = claims_small_test$ClaimNb)
data_plot_test = gather(data_plot_test)
ggplot(data_plot_test,aes(value,fill=key)) + geom_histogram() + scale_y_log10()

Searching for a threshold

One possible improvement is to change the threshold used to binarize the predictions of the classification layer. Be careful to carry out the threshold search on the training set only.

Strictly speaking, this should be done by cross-validation within the training set.
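
A sketch of such a cross-validated search is given below; for brevity the two layers are not refit inside each fold, although a full cross-validation would require it (we assume errors_hurdle01 carries a logMSE_test element, consistent with the printout further below).

# K-fold threshold search on the training set only (sketch; models not refit per fold)
folds = createFolds(claims_small_train$ClaimNb, k = 5)
cv_logMSE = sapply(seq(0, 1, 0.05), function(seuil) {
  mean(sapply(folds, function(idx) {
    pred_cv = predictions_scores_hurdle(model_full_classif, model_full_hurdle,
                                        claims_small_train[-idx, ], claims_small_train[idx, ], seuil)
    pred_cv$errors_hurdle01$logMSE_test
  }))
})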

propositions_seuil = seq(0,1,0.05)
errors_recherche_seuil = rep(0,5)
for (seuil in propositions_seuil ){
  pred = predictions_scores_hurdle(model_full_classif,model_full_hurdle,claims_small_train,claims_small_test,
                                   seuil)
  errors_recherche_seuil = rbind(errors_recherche_seuil,c(seuil,pred$errors_hurdle01))
}
errors_recherche_seuil = errors_recherche_seuil[-1,]
print(errors_recherche_seuil)
      logMSE_train logMSE_test MAPE_train MAPE_test 
 0    0.4938123    0.4950772   1.004958   1.007423  
 0.05 0.2270736    0.230549    0.4502807  0.4576359 
 0.1  0.04395753   0.0440701   0.06443343 0.06431263
 0.15 0.02883019   0.02837623  0.0311437  0.0301805 
 0.2  0.02700346   0.02638802  0.02685265 0.02593927
 0.25 0.02677354   0.02586918  0.02617944 0.02491637
 0.3  0.02673074   0.02592902  0.02595924 0.0248458 
 0.35 0.02672835   0.02594246  0.02593319 0.02482605
 0.4  0.02674193   0.02596599  0.02593785 0.02484583
 0.45 0.02676755   0.02596599  0.0259533  0.02484583
 0.5  0.02676755   0.02596599  0.0259533  0.02484583
 0.55 0.02676755   0.02596599  0.0259533  0.02484583
 0.6  0.02677333   0.02596599  0.02595779 0.02484583
 0.65 0.02677333   0.02596599  0.02595779 0.02484583
 0.7  0.02678792   0.02596599  0.02596385 0.02484583
 0.75 0.02678792   0.02596599  0.02596385 0.02484583
 0.8  0.02678792   0.02596599  0.02596385 0.02484583
 0.85 0.02678792   0.02596599  0.02596385 0.02484583
 0.9  0.02678792   0.02596599  0.02596385 0.02484583
 0.95 0.02678792   0.02596599  0.02596385 0.02484583
 1    0.02678792   0.02596599  0.02596385 0.02484583
plot(propositions_seuil,errors_recherche_seuil[,2],type ="l")

propositions_seuil[which.min(errors_recherche_seuil[,2])]
[1] 0.35
errors_recherche_seuil[which.min(errors_recherche_seuil[,2]),]
[[1]]
[1] 0.35

$logMSE_train
[1] 0.02672835

$logMSE_test
[1] 0.02594246

$MAPE_train
[1] 0.02593319

$MAPE_test
[1] 0.02482605
errors_all
                    logMSE_train logMSE_test MAPE_train MAPE_test 
errors_full_offset  0.02530934   0.02477317  0.0748177  0.07426603
errors_full         0.02530934   0.02477317  0.0748177  0.07426603
errors_aic          0.02531901   0.0247661   0.0748456  0.07424057
errors_hurdle       0.02530432   0.02475668  0.07476975 0.07414931
errors_hurdle01     0.02676755   0.02596599  0.0259533  0.02484583
pred_035 = predictions_scores_hurdle(model_full_classif,model_full_hurdle,claims_small_train,claims_small_test,
                                   seuil=0.35)
data_plot_train = tibble(predictions = pred_035$prediction_train01 ,truth = claims_small_train$ClaimNb)
data_plot_train = gather(data_plot_train) 
ggplot(data_plot_train,aes(value,fill=key)) + geom_histogram() + scale_y_log10()

print(c(mean(pred_035$prediction_train01>0),mean(pred_035$prediction_test01>0)))
[1] 7.5e-05 5.0e-05

Other possible directions:

  • perform the AIC selection in both layers of the hurdle model

  • do some feature engineering: merge certain factor levels ("statistically"), bin the continuous features, consider interactions, …

  • then consider penalized fits (ridge, lasso, elastic net), as in the sketch after this list
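For the last item, glmnet is loaded at the top but never used; a minimal sketch of a lasso-penalized Poisson fit, with the same covariates as model_full (variable names are illustrative, and the penalty is chosen only by cv.glmnet's default cross-validation), could be:

# lasso-penalized Poisson regression with glmnet (sketch, not tuned)
form = ClaimNb ~ LogExposure + Area + VehPower + VehAge + DrivAge + BonusMalus + VehBrand + VehGas + LogDensity
X_train = model.matrix(form, data = claims_small_train)[, -1]  # drop the intercept column
X_test  = model.matrix(form, data = claims_small_test)[, -1]
fit_lasso = cv.glmnet(X_train, claims_small_train$ClaimNb,
                      family = "poisson", alpha = 1)           # alpha = 1: lasso, 0: ridge
y_pred_test_lasso = predict(fit_lasso, newx = X_test, s = "lambda.min", type = "response")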
