library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
── Attaching packages ──────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1 ✔ purrr 0.3.2
✔ tibble 2.1.3 ✔ dplyr 0.8.3
✔ tidyr 1.0.0 ✔ stringr 1.4.0
✔ readr 1.3.1 ✔ forcats 0.4.0
── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
# Load the diet data set from the working directory.
diet <- read_csv("./diet.csv")
Parsed with column specification:
cols(
Person = col_double(),
gender = col_double(),
Age = col_double(),
Height = col_double(),
pre.weight = col_double(),
Diet = col_double(),
weight6weeks = col_double()
)
On vérifie le format des variables
glimpse(diet)
Observations: 78
Variables: 7
$ Person <dbl> 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 27, 28, 29…
$ gender <dbl> NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Age <dbl> 41, 32, 22, 46, 55, 33, 50, 50, 37, 28, 28, 45, 60, 48, 41, 37, 4…
$ Height <dbl> 171, 174, 159, 192, 170, 171, 170, 201, 174, 176, 165, 165, 173, …
$ pre.weight <dbl> 60, 103, 58, 60, 64, 64, 65, 66, 67, 69, 70, 70, 72, 72, 72, 82, …
$ Diet <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,…
$ weight6weeks <dbl> 60.0, 103.0, 54.2, 54.0, 63.3, 61.1, 62.2, 64.0, 65.0, 60.5, 68.1…
summary(diet)
Person gender Age Height pre.weight
Min. : 1.00 Min. :0.0000 Min. :16.00 Min. :141.0 Min. : 58.00
1st Qu.:20.25 1st Qu.:0.0000 1st Qu.:32.25 1st Qu.:164.2 1st Qu.: 66.00
Median :39.50 Median :0.0000 Median :39.00 Median :169.5 Median : 72.00
Mean :39.50 Mean :0.4342 Mean :39.15 Mean :170.8 Mean : 72.53
3rd Qu.:58.75 3rd Qu.:1.0000 3rd Qu.:46.75 3rd Qu.:174.8 3rd Qu.: 78.00
Max. :78.00 Max. :1.0000 Max. :60.00 Max. :201.0 Max. :103.00
NA's :2
Diet weight6weeks
Min. :1.000 Min. : 53.00
1st Qu.:1.000 1st Qu.: 61.85
Median :2.000 Median : 68.95
Mean :2.038 Mean : 68.68
3rd Qu.:3.000 3rd Qu.: 73.83
Max. :3.000 Max. :103.00
# Recode the two categorical columns as factors, then sort the rows by
# participant identifier.
diet <- diet %>%
  mutate(
    gender = factor(gender),
    Diet = factor(Diet)
  ) %>%
  arrange(Person)
glimpse(diet)
Observations: 78
Variables: 7
$ Person <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19…
$ gender <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
$ Age <dbl> 22, 46, 55, 33, 50, 50, 37, 28, 28, 45, 60, 48, 41, 37, 39, 31, 4…
$ Height <dbl> 159, 192, 170, 171, 170, 201, 174, 176, 165, 165, 173, 156, 163, …
$ pre.weight <dbl> 58, 60, 64, 64, 65, 66, 67, 69, 70, 70, 72, 72, 72, 82, 71, 72, 7…
$ Diet <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ weight6weeks <dbl> 54.2, 54.0, 63.3, 61.1, 62.2, 64.0, 65.0, 60.5, 68.1, 66.9, 70.5,…
Il y a 2 valeurs manquantes dans gender
summary(diet$gender)
0 1 NA's
43 33 2
# Compare the height distribution across the gender levels.
ggplot(diet, aes(x = Height, colour = gender)) +
  geom_density()
On impute les 2 valeurs manquantes
# Fill in the two missing gender values (participants 25 and 26) by hand.
diet$gender[diet$Person == 25] <- 0
diet$gender[diet$Person == 26] <- 1
On définit le label
# Response variable: weight after six weeks minus initial weight, so a
# negative value means the participant lost weight.
diet <- diet %>%
  mutate(weight_loss = weight6weeks - pre.weight)
Visualisation des dépendances entre le label et les variables Diet et gender
# Base-graphics alternative, kept for reference:
# boxplot(weight_loss ~ Diet + gender, data = diet)

# Weight change by diet.
ggplot(diet, aes(x = Diet, y = weight_loss)) +
  geom_boxplot()

# Weight change by gender.
ggplot(diet, aes(x = gender, y = weight_loss)) +
  geom_boxplot()

# Weight change by diet, split by gender within each diet.
ggplot(diet, aes(x = Diet, y = weight_loss, fill = gender)) +
  geom_boxplot()
de même, numériquement
# Mean weight change per diet.
diet %>%
  group_by(Diet) %>%
  summarise(mean(weight_loss))

# Mean weight change per gender.
diet %>%
  group_by(gender) %>%
  summarise(mean(weight_loss))

# Mean weight change per gender and diet combination.
diet %>%
  group_by(gender, Diet) %>%
  summarise(mean(weight_loss))

# Same per-diet mean as the first summary, with an explicit column name.
diet %>%
  group_by(Diet) %>%
  summarise(avg = mean(weight_loss))
On fait un premier modèle suivant diet
# One-way model: weight change explained by diet, with Diet 1 as the reference
# level. The fit has to be created here; otherwise summary() has no
# fit_diet_v1 object to work on when the notebook runs from top to bottom.
fit_diet_v1 <- lm(weight_loss ~ Diet, data = diet)
summary(fit_diet_v1)
Call:
lm(formula = weight_loss ~ Diet, data = diet)
Residuals:
Min 1Q Median 3Q Max
-5.7000 -1.6519 -0.1759 1.3815 5.1259
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.3000 0.4889 -6.750 2.72e-09 ***
Diet2 0.2741 0.6719 0.408 0.68449
Diet3 -1.8481 0.6719 -2.751 0.00745 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.395 on 75 degrees of freedom
Multiple R-squared: 0.1418, Adjusted R-squared: 0.1189
F-statistic: 6.197 on 2 and 75 DF, p-value: 0.003229
La même chose sans intercept.
On vérifie l’hypothèse de normalité des erreurs et l’absence d’individus aberrants
# Residual diagnostics and ANOVA table for the diet model summarised above.
# fit_diet_v2 is only created further down in the notebook, so referring to it
# here fails when the notebook is run from top to bottom.
plot(fit_diet_v1)
anova(fit_diet_v1)
Analysis of Variance Table
Response: weight_loss
Df Sum Sq Mean Sq F value Pr(>F)
Diet2 1 70.14 70.139 12.364 0.0007418 ***
Residuals 76 431.13 5.673
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
On crée une variable Diet2
puis on estime les modèles avec cette nouvelle variable (versions avec et sans intercept)
# Diet2 is TRUE for diets 1 and 2 and FALSE for diet 3.
diet <- diet %>%
  mutate(Diet2 = Diet != 3)

# Model with intercept: the Diet2TRUE coefficient is the difference between
# the two groups.
fit_diet_v2 <- lm(weight_loss ~ Diet2, data = diet)
summary(fit_diet_v2)
Call:
lm(formula = weight_loss ~ Diet2, data = diet)
Residuals:
Min 1Q Median 3Q Max
-5.8451 -1.6252 -0.1485 1.4549 5.2549
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.1481 0.4584 -11.231 < 2e-16 ***
Diet2TRUE 1.9932 0.5669 3.516 0.000742 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.382 on 76 degrees of freedom
Multiple R-squared: 0.1399, Adjusted R-squared: 0.1286
F-statistic: 12.36 on 1 and 76 DF, p-value: 0.0007418
# Same model without intercept: each coefficient is then a group mean.
fit_diet_v2bis <- lm(weight_loss ~ Diet2 - 1, data = diet)
summary(fit_diet_v2bis)
Call:
lm(formula = weight_loss ~ Diet2 - 1, data = diet)
Residuals:
Min 1Q Median 3Q Max
-5.8451 -1.6252 -0.1485 1.4549 5.2549
Coefficients:
Estimate Std. Error t value Pr(>|t|)
Diet2FALSE -5.1481 0.4584 -11.23 < 2e-16 ***
Diet2TRUE -3.1549 0.3335 -9.46 1.77e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.382 on 76 degrees of freedom
Multiple R-squared: 0.7394, Adjusted R-squared: 0.7325
F-statistic: 107.8 on 2 and 76 DF, p-value: < 2.2e-16
On vérifie que la perte de poids dépend de Diet2
# Test the Diet2 effect on the model WITH intercept. On the no-intercept fit
# (fit_diet_v2bis) the F test is a 2-df comparison against a zero-mean model:
# it asks whether both group means are zero, not whether the groups differ.
anova(fit_diet_v2)
Analysis of Variance Table
Response: weight_loss
Df Sum Sq Mean Sq F value Pr(>F)
Diet2 2 1223.22 611.61 107.81 < 2.2e-16 ***
Residuals 76 431.13 5.67
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
On regarde maintenant la dépendance entre la perte de poids et gender
# No-intercept model on gender: one mean weight change per gender level.
fit_diet_v3 <- lm(weight_loss ~ gender - 1, data = diet)
summary(fit_diet_v3)
Call:
lm(formula = weight_loss ~ gender - 1, data = diet)
Residuals:
Min 1Q Median 3Q Max
-5.1848 -1.7264 0.2041 1.6846 5.9930
Coefficients:
Estimate Std. Error t value Pr(>|t|)
gender0 -3.8930 0.3846 -10.123 1.30e-15 ***
gender1 -4.0152 0.4390 -9.146 8.83e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.522 on 74 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.7155, Adjusted R-squared: 0.7078
F-statistic: 93.06 on 2 and 74 DF, p-value: < 2.2e-16
On essaye un modèle d’ANOVA à 2 facteurs avec effet croisé (interaction)
# Two-factor model with the Diet2 x gender interaction, fitted without intercept.
fit_diet_v4 <- lm(weight_loss ~ Diet2 * gender - 1, data = diet)
summary(fit_diet_v4)
Call:
lm(formula = weight_loss ~ Diet2 * gender - 1, data = diet)
Residuals:
Min 1Q Median 3Q Max
-5.6714 -1.2371 -0.1214 1.3548 5.2905
Coefficients:
Estimate Std. Error t value Pr(>|t|)
Diet2FALSE -5.8800 0.5922 -9.928 4.00e-15 ***
Diet2TRUE -2.8286 0.4335 -6.525 8.19e-09 ***
gender1 1.6467 0.8884 1.854 0.0679 .
Diet2TRUE:gender1 -2.7086 1.1080 -2.445 0.0169 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.294 on 72 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.771, Adjusted R-squared: 0.7583
F-statistic: 60.61 on 4 and 72 DF, p-value: < 2.2e-16
Une autre façon de faire la même chose : estimer 2 modèles, l’un pour les hommes, l’autre pour les femmes. Attention : comme on travaille ici sur des sous-échantillons, certaines estimations seront moins précises que dans le modèle avec effets croisés.
# Separate no-intercept Diet2 model restricted to the gender == "1" rows.
diet_gender1 <- diet %>%
  filter(gender == "1")
summary(lm(weight_loss ~ Diet2 - 1, data = diet_gender1))
Call:
lm(formula = weight_loss ~ Diet2 - 1, data = diet_gender1)
Residuals:
Min 1Q Median 3Q Max
-5.1095 -1.4095 -0.0095 1.4905 5.2905
Coefficients:
Estimate Std. Error t value Pr(>|t|)
Diet2FALSE -4.2333 0.7404 -5.718 2.74e-06 ***
Diet2TRUE -3.8905 0.5597 -6.952 8.47e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.565 on 31 degrees of freedom
Multiple R-squared: 0.7233, Adjusted R-squared: 0.7054
F-statistic: 40.51 on 2 and 31 DF, p-value: 2.25e-09
# Separate no-intercept Diet2 model restricted to the gender == "0" rows.
diet_gender0 <- diet %>%
  filter(gender == "0")
summary(lm(weight_loss ~ Diet2 - 1, data = diet_gender0))
Call:
lm(formula = weight_loss ~ Diet2 - 1, data = diet_gender0)
Residuals:
Min 1Q Median 3Q Max
-5.6714 -1.1200 -0.1714 0.9043 4.9800
Coefficients:
Estimate Std. Error t value Pr(>|t|)
Diet2FALSE -5.8800 0.5333 -11.026 7.73e-14 ***
Diet2TRUE -2.8286 0.3903 -7.247 7.41e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.065 on 41 degrees of freedom
Multiple R-squared: 0.8094, Adjusted R-squared: 0.8001
F-statistic: 87.04 on 2 and 41 DF, p-value: 1.752e-15
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).