Several machine learning algorithms are illustrated here on the problem of spam detection.
The starting point is a database of emails. Text mining is then used to clean the texts and build a feature matrix. The features can be of several kinds: typically, the frequency of given words or of given special characters in each message.
Other features can also be built from the emails themselves, for instance statistics on runs of capital letters.
Building the feature matrix is often called feature engineering. It is a crucial step. Here we use a well-known data-mining dataset: spambase by George Forman, built from a set of emails received at Hewlett-Packard Labs.
The dataset can be downloaded from the UC Irvine Machine Learning Repository website.
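As a rough illustration of this feature-engineering step, here is a minimal sketch of how the word_freq_* variables of spambase are defined (the percentage of the words of a message equal to a given word); the emails vector and the word list are hypothetical:
emails <- c("Make money fast, free offer !!!",
            "Meeting about the lab project")
vocab <- c("make", "free", "money", "meeting")
word_freq <- t(sapply(emails, function(m) {
  # split on any non-alphanumeric character and drop empty tokens
  words <- tolower(unlist(strsplit(m, "[^[:alnum:]]+")))
  words <- words[words != ""]
  # percentage of the words of the message equal to each vocabulary word
  100 * sapply(vocab, function(w) sum(words == w)) / length(words)
}))
colnames(word_freq) <- paste0("word_freq_", vocab)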
# Read the raw data (no header) and the file of variable names
data=read.table("spambase.data",header=F,sep=",")
names=read.csv2("names.csv",header=F)
names
## V1
## 1 word_freq_make
## 2 word_freq_address
## 3 word_freq_all
## 4 word_freq_3d
## 5 word_freq_our
## 6 word_freq_over
## 7 word_freq_remove
## 8 word_freq_internet
## 9 word_freq_order
## 10 word_freq_mail
## 11 word_freq_receive
## 12 word_freq_will
## 13 word_freq_people
## 14 word_freq_report
## 15 word_freq_addresses
## 16 word_freq_free
## 17 word_freq_business
## 18 word_freq_email
## 19 word_freq_you
## 20 word_freq_credit
## 21 word_freq_your
## 22 word_freq_font
## 23 word_freq_000
## 24 word_freq_money
## 25 word_freq_hp
## 26 word_freq_hpl
## 27 word_freq_george
## 28 word_freq_650
## 29 word_freq_lab
## 30 word_freq_labs
## 31 word_freq_telnet
## 32 word_freq_857
## 33 word_freq_data
## 34 word_freq_415
## 35 word_freq_85
## 36 word_freq_technology
## 37 word_freq_1999
## 38 word_freq_parts
## 39 word_freq_pm
## 40 word_freq_direct
## 41 word_freq_cs
## 42 word_freq_meeting
## 43 word_freq_original
## 44 word_freq_project
## 45 word_freq_re
## 46 word_freq_edu
## 47 word_freq_table
## 48 word_freq_conference
## 49 char_freq_;
## 50 char_freq_(
## 51 char_freq_[
## 52 char_freq_!
## 53 char_freq_$
## 54 char_freq_#
## 55 capital_run_length_average
## 56 capital_run_length_longest
## 57 capital_run_length_total
# Attach the variable names; the 58th column is the class
colnames(data)=c(as.character(names[,1]),"class")
dim(data)
## [1] 4601 58
# Encode the class as a factor (1 = spam, 0 = non-spam)
data$class=as.factor(as.character(data$class))
table(data$class)
##
## 0 1
## 2788 1813
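About 39% of the 4601 messages are therefore spam:
prop.table(table(data$class))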
The data are split into a training set (30% of the observations, stratified on the class) and a test set, with createDataPartition from the caret package:
library(caret)
ind=createDataPartition(data$class, times = 1,p = 0.3,list=FALSE)
a=data[ind,]   # training set
t=data[-ind,]  # test set
dim(a);dim(t)
## [1] 1381 58
## [1] 3220 58
formula=as.formula(class ~.)
# Fit a classification tree with rpart
library(rpart)
fit.ct <- rpart(formula, a, method = "class")
# Display the tree
plot(fit.ct)
# Add the split labels to the plot
text(fit.ct)
To display a nicer tree, fancyRpartPlot from the rattle package can be used:
library(rattle)
fancyRpartPlot(fit.ct,main="",sub="")
Confusion matrices on the training set and on the test set:
mc.ct.a=confusionMatrix(predict(fit.ct,type="class"),a$class)
mc.ct.t=confusionMatrix(predict(fit.ct,newdata=t,type="class"),t$class)
mc.ct.a
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 812 87
## 1 25 457
##
## Accuracy : 0.9189
## 95% CI : (0.9032, 0.9328)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8267
## Mcnemar's Test P-Value : 8.216e-09
##
## Sensitivity : 0.9701
## Specificity : 0.8401
## Pos Pred Value : 0.9032
## Neg Pred Value : 0.9481
## Prevalence : 0.6061
## Detection Rate : 0.5880
## Detection Prevalence : 0.6510
## Balanced Accuracy : 0.9051
##
## 'Positive' Class : 0
##
mc.ct.t
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1843 214
## 1 108 1055
##
## Accuracy : 0.9
## 95% CI : (0.8891, 0.9102)
## No Information Rate : 0.6059
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7875
## Mcnemar's Test P-Value : 4.874e-09
##
## Sensitivity : 0.9446
## Specificity : 0.8314
## Pos Pred Value : 0.8960
## Neg Pred Value : 0.9071
## Prevalence : 0.6059
## Detection Rate : 0.5724
## Detection Prevalence : 0.6388
## Balanced Accuracy : 0.8880
##
## 'Positive' Class : 0
##
The tree also returns class-membership probabilities:
head(predict(fit.ct,type="prob"))
## 0 1
## 1 0.03724928 0.9627507
## 5 0.03724928 0.9627507
## 16 0.03724928 0.9627507
## 17 0.03724928 0.9627507
## 21 0.09523810 0.9047619
## 28 0.04545455 0.9545455
The ROC curve is built from these probabilities with the roc function of the pROC package:
library(pROC)
roc.ct=roc(a$class,predict(fit.ct,data=a,type="prob")[,1])
plot(roc.ct)
##
## Call:
## roc.default(response = a$class, predictor = predict(fit.ct, data = a, type = "prob")[, 1])
##
## Data: predict(fit.ct, data = a, type = "prob")[, 1] in 837 controls (a$class 0) > 544 cases (a$class 1).
## Area under the curve: 0.9148
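The AUC above is computed on the training sample; the same computation can be sketched on the test set (output not shown):
roc.ct.t=roc(t$class,predict(fit.ct,newdata=t,type="prob")[,1])
auc(roc.ct.t)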
If we restrict the depth of the tree to 2:
fit.ct=rpart(formula,a,method = "class",
control=rpart.control(maxdepth = 2))
fancyRpartPlot(fit.ct,main="",sub="")
Conversely, lowering the complexity parameter cp and the minimum node size grows an almost fully developed tree:
fit.ct=rpart(formula,a,method = "class",
control=rpart.control(minsplit=3, cp=0.000))
fancyRpartPlot(fit.ct,main="",sub="")
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
#summary(fit.ct)
The complexity parameter (cp) table summarises the nested sequence of subtrees, with the cross-validated error xerror:
printcp(fit.ct)
##
## Classification tree:
## rpart(formula = formula, data = a, method = "class", control = rpart.control(minsplit = 3,
## cp = 0))
##
## Variables actually used in tree construction:
## [1] capital_run_length_average capital_run_length_longest
## [3] capital_run_length_total char_freq_!
## [5] char_freq_$ char_freq_(
## [7] char_freq_; word_freq_1999
## [9] word_freq_650 word_freq_address
## [11] word_freq_addresses word_freq_all
## [13] word_freq_business word_freq_data
## [15] word_freq_edu word_freq_email
## [17] word_freq_font word_freq_free
## [19] word_freq_george word_freq_hp
## [21] word_freq_hpl word_freq_internet
## [23] word_freq_lab word_freq_mail
## [25] word_freq_make word_freq_money
## [27] word_freq_order word_freq_our
## [29] word_freq_over word_freq_people
## [31] word_freq_re word_freq_receive
## [33] word_freq_remove word_freq_report
## [35] word_freq_technology word_freq_will
## [37] word_freq_you word_freq_your
##
## Root node error: 544/1381 = 0.39392
##
## n= 1381
##
## CP nsplit rel error xerror xstd
## 1 0.47058824 0 1.0000000 1.00000 0.033378
## 2 0.08823529 1 0.5294118 0.57169 0.028535
## 3 0.06250000 2 0.4411765 0.49265 0.027016
## 4 0.03676471 3 0.3786765 0.41360 0.025228
## 5 0.03125000 4 0.3419118 0.37132 0.024140
## 6 0.02757353 5 0.3106618 0.36765 0.024041
## 7 0.02389706 6 0.2830882 0.35478 0.023686
## 8 0.01838235 7 0.2591912 0.31985 0.022669
## 9 0.01286765 8 0.2408088 0.30882 0.022330
## 10 0.01102941 9 0.2279412 0.29596 0.021923
## 11 0.00735294 11 0.2058824 0.29044 0.021744
## 12 0.00551471 13 0.1911765 0.27757 0.021318
## 13 0.00459559 17 0.1691176 0.26471 0.020877
## 14 0.00428922 19 0.1599265 0.26471 0.020877
## 15 0.00367647 22 0.1470588 0.25184 0.020421
## 16 0.00306373 31 0.1139706 0.25184 0.020421
## 17 0.00183824 36 0.0937500 0.23713 0.019879
## 18 0.00110294 64 0.0422794 0.24449 0.020153
## 19 0.00091912 70 0.0349265 0.27574 0.021256
## 20 0.00061275 94 0.0128676 0.27390 0.021194
## 21 0.00000000 100 0.0091912 0.27390 0.021194
plotcp displays the cross-validated error as a function of the complexity parameter:
plotcp(fit.ct)
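A natural next step, sketched here, is to prune the overgrown tree at the cp value minimising xerror in the table above:
best.cp=fit.ct$cptable[which.min(fit.ct$cptable[,"xerror"]),"CP"]
fit.ct.pruned=prune(fit.ct,cp=best.cp)
fancyRpartPlot(fit.ct.pruned,main="",sub="")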
We now fit a support vector machine with ksvm from the kernlab package (Gaussian kernel by default):
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
fit.svm=ksvm(class~.,a)
mc.svm.a=confusionMatrix(predict(fit.svm,type="response"),a$class)
mc.svm.t=confusionMatrix(predict(fit.svm,newdata=t,type="response"),t$class)
mc.svm.a
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 818 55
## 1 19 489
##
## Accuracy : 0.9464
## 95% CI : (0.9332, 0.9577)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8865
## Mcnemar's Test P-Value : 4.728e-05
##
## Sensitivity : 0.9773
## Specificity : 0.8989
## Pos Pred Value : 0.9370
## Neg Pred Value : 0.9626
## Prevalence : 0.6061
## Detection Rate : 0.5923
## Detection Prevalence : 0.6322
## Balanced Accuracy : 0.9381
##
## 'Positive' Class : 0
##
mc.svm.t
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1884 183
## 1 67 1086
##
## Accuracy : 0.9224
## 95% CI : (0.9126, 0.9314)
## No Information Rate : 0.6059
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8348
## Mcnemar's Test P-Value : 3.51e-13
##
## Sensitivity : 0.9657
## Specificity : 0.8558
## Pos Pred Value : 0.9115
## Neg Pred Value : 0.9419
## Prevalence : 0.6059
## Detection Rate : 0.5851
## Detection Prevalence : 0.6419
## Balanced Accuracy : 0.9107
##
## 'Positive' Class : 0
##
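If class-membership probabilities are needed from the SVM (for a ROC curve, for instance), the model must be refitted with prob.model=TRUE; a sketch:
fit.svm.p=ksvm(class~.,a,prob.model=TRUE)
head(predict(fit.svm.p,newdata=t,type="probabilities"))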
Logistic regression is fitted with glm:
fit.glm=glm(class~.,family=binomial(link="logit"),a)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
The warning signals quasi-separation: some variables predict the class almost perfectly on the training sample, and the corresponding coefficients (word_freq_telnet, word_freq_857, word_freq_cs, ...) have huge standard errors below.
summary(fit.glm)
##
## Call:
## glm(formula = class ~ ., family = binomial(link = "logit"), data = a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0461 -0.1816 0.0000 0.0536 3.0689
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.667e+00 2.681e-01 -6.219 5.02e-10 ***
## word_freq_make -1.598e+00 6.398e-01 -2.498 0.012505 *
## word_freq_address -1.901e-01 1.743e-01 -1.091 0.275321
## word_freq_all -3.904e-01 2.534e-01 -1.540 0.123446
## word_freq_3d 1.139e+00 2.055e+00 0.554 0.579406
## word_freq_our 2.662e-01 1.492e-01 1.784 0.074391 .
## word_freq_over 1.888e-01 2.997e-01 0.630 0.528703
## word_freq_remove 4.334e+00 1.026e+00 4.222 2.42e-05 ***
## word_freq_internet 1.375e+00 5.251e-01 2.619 0.008819 **
## word_freq_order 1.358e+00 7.427e-01 1.829 0.067410 .
## word_freq_mail 4.067e-02 9.903e-02 0.411 0.681282
## word_freq_receive 9.248e-02 5.786e-01 0.160 0.873022
## word_freq_will -2.429e-01 1.620e-01 -1.500 0.133677
## word_freq_people -2.806e-01 4.385e-01 -0.640 0.522283
## word_freq_report 1.677e-01 3.252e-01 0.516 0.606154
## word_freq_addresses 6.954e+00 2.494e+00 2.789 0.005291 **
## word_freq_free 7.099e-01 2.580e-01 2.751 0.005933 **
## word_freq_business 1.241e+00 4.856e-01 2.556 0.010595 *
## word_freq_email -6.420e-01 3.267e-01 -1.965 0.049374 *
## word_freq_you 1.139e-01 7.346e-02 1.551 0.120981
## word_freq_credit 1.079e+00 1.108e+00 0.974 0.329902
## word_freq_your 3.902e-01 1.188e-01 3.284 0.001022 **
## word_freq_font 3.418e-01 3.668e-01 0.932 0.351472
## word_freq_000 5.386e-01 5.683e-01 0.948 0.343269
## word_freq_money 1.660e+00 7.168e-01 2.316 0.020562 *
## word_freq_hp -2.794e+00 7.031e-01 -3.974 7.05e-05 ***
## word_freq_hpl -1.296e+00 8.223e-01 -1.576 0.114992
## word_freq_george -2.752e+00 1.081e+00 -2.545 0.010919 *
## word_freq_650 3.404e-01 4.928e-01 0.691 0.489640
## word_freq_lab -4.298e+00 4.102e+00 -1.048 0.294798
## word_freq_labs -7.642e+00 2.257e+00 -3.386 0.000709 ***
## word_freq_telnet -1.016e+02 1.514e+04 -0.007 0.994643
## word_freq_857 1.413e+02 2.716e+05 0.001 0.999585
## word_freq_data -7.883e-01 5.077e-01 -1.553 0.120511
## word_freq_415 -1.882e+02 2.712e+05 -0.001 0.999446
## word_freq_85 -3.316e+00 2.601e+00 -1.275 0.202393
## word_freq_technology 2.576e+00 7.485e-01 3.442 0.000577 ***
## word_freq_1999 3.105e-01 3.040e-01 1.021 0.307018
## word_freq_parts -5.515e-01 6.683e-01 -0.825 0.409214
## word_freq_pm 1.014e-01 5.861e-01 0.173 0.862708
## word_freq_direct -3.342e+00 1.716e+00 -1.947 0.051507 .
## word_freq_cs -6.807e+02 3.238e+04 -0.021 0.983231
## word_freq_meeting -3.188e+00 2.007e+00 -1.588 0.112217
## word_freq_original -7.882e-01 7.195e-01 -1.095 0.273332
## word_freq_project -1.927e+00 1.112e+00 -1.733 0.083152 .
## word_freq_re -1.404e+00 3.773e-01 -3.720 0.000199 ***
## word_freq_edu -1.012e+00 4.242e-01 -2.385 0.017091 *
## word_freq_table -1.602e+00 3.594e+00 -0.446 0.655879
## word_freq_conference -2.636e+00 1.735e+00 -1.519 0.128663
## `char_freq_;` -1.270e+00 9.520e-01 -1.334 0.182252
## `char_freq_(` 3.908e-01 5.124e-01 0.763 0.445689
## `char_freq_[` 8.695e-01 2.291e+00 0.380 0.704285
## `char_freq_!` 7.285e-01 2.652e-01 2.747 0.006011 **
## `char_freq_$` 7.878e+00 2.091e+00 3.767 0.000165 ***
## `char_freq_#` 2.692e-01 1.275e+00 0.211 0.832789
## capital_run_length_average -2.907e-02 2.921e-02 -0.995 0.319655
## capital_run_length_longest 2.046e-02 5.738e-03 3.567 0.000362 ***
## capital_run_length_total 7.981e-04 4.130e-04 1.933 0.053272 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1851.83 on 1380 degrees of freedom
## Residual deviance: 488.82 on 1323 degrees of freedom
## AIC: 604.82
##
## Number of Fisher Scoring iterations: 23
The predicted probabilities are thresholded at 0.5 and converted back to a factor, as confusionMatrix expects factors:
mc.glm.a=confusionMatrix(
factor(ifelse(predict(fit.glm,type="response")>0.5,1,0),levels=levels(a$class)),
a$class)
mc.glm.t=confusionMatrix(
factor(ifelse(predict(fit.glm,newdata=t,type="response")>0.5,1,0),levels=levels(t$class)),
t$class)
mc.glm.a
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 798 64
## 1 39 480
##
## Accuracy : 0.9254
## 95% CI : (0.9103, 0.9387)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8425
## Mcnemar's Test P-Value : 0.01804
##
## Sensitivity : 0.9534
## Specificity : 0.8824
## Pos Pred Value : 0.9258
## Neg Pred Value : 0.9249
## Prevalence : 0.6061
## Detection Rate : 0.5778
## Detection Prevalence : 0.6242
## Balanced Accuracy : 0.9179
##
## 'Positive' Class : 0
##
mc.glm.t
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1841 158
## 1 110 1111
##
## Accuracy : 0.9168
## 95% CI : (0.9067, 0.9261)
## No Information Rate : 0.6059
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8246
## Mcnemar's Test P-Value : 0.004092
##
## Sensitivity : 0.9436
## Specificity : 0.8755
## Pos Pred Value : 0.9210
## Neg Pred Value : 0.9099
## Prevalence : 0.6059
## Detection Rate : 0.5717
## Detection Prevalence : 0.6208
## Balanced Accuracy : 0.9096
##
## 'Positive' Class : 0
##
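A ROC curve can also be sketched for the logistic model on the test set, this time from the predicted probabilities of class 1:
roc.glm=roc(t$class,predict(fit.glm,newdata=t,type="response"))
plot(roc.glm)
auc(roc.glm)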
Linear discriminant analysis uses lda from the MASS package:
library(MASS)
fit.lda=lda(class~.,data=a)
# without newdata, predict() scores the training sample
mc.lda.a=confusionMatrix(predict(fit.lda)$class,a$class)
mc.lda.t=confusionMatrix(predict(fit.lda,newdata=t)$class,t$class)
Before using k-nearest neighbours, which relies on distances, the variables can be rescaled with the following function:
normalize <- function(x) {
norm <- ((x - min(x))/(max(x) - min(x)))
return (norm)
}
datan <- as.data.frame(lapply(data[,1:57], normalize))
All variables then take values between 0 and 1:
data_n=cbind(datan,class=data[,58])
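A quick sanity check: the global range of the rescaled matrix should be 0 to 1.
range(unlist(datan))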
To evaluate the algorithm, the data are split into a training set and a test set, reusing the partition ind from above:
a_n=data_n[ind,]
t_n=data_n[-ind,]
The classes of the test set can be predicted with the knn function from the class package, starting with k=5:
library(class)
pred.a <- knn(train = a_n[,1:57], test = a_n[,1:57],
cl = a_n$class, k=5)
pred.t <- knn(train = a_n[,1:57], test = t_n[,1:57],
cl = a_n$class, k=5)
We then use the confusionMatrix function, which automatically computes a number of summary statistics:
mc.knn.a=confusionMatrix(pred.a,a_n$class)
mc.knn.t=confusionMatrix(pred.t,t_n$class)
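The choice k=5 is arbitrary; a minimal sketch to compare the test accuracy over a few values of k:
ks=c(1,3,5,7,11,15)
acc=sapply(ks,function(k){
  p=knn(train=a_n[,1:57],test=t_n[,1:57],cl=a_n$class,k=k)
  mean(p==t_n$class)
})
data.frame(k=ks,accuracy=acc)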
Naive Bayes is fitted with the naiveBayes function of the e1071 package:
library(e1071)
fit.nb <- naiveBayes(class ~ ., data = a)
mc.nb.a=confusionMatrix(predict(fit.nb,a),a$class)
mc.nb.t=confusionMatrix(predict(fit.nb,newdata=t),t$class)
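predict can also return the posterior probabilities with type="raw", for example:
head(predict(fit.nb,newdata=t,type="raw"))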
We gather the accuracy of each model, with its 95% confidence interval, on the training and test sets:
df=as.data.frame(rbind(c("ct","training",mc.ct.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("ct","test",mc.ct.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("svm","training",mc.svm.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("svm","test",mc.svm.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("glm","training",mc.glm.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("glm","test",mc.glm.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("lda","training",mc.lda.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("lda","test",mc.lda.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("knn","training",mc.knn.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("knn","test",mc.knn.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("nb","training",mc.nb.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("nb","test",mc.nb.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")])
))
colnames(df)[1:2]=c("algo","type")
df[,"AccuracyLower"]=as.numeric(as.character(df[,"AccuracyLower"]))
df[,"Accuracy"]=as.numeric(as.character(df[,"Accuracy"]))
df[,"AccuracyUpper"]=as.numeric(as.character(df[,"AccuracyUpper"]))
The comparison is plotted with ggplot2 (crossbars show the confidence intervals):
p <- ggplot(df, aes(algo, Accuracy, colour = type))
p + geom_crossbar(aes(ymin = AccuracyLower, ymax = AccuracyUpper))
Bagging of trees is provided by the bagging function of the ipred package, here with 60 bootstrap replicates:
library(ipred)
fit.bag=bagging(class ~.,a,nbagg=60)
mc.bag.a=confusionMatrix(predict(fit.bag,newdata=a),a$class)
mc.bag.t=confusionMatrix(predict(fit.bag,newdata=t),t$class)
The char_freq_* names contain special characters that the randomForest formula interface does not handle, so columns 49 to 54 are renamed first:
colnames(a)[49:54]=paste0("v_",1:6)
colnames(t)[49:54]=paste0("v_",1:6)
library(randomForest)
fit.rf=randomForest(class~.,data=a)
mc.rf.a=confusionMatrix(predict(fit.rf,newdata=a,type="class"),a$class)
mc.rf.t=confusionMatrix(predict(fit.rf,newdata=t,type="class"),t$class)
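Random forests also provide a variable-importance measure; a quick sketch:
head(importance(fit.rf))   # mean decrease in Gini
varImpPlot(fit.rf)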
df2=as.data.frame(rbind(c("bagging","training",mc.bag.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("bagging","test",mc.bag.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("rf","training",mc.rf.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("rf","test",mc.rf.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")])))
df2[,"AccuracyLower"]=as.numeric(as.character(df2[,"AccuracyLower"]))
df2[,"Accuracy"]=as.numeric(as.character(df2[,"Accuracy"]))
df2[,"AccuracyUpper"]=as.numeric(as.character(df2[,"AccuracyUpper"]))
colnames(df2)[1:2]=c("algo","type")
Finally, the ensemble methods are added to the comparison plot:
p <- ggplot(rbind(df,df2), aes(algo, Accuracy, colour = type))
p + geom_crossbar(aes(ymin = AccuracyLower, ymax = AccuracyUpper))