1 Introduction

Several machine learning algorithms are illustrated on the problem of spam detection.

The starting database must be a set of emails. Text mining is then used to clean the texts and build a feature matrix. The features can be of several types, for example the frequency of given words or of given special characters in each message.

Beyond word counts, other email-specific features can also be constructed, such as the lengths of runs of consecutive capital letters.

Building the feature matrix is often called feature engineering. It is an essential step. Here we use a database well known in data mining: spambase, by George Forman, built from a set of emails received at Hewlett-Packard Labs.
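
As an illustration of what such feature engineering could look like on raw texts, here is a minimal sketch with the tm package. It is purely illustrative and not part of the spambase pipeline; mails is a hypothetical character vector holding one raw email per element.

library(tm)
# `mails` is a hypothetical character vector, one raw email per element
corpus <- VCorpus(VectorSource(mails))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
dtm <- DocumentTermMatrix(corpus)  # term-frequency feature matrix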

2 Exploring the database

The database can be downloaded from the UC Irvine Machine Learning Repository website.
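
The original setup chunk is not shown; the code below assumes the following packages are loaded (kernlab is attached later in the text):

library(caret)         # createDataPartition, confusionMatrix
library(rpart)         # classification trees
library(rattle)        # fancyRpartPlot
library(pROC)          # ROC curves
library(MASS)          # lda
library(class)         # knn
library(e1071)         # naiveBayes
library(ipred)         # bagging
library(randomForest)  # random forests
library(ggplot2)       # plots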

data=read.table("spambase.data",header=F,sep=",")
# column names are stored in a separate file (note: `names` masks base::names here)
names=read.csv2("names.csv",header=F)
names
##                            V1
## 1              word_freq_make
## 2           word_freq_address
## 3               word_freq_all
## 4                word_freq_3d
## 5               word_freq_our
## 6              word_freq_over
## 7            word_freq_remove
## 8          word_freq_internet
## 9             word_freq_order
## 10             word_freq_mail
## 11          word_freq_receive
## 12             word_freq_will
## 13           word_freq_people
## 14           word_freq_report
## 15        word_freq_addresses
## 16             word_freq_free
## 17         word_freq_business
## 18            word_freq_email
## 19              word_freq_you
## 20           word_freq_credit
## 21             word_freq_your
## 22             word_freq_font
## 23              word_freq_000
## 24            word_freq_money
## 25               word_freq_hp
## 26              word_freq_hpl
## 27           word_freq_george
## 28              word_freq_650
## 29              word_freq_lab
## 30             word_freq_labs
## 31           word_freq_telnet
## 32              word_freq_857
## 33             word_freq_data
## 34              word_freq_415
## 35               word_freq_85
## 36       word_freq_technology
## 37             word_freq_1999
## 38            word_freq_parts
## 39               word_freq_pm
## 40           word_freq_direct
## 41               word_freq_cs
## 42          word_freq_meeting
## 43         word_freq_original
## 44          word_freq_project
## 45               word_freq_re
## 46              word_freq_edu
## 47            word_freq_table
## 48       word_freq_conference
## 49                char_freq_;
## 50                char_freq_(
## 51                char_freq_[
## 52                char_freq_!
## 53                char_freq_$
## 54                char_freq_#
## 55 capital_run_length_average
## 56 capital_run_length_longest
## 57   capital_run_length_total
colnames(data)=c(as.character(names[,1]),"class")
dim(data)
## [1] 4601   58
data$class=as.factor(as.character(data$class))

table(data$class)
## 
##    0    1 
## 2788 1813

3 Building the models

3.1 Training set

The data are split with createDataPartition from the caret package, which samples within each class so that the class proportions are preserved: 30% of the rows go to the training set a and the rest to the test set t.

ind=createDataPartition(data$class, times = 1,p = 0.3,list=FALSE)
a=data[ind,]
t=data[-ind,]
dim(a);dim(t)
## [1] 1381   58
## [1] 3220   58
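
Since createDataPartition samples within each class, the spam/non-spam proportions should be nearly identical in both sets; a quick check (not in the original):

round(prop.table(table(a$class)), 3)
round(prop.table(table(t$class)), 3)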

3.2 Classification trees

3.2.1 Fitting the decision tree

formula=as.formula(class ~ .)

fit.ct <- rpart(formula, a, method = "class")

# Display the tree structure
plot(fit.ct) 
# Add the split labels
text(fit.ct)

For a more polished tree, fancyRpartPlot from the rattle package:

fancyRpartPlot(fit.ct,main="",sub="")

3.2.2 Confusion matrix

mc.ct.a=confusionMatrix(predict(fit.ct,type="class"),a$class)
mc.ct.t=confusionMatrix(predict(fit.ct,newdata=t,type="class"),t$class)
mc.ct.a
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 812  87
##          1  25 457
##                                           
##                Accuracy : 0.9189          
##                  95% CI : (0.9032, 0.9328)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8267          
##  Mcnemar's Test P-Value : 8.216e-09       
##                                           
##             Sensitivity : 0.9701          
##             Specificity : 0.8401          
##          Pos Pred Value : 0.9032          
##          Neg Pred Value : 0.9481          
##              Prevalence : 0.6061          
##          Detection Rate : 0.5880          
##    Detection Prevalence : 0.6510          
##       Balanced Accuracy : 0.9051          
##                                           
##        'Positive' Class : 0               
## 
mc.ct.t
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1843  214
##          1  108 1055
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.8891, 0.9102)
##     No Information Rate : 0.6059          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7875          
##  Mcnemar's Test P-Value : 4.874e-09       
##                                           
##             Sensitivity : 0.9446          
##             Specificity : 0.8314          
##          Pos Pred Value : 0.8960          
##          Neg Pred Value : 0.9071          
##              Prevalence : 0.6059          
##          Detection Rate : 0.5724          
##    Detection Prevalence : 0.6388          
##       Balanced Accuracy : 0.8880          
##                                           
##        'Positive' Class : 0               
## 

3.2.3 Predicting class probabilities

head(predict(fit.ct,type="prob"))
##             0         1
## 1  0.03724928 0.9627507
## 5  0.03724928 0.9627507
## 16 0.03724928 0.9627507
## 17 0.03724928 0.9627507
## 21 0.09523810 0.9047619
## 28 0.04545455 0.9545455
roc.ct=roc(a$class,predict(fit.ct,newdata=a,type="prob")[,1])
plot(roc.ct)

## 
## Call:
## roc.default(response = a$class, predictor = predict(fit.ct, newdata = a,     type = "prob")[, 1])
## 
## Data: predict(fit.ct, newdata = a, type = "prob")[, 1] in 837 controls (a$class 0) > 544 cases (a$class 1).
## Area under the curve: 0.9148
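
The same curve can be computed on the test set (a sketch, not shown in the original):

roc.ct.t <- roc(t$class, predict(fit.ct, newdata = t, type = "prob")[, 1])
auc(roc.ct.t)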

3.2.4 Tuning

If we limit the tree depth to 2:

fit.ct=rpart(formula,a,method = "class",
             control=rpart.control(maxdepth = 2))
fancyRpartPlot(fit.ct,main="",sub="")

At the other extreme, an almost fully grown tree (minsplit = 3, cp = 0):

fit.ct=rpart(formula,a,method = "class",
              control=rpart.control(minsplit=3, cp=0.000))
fancyRpartPlot(fit.ct,main="",sub="")
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# summary(fit.ct)  # very verbose for a tree this size
printcp(fit.ct)    # complexity parameter (CP) table
## 
## Classification tree:
## rpart(formula = formula, data = a, method = "class", control = rpart.control(minsplit = 3, 
##     cp = 0))
## 
## Variables actually used in tree construction:
##  [1] capital_run_length_average capital_run_length_longest
##  [3] capital_run_length_total   char_freq_!               
##  [5] char_freq_$                char_freq_(               
##  [7] char_freq_;                word_freq_1999            
##  [9] word_freq_650              word_freq_address         
## [11] word_freq_addresses        word_freq_all             
## [13] word_freq_business         word_freq_data            
## [15] word_freq_edu              word_freq_email           
## [17] word_freq_font             word_freq_free            
## [19] word_freq_george           word_freq_hp              
## [21] word_freq_hpl              word_freq_internet        
## [23] word_freq_lab              word_freq_mail            
## [25] word_freq_make             word_freq_money           
## [27] word_freq_order            word_freq_our             
## [29] word_freq_over             word_freq_people          
## [31] word_freq_re               word_freq_receive         
## [33] word_freq_remove           word_freq_report          
## [35] word_freq_technology       word_freq_will            
## [37] word_freq_you              word_freq_your            
## 
## Root node error: 544/1381 = 0.39392
## 
## n= 1381 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.47058824      0 1.0000000 1.00000 0.033378
## 2  0.08823529      1 0.5294118 0.57169 0.028535
## 3  0.06250000      2 0.4411765 0.49265 0.027016
## 4  0.03676471      3 0.3786765 0.41360 0.025228
## 5  0.03125000      4 0.3419118 0.37132 0.024140
## 6  0.02757353      5 0.3106618 0.36765 0.024041
## 7  0.02389706      6 0.2830882 0.35478 0.023686
## 8  0.01838235      7 0.2591912 0.31985 0.022669
## 9  0.01286765      8 0.2408088 0.30882 0.022330
## 10 0.01102941      9 0.2279412 0.29596 0.021923
## 11 0.00735294     11 0.2058824 0.29044 0.021744
## 12 0.00551471     13 0.1911765 0.27757 0.021318
## 13 0.00459559     17 0.1691176 0.26471 0.020877
## 14 0.00428922     19 0.1599265 0.26471 0.020877
## 15 0.00367647     22 0.1470588 0.25184 0.020421
## 16 0.00306373     31 0.1139706 0.25184 0.020421
## 17 0.00183824     36 0.0937500 0.23713 0.019879
## 18 0.00110294     64 0.0422794 0.24449 0.020153
## 19 0.00091912     70 0.0349265 0.27574 0.021256
## 20 0.00061275     94 0.0128676 0.27390 0.021194
## 21 0.00000000    100 0.0091912 0.27390 0.021194
plotcp(fit.ct)
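
The xerror column is the cross-validated relative error, minimized here around CP ≈ 0.0018 (row 17). The tree can then be pruned at that value; a sketch using the fitted cptable:

# CP value with the lowest cross-validated error
best.cp <- fit.ct$cptable[which.min(fit.ct$cptable[, "xerror"]), "CP"]
fit.ct.pruned <- prune(fit.ct, cp = best.cp)
fancyRpartPlot(fit.ct.pruned, main = "", sub = "")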

3.3 SVM

3.3.1 Training

library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
fit.svm=ksvm(class~.,a)
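
By default ksvm uses a Gaussian RBF kernel (rbfdot) with its sigma hyperparameter estimated automatically and cost C = 1. These can be set explicitly; a sketch with illustrative values:

fit.svm <- ksvm(class ~ ., data = a,
                kernel = "rbfdot",  # Gaussian RBF kernel (the default)
                C = 1,              # cost of constraint violations
                prob.model = TRUE)  # also fit a model for class probabilities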

3.3.2 Predictions

mc.svm.a=confusionMatrix(predict(fit.svm,type="response"),a$class)
mc.svm.t=confusionMatrix(predict(fit.svm,newdata=t,type="response"),t$class)

mc.svm.a
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 818  55
##          1  19 489
##                                           
##                Accuracy : 0.9464          
##                  95% CI : (0.9332, 0.9577)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8865          
##  Mcnemar's Test P-Value : 4.728e-05       
##                                           
##             Sensitivity : 0.9773          
##             Specificity : 0.8989          
##          Pos Pred Value : 0.9370          
##          Neg Pred Value : 0.9626          
##              Prevalence : 0.6061          
##          Detection Rate : 0.5923          
##    Detection Prevalence : 0.6322          
##       Balanced Accuracy : 0.9381          
##                                           
##        'Positive' Class : 0               
## 
mc.svm.t
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1884  183
##          1   67 1086
##                                           
##                Accuracy : 0.9224          
##                  95% CI : (0.9126, 0.9314)
##     No Information Rate : 0.6059          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8348          
##  Mcnemar's Test P-Value : 3.51e-13        
##                                           
##             Sensitivity : 0.9657          
##             Specificity : 0.8558          
##          Pos Pred Value : 0.9115          
##          Neg Pred Value : 0.9419          
##              Prevalence : 0.6059          
##          Detection Rate : 0.5851          
##    Detection Prevalence : 0.6419          
##       Balanced Accuracy : 0.9107          
##                                           
##        'Positive' Class : 0               
## 

3.4 Logistic Regression

3.4.1 Training

fit.glm=glm(class~.,family=binomial(link="logit"),a)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.glm)
## 
## Call:
## glm(formula = class ~ ., family = binomial(link = "logit"), data = a)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0461  -0.1816   0.0000   0.0536   3.0689  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.667e+00  2.681e-01  -6.219 5.02e-10 ***
## word_freq_make             -1.598e+00  6.398e-01  -2.498 0.012505 *  
## word_freq_address          -1.901e-01  1.743e-01  -1.091 0.275321    
## word_freq_all              -3.904e-01  2.534e-01  -1.540 0.123446    
## word_freq_3d                1.139e+00  2.055e+00   0.554 0.579406    
## word_freq_our               2.662e-01  1.492e-01   1.784 0.074391 .  
## word_freq_over              1.888e-01  2.997e-01   0.630 0.528703    
## word_freq_remove            4.334e+00  1.026e+00   4.222 2.42e-05 ***
## word_freq_internet          1.375e+00  5.251e-01   2.619 0.008819 ** 
## word_freq_order             1.358e+00  7.427e-01   1.829 0.067410 .  
## word_freq_mail              4.067e-02  9.903e-02   0.411 0.681282    
## word_freq_receive           9.248e-02  5.786e-01   0.160 0.873022    
## word_freq_will             -2.429e-01  1.620e-01  -1.500 0.133677    
## word_freq_people           -2.806e-01  4.385e-01  -0.640 0.522283    
## word_freq_report            1.677e-01  3.252e-01   0.516 0.606154    
## word_freq_addresses         6.954e+00  2.494e+00   2.789 0.005291 ** 
## word_freq_free              7.099e-01  2.580e-01   2.751 0.005933 ** 
## word_freq_business          1.241e+00  4.856e-01   2.556 0.010595 *  
## word_freq_email            -6.420e-01  3.267e-01  -1.965 0.049374 *  
## word_freq_you               1.139e-01  7.346e-02   1.551 0.120981    
## word_freq_credit            1.079e+00  1.108e+00   0.974 0.329902    
## word_freq_your              3.902e-01  1.188e-01   3.284 0.001022 ** 
## word_freq_font              3.418e-01  3.668e-01   0.932 0.351472    
## word_freq_000               5.386e-01  5.683e-01   0.948 0.343269    
## word_freq_money             1.660e+00  7.168e-01   2.316 0.020562 *  
## word_freq_hp               -2.794e+00  7.031e-01  -3.974 7.05e-05 ***
## word_freq_hpl              -1.296e+00  8.223e-01  -1.576 0.114992    
## word_freq_george           -2.752e+00  1.081e+00  -2.545 0.010919 *  
## word_freq_650               3.404e-01  4.928e-01   0.691 0.489640    
## word_freq_lab              -4.298e+00  4.102e+00  -1.048 0.294798    
## word_freq_labs             -7.642e+00  2.257e+00  -3.386 0.000709 ***
## word_freq_telnet           -1.016e+02  1.514e+04  -0.007 0.994643    
## word_freq_857               1.413e+02  2.716e+05   0.001 0.999585    
## word_freq_data             -7.883e-01  5.077e-01  -1.553 0.120511    
## word_freq_415              -1.882e+02  2.712e+05  -0.001 0.999446    
## word_freq_85               -3.316e+00  2.601e+00  -1.275 0.202393    
## word_freq_technology        2.576e+00  7.485e-01   3.442 0.000577 ***
## word_freq_1999              3.105e-01  3.040e-01   1.021 0.307018    
## word_freq_parts            -5.515e-01  6.683e-01  -0.825 0.409214    
## word_freq_pm                1.014e-01  5.861e-01   0.173 0.862708    
## word_freq_direct           -3.342e+00  1.716e+00  -1.947 0.051507 .  
## word_freq_cs               -6.807e+02  3.238e+04  -0.021 0.983231    
## word_freq_meeting          -3.188e+00  2.007e+00  -1.588 0.112217    
## word_freq_original         -7.882e-01  7.195e-01  -1.095 0.273332    
## word_freq_project          -1.927e+00  1.112e+00  -1.733 0.083152 .  
## word_freq_re               -1.404e+00  3.773e-01  -3.720 0.000199 ***
## word_freq_edu              -1.012e+00  4.242e-01  -2.385 0.017091 *  
## word_freq_table            -1.602e+00  3.594e+00  -0.446 0.655879    
## word_freq_conference       -2.636e+00  1.735e+00  -1.519 0.128663    
## `char_freq_;`              -1.270e+00  9.520e-01  -1.334 0.182252    
## `char_freq_(`               3.908e-01  5.124e-01   0.763 0.445689    
## `char_freq_[`               8.695e-01  2.291e+00   0.380 0.704285    
## `char_freq_!`               7.285e-01  2.652e-01   2.747 0.006011 ** 
## `char_freq_$`               7.878e+00  2.091e+00   3.767 0.000165 ***
## `char_freq_#`               2.692e-01  1.275e+00   0.211 0.832789    
## capital_run_length_average -2.907e-02  2.921e-02  -0.995 0.319655    
## capital_run_length_longest  2.046e-02  5.738e-03   3.567 0.000362 ***
## capital_run_length_total    7.981e-04  4.130e-04   1.933 0.053272 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1851.83  on 1380  degrees of freedom
## Residual deviance:  488.82  on 1323  degrees of freedom
## AIC: 604.82
## 
## Number of Fisher Scoring iterations: 23
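
The glm.fit warning, together with the enormous coefficients and standard errors on variables such as word_freq_857, word_freq_415 and word_freq_cs, points to quasi-complete separation: some predictors split the two classes perfectly on the training sample, so their maximum-likelihood coefficients diverge. A common remedy is penalized logistic regression; a minimal sketch with glmnet (an assumption, glmnet is not used in the original):

library(glmnet)
x.a <- as.matrix(a[, 1:57])
# Cross-validated lasso-penalized logistic regression
fit.net <- cv.glmnet(x.a, a$class, family = "binomial")
pred.net <- predict(fit.net, newx = as.matrix(t[, 1:57]),
                    s = "lambda.min", type = "class")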

3.4.2 Predictions

mc.glm.a=confusionMatrix(
  factor(ifelse(predict(fit.glm,type="response")>0.5,1,0),
         levels=levels(a$class)),  # confusionMatrix expects factors
  a$class)
mc.glm.t=confusionMatrix(
  factor(ifelse(predict(fit.glm,newdata=t,type="response")>0.5,1,0),
         levels=levels(t$class)),
  t$class)

mc.glm.a
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 798  64
##          1  39 480
##                                           
##                Accuracy : 0.9254          
##                  95% CI : (0.9103, 0.9387)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8425          
##  Mcnemar's Test P-Value : 0.01804         
##                                           
##             Sensitivity : 0.9534          
##             Specificity : 0.8824          
##          Pos Pred Value : 0.9258          
##          Neg Pred Value : 0.9249          
##              Prevalence : 0.6061          
##          Detection Rate : 0.5778          
##    Detection Prevalence : 0.6242          
##       Balanced Accuracy : 0.9179          
##                                           
##        'Positive' Class : 0               
## 
mc.glm.t
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1841  158
##          1  110 1111
##                                           
##                Accuracy : 0.9168          
##                  95% CI : (0.9067, 0.9261)
##     No Information Rate : 0.6059          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8246          
##  Mcnemar's Test P-Value : 0.004092        
##                                           
##             Sensitivity : 0.9436          
##             Specificity : 0.8755          
##          Pos Pred Value : 0.9210          
##          Neg Pred Value : 0.9099          
##              Prevalence : 0.6059          
##          Detection Rate : 0.5717          
##    Detection Prevalence : 0.6208          
##       Balanced Accuracy : 0.9096          
##                                           
##        'Positive' Class : 0               
## 

3.5 Discriminant Analysis

Linear discriminant analysis, with lda from the MASS package:

fit.lda=lda(class~.,data=a)
mc.lda.a=confusionMatrix(predict(fit.lda)$class,a$class)
mc.lda.t=confusionMatrix(predict(fit.lda,newdata=t)$class,t$class)
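
predict.lda also returns posterior probabilities, which can feed a ROC curve exactly as with the tree; a sketch:

roc.lda <- roc(t$class, predict(fit.lda, newdata = t)$posterior[, 1])
auc(roc.lda)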

3.6 kNN

3.6.1 Normalization

The values can be min-max normalized with the following function:

normalize <- function(x) {
    # min-max scaling to [0, 1]
    (x - min(x)) / (max(x) - min(x))
}

datan <- as.data.frame(lapply(data[,1:57], normalize))

All the variables now take values between 0 and 1:

data_n=cbind(datan,class=data[,58])
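
A quick check that the rescaling worked (not in the original):

range(as.matrix(datan))  # should be 0 and 1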

3.6.2 Creating the partition

To evaluate the algorithm, the data are split into the same two sets as before, training and test, reusing the indices ind:

a_n=data_n[ind,]
t_n=data_n[-ind,]

3.6.3 Prediction

The classes of the test set can be predicted with the knn function from the class package. We can start with k = 5 (a way to compare values of k is sketched below).

pred.a <- knn(train = a_n[,1:57], test = a_n[,1:57],
 cl = a_n$class, k=5)
pred.t <- knn(train = a_n[,1:57], test = t_n[,1:57],
 cl = a_n$class, k=5)
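
The choice of k matters; leave-one-out cross-validation on the training set, via knn.cv from the same package, is a quick way to compare a few values (a sketch):

acc <- sapply(1:15, function(k) {
  pred <- knn.cv(train = a_n[, 1:57], cl = a_n$class, k = k)
  mean(pred == a_n$class)  # leave-one-out accuracy for this k
})
which.max(acc)  # k with the best accuracy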

The confusionMatrix function then computes a number of summary statistics automatically:

mc.knn.a=confusionMatrix(pred.a,a_n$class)
mc.knn.t=confusionMatrix(pred.t,t_n$class)

3.7 Naive Bayes Classification

A naive Bayes classifier, with naiveBayes from the e1071 package:

fit.nb <- naiveBayes(class ~ ., data = a)
 
mc.nb.a=confusionMatrix(predict(fit.nb,a),a$class)
mc.nb.t=confusionMatrix(predict(fit.nb,newdata=t),t$class)
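
Posterior probabilities can be obtained with type = "raw" (a sketch):

head(predict(fit.nb, newdata = t, type = "raw"))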

4 Summary of the models

df=as.data.frame(rbind(c("ct","training",mc.ct.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("ct","test",mc.ct.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),

c("svm","training",mc.svm.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("svm","test",mc.svm.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),

c("glm","training",mc.glm.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("glm","test",mc.glm.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),

c("lda","training",mc.lda.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("lda","test",mc.lda.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("knn","training",mc.knn.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("knn","test",mc.knn.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("nb","training",mc.nb.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("nb","test",mc.nb.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")])
))

colnames(df)[1:2]=c("algo","type")
df[,"AccuracyLower"]=as.numeric(as.character(df[,"AccuracyLower"]))
df[,"Accuracy"]=as.numeric(as.character(df[,"Accuracy"]))
df[,"AccuracyUpper"]=as.numeric(as.character(df[,"AccuracyUpper"]))

p <- ggplot(df, aes(algo, Accuracy, colour = type))
p + geom_crossbar(aes(ymin = AccuracyLower, ymax = AccuracyUpper))

5 Model Aggregation

5.1 Bagging

Bagging of classification trees, with bagging from the ipred package:

fit.bag=bagging(class ~.,a,nbagg=60)
mc.bag.a=confusionMatrix(predict(fit.bag,newdata=a),a$class)
mc.bag.t=confusionMatrix(predict(fit.bag,newdata=t),t$class)

5.2 Random Forest

The special characters in the char_freq_ column names are not syntactically valid R names, which the randomForest formula interface does not accept, so those six columns are renamed first:

colnames(a)[49:54]=paste0("v_",1:6)
colnames(t)[49:54]=paste0("v_",1:6)

fit.rf=randomForest(class~.,data=a)

mc.rf.a=confusionMatrix(predict(fit.rf,newdata=a,type="class"),a$class)
mc.rf.t=confusionMatrix(predict(fit.rf,newdata=t,type="class"),t$class)
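
Random forests also provide a variable importance measure; a quick look (not in the original):

varImpPlot(fit.rf)  # mean decrease in Gini impurity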

6 Final Summary

df2=as.data.frame(rbind(c("bagging","training",mc.bag.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("bagging","test",mc.bag.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),

c("rf","training",mc.rf.a$overall[c("AccuracyLower","Accuracy","AccuracyUpper")]),
c("rf","test",mc.rf.t$overall[c("AccuracyLower","Accuracy","AccuracyUpper")])))

df2[,"AccuracyLower"]=as.numeric(as.character(df2[,"AccuracyLower"]))
df2[,"Accuracy"]=as.numeric(as.character(df2[,"Accuracy"]))
df2[,"AccuracyUpper"]=as.numeric(as.character(df2[,"AccuracyUpper"]))

colnames(df2)[1:2]=c("algo","type")

p <- ggplot(rbind(df,df2), aes(algo, Accuracy, colour = type))
p + geom_crossbar(aes(ymin = AccuracyLower, ymax = AccuracyUpper))