-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCode.Rmd
123 lines (102 loc) · 2.75 KB
/
Code.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
```{r}
library(tidyverse)
library(ranger)
```
```{r}
# Load the log records that the Python parsing step exported as CSV
data <- read.csv(
  file = "/Users/dishadh/Downloads/HDFS_1/newlog.csv"
)
```
```{r}
# Combine the separate Date and Time columns into one POSIXct timestamp.
# A single space separates the two parts so the text matches the
# "%Y-%m-%d %H:%M:%S" format literally instead of relying on lenient
# whitespace matching. No tz is given, so the session time zone is used.
data$date <- as.POSIXct(
  paste(data$Date, data$Time),
  format = "%Y-%m-%d %H:%M:%S"
)
# as.POSIXct() yields NA for text it cannot parse; report that right away
# instead of letting the NAs surface later in the pipeline.
if (anyNA(data$date)) {
  warning(
    sum(is.na(data$date)),
    " row(s) have a Date/Time that could not be parsed",
    call. = FALSE
  )
}
# The source columns are redundant once the combined timestamp exists
data <- subset(data, select = -c(Date, Time))
```
```{r}
# Drop rows whose Response is unusable: either the literal text "#N/A"
# or a genuine NA. grepl() returns FALSE for NA input, so real NAs have to
# be tested separately. fixed = TRUE matches "#N/A" as plain text.
data <- data[
  !is.na(data$Response) & !grepl("#N/A", data$Response, fixed = TRUE),
]
```
```{r}
# List the distinct values present in the Response column
unique(data[["Response"]])
```
```{r}
# Attach grouping by date, block id, process id and response.
# NOTE(review): no summarise/mutate follows in this file, so this only adds
# grouping metadata and leaves the rows unchanged - confirm it is needed.
gdata <- group_by(data, date, Block_ID, PID, Response)
```
```{r}
# Recode the Response labels ("Normal" -> 0, "Anomaly" -> 1) and turn the
# column into a factor; any other value is kept as it is.
gdata$Response <- as.factor(
  replace(
    replace(gdata$Response, gdata$Response == "Normal", 0),
    gdata$Response == "Anomaly",
    1
  )
)
```
```{r}
# Split the data 70/30 into a training and a testing set.
# The seed is fixed so the same rows are drawn on every run, which keeps
# the reported errors and confusion matrices reproducible.
set.seed(42)
dt <- sort(sample.int(nrow(gdata), size = floor(nrow(gdata) * 0.7)))
gdata.train <- gdata[dt, ]
# setdiff() rather than gdata[-dt, ]: a negative index built from an empty
# dt would select zero rows instead of all of them.
gdata.test <- gdata[setdiff(seq_len(nrow(gdata)), dt), ]
# Keep the true test labels aside, then remove them from the test features
Response <- gdata.test$Response
gdata.test <- subset(gdata.test, select = -Response)
```
```{r}
# Train a random forest that predicts Response from every other column,
# keeping the forest and recording impurity-based variable importance
rf <- ranger(
  formula = Response ~ .,
  data = gdata.train,
  mtry = 3,
  importance = "impurity",
  write.forest = TRUE
)
```
```{r}
# Report the fit diagnostics of the first model.
# ranger computes prediction.error and confusion.matrix from out-of-bag
# (OOB) samples of the training data, so the messages say so explicitly.
cat("The training (out-of-bag) prediction error is:", rf$prediction.error,
    "for", rf$num.trees, "trees\n")
cat("The out-of-bag confusion matrix for the training model is:\n")
rf$confusion.matrix
# Output of the random forest (the stored forest object)
rf$forest
```
```{r}
# Score the held-out test rows with the fitted forest
pred <- predict(object = rf, data = gdata.test)
```
```{r}
# Print the confusion matrix for the test set: the true labels held back
# from the test rows (table rows) against the labels the forest predicted
# for those rows (table columns).
# rf$confusion.matrix is not used here because it is the out-of-bag matrix
# of the training fit, not a test-set result.
cat("The confusion matrix for the test set is: ")
table(Response, predictions(pred))
```
```{r}
# Accuracy of the training fit, estimated from out-of-bag samples:
# for a classification forest prediction.error is the share of
# misclassified observations, so accuracy is its complement.
cat("The training (out-of-bag) accuracy is:", 1 - rf$prediction.error, "\n")
```
```{r}
# Per-variable importance scores recorded by the fitted forest
ranger::importance(x = rf)
```
```{r}
# Remove the columns that have low importance (root and succeeded)
data2 <- gdata %>%
  dplyr::select(-c(root, succeeded))
```
```{r}
# Split the reduced data 70/30 into a training and a testing set.
# The seed is fixed so the same rows are drawn on every run, which keeps
# the reported errors and confusion matrices reproducible.
set.seed(42)
dt2 <- sort(sample.int(nrow(data2), size = floor(nrow(data2) * 0.7)))
data2.train <- data2[dt2, ]
# setdiff() rather than data2[-dt2, ]: a negative index built from an empty
# dt2 would select zero rows instead of all of them.
data2.test <- data2[setdiff(seq_len(nrow(data2)), dt2), ]
# Keep the true test labels aside, then remove them from the test features
Response2 <- data2.test$Response
data2.test <- subset(data2.test, select = -Response)
```
```{r}
# Train a second random forest on the reduced column set, again keeping
# the forest and recording impurity-based variable importance
rf2 <- ranger(
  formula = Response ~ .,
  data = data2.train,
  mtry = 3,
  importance = "impurity",
  write.forest = TRUE
)
```
```{r}
# Score the held-out rows of the reduced data with the second forest
pred2 <- predict(object = rf2, data = data2.test)
```
```{r}
# Report the fit diagnostics of the second model.
# ranger computes prediction.error and confusion.matrix from out-of-bag
# (OOB) samples of the training data, so the messages say so explicitly.
cat("The training (out-of-bag) prediction error is:", rf2$prediction.error,
    "for", rf2$num.trees, "trees\n")
cat("The out-of-bag confusion matrix for the training model is:\n")
rf2$confusion.matrix
# Output of the random forest (printed model summary)
rf2
```
```{r}
# Cross-tabulate the true test labels (rows) against the labels predicted
# by the second forest (columns)
cat("The confusion matrix for the test set is: ")
table(Response2, ranger::predictions(x = pred2))
```