### EPI 2024 for Data Analytics ###

setwd("~/Courses/Data Analytics/Fall24/labs/lab03/")

### read in data
epi2024results06022024 <- read.csv("epi2024results06022024.csv", header=TRUE)
# epi2024weights <- read.csv("epi2024weights.csv")

## copy variables
epi <- epi2024results06022024
attach(epi)

## NA values
na.indexes <- is.na(MHP.new)

## drop NAs
epi.subset <- epi[!na.indexes,]

# ## convert to numeric
# EPI.new <- as.numeric(EPI.new)

## summary stats
summary(epi.subset$MHP.new)

## histograms
hist(MHP.new)
hist(MHP.new, seq(0., 100., 5.0), prob=TRUE)
rug(MHP.new)
lines(density(MHP.new, na.rm=TRUE, bw=1))
lines(density(MHP.new, na.rm=TRUE, bw="SJ"))

## normal distribution: density, cumulative distribution, median quantile, random draws
x <- seq(0., 100., 1.0)
plot(x, dnorm(x, mean=42, sd=10), type="l")
plot(x, pnorm(x, mean=42, sd=10), type="l")
qnorm(.50, mean=42, sd=10)
rnorm(1000, mean=42, sd=10)

hist(EPI.new)
hist(EPI.new, seq(20., 80., 5.0), prob=TRUE)
rug(EPI.new)
lines(density(EPI.new, na.rm=TRUE, bw=1))
lines(density(EPI.new, na.rm=TRUE, bw="SJ"))
x <- seq(20., 80., 1.0)

## Q-Q plots against the normal distribution
qqnorm(EPI.new)
qqline(EPI.new)
qqnorm(EPI.new[which(EPI.new <= 58)])
qqline(EPI.new[which(EPI.new <= 58)])

################
hist(ECO.new)
summary(ECO.new)
hist(ECO.new, seq(20., 90., 2.0), prob=TRUE)
rug(ECO.new)
lines(density(ECO.new, na.rm=TRUE, bw=1))
lines(density(ECO.new, na.rm=TRUE, bw="SJ"))

################
hist(APO.new)
summary(APO.new)
boxplot(APO.new)

## subset of higher APO values
APO.new.high <- APO.new[APO.new > 35]

h0 <- hist(APO.new, seq(0., 100., 1.0), freq=FALSE)
h1 <- hist(APO.new.high, seq(0., 100., 1.0), freq=FALSE)
h1$density

rug(APO.new)
lines(density(APO.new, na.rm=TRUE, bw=1))
lines(density(APO.new, na.rm=TRUE, bw="SJ"))

## overlay normal densities for comparison
x <- seq(5., 90., 5.0)
qn <- dnorm(x, mean=45, sd=10, log=FALSE)
lines(x, qn)
# lines(x, 0.4*qn)
qn <- dnorm(x, mean=65, sd=20, log=FALSE)
lines(x, qn)
# lines(x, 0.12*qn)

########################################################################
##### Naive Bayes #####

library("e1071")
library("ggplot2")

## read data
abalone <- read.csv("~/Courses/Data Analytics/Fall24/labs/Lab02_2/abalone/abalone.data", header=FALSE)

## rename columns
colnames(abalone) <- c("sex", "length", "diameter", "height", "whole_weight",
                       "shucked_weight", "viscera_weight", "shell_weight", "rings")

## derive age group based on number of rings
abalone$age.group <- cut(abalone$rings, br=c(-1, 8, 11, 35),
                         labels=c("young", "adult", "old"))

## alternative way
abalone$age.group[abalone$rings <= 8] <- "young"
abalone$age.group[abalone$rings > 8 & abalone$rings <= 11] <- "adult"
abalone$age.group[abalone$rings > 11] <- "old"

## convert age group from character to factor
abalone$age.group <- as.factor(abalone$age.group)

## drop sex and number of rings
abalone <- abalone[,-c(1,9)]

## train classifier using all data
classifier <- naiveBayes(abalone[,1:7], abalone[,8])

## predict classes
prediction <- predict(classifier, abalone[,1:7])

## evaluate prediction: rows are predicted classes, columns are actual classes
contingency.table <- table(prediction, abalone[,8], dnn=list('predicted','actual'))
print(contingency.table)

## plot whole_weight using means and SDs from the fitted model
parameters <- classifier$tables$whole_weight
m1 <- parameters["young",1][[1]]
m2 <- parameters["adult",1][[1]]
m3 <- parameters["old",1][[1]]
sd1 <- parameters["young",2][[1]]
sd2 <- parameters["adult",2][[1]]
sd3 <- parameters["old",2][[1]]

plot(function(x) dnorm(x, m1, sd1), 0, 3, col="red",
     main="Whole weight distribution for the 3 age groups")
curve(dnorm(x, m2, sd2), add=TRUE, col="blue")
curve(dnorm(x, m3, sd3), add=TRUE, col="green")

## accuracy: sum of the diagonal over the total number of observations
contingency.matrix <- as.matrix(contingency.table)
sum(diag(contingency.matrix)) / length(abalone[,8])
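## Optional: per-class precision and recall can also be read from the same
## contingency table. Minimal sketch, assuming rows are predicted classes and
## columns are actual classes as constructed above; diag/rowSums/colSums are base R.
precision <- diag(contingency.matrix) / rowSums(contingency.matrix)  # correct / all predicted per class
recall    <- diag(contingency.matrix) / colSums(contingency.matrix)  # correct / all actual per class
precision
recall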
#####################################################################
########## kNN ###########

library("class")

## number of rows
n <- nrow(abalone)

## training set indexes (70% of the data)
train.indexes <- sample(n, n*0.7)

## create training/test sets
abalone.train <- abalone[train.indexes,]
abalone.test  <- abalone[-train.indexes,]

## rule of thumb: k is around sqrt(number of training observations), ~2924 here
sqrt(2924)

k = 55

## train kNN model
KNNpred <- knn(train = abalone.train[1:7], test = abalone.test[1:7],
               cl = abalone.train$age.group, k = k)

## evaluate
contingency.table <- table(Predicted = KNNpred, Actual = abalone.test$age.group)
print(contingency.table)

contingency.matrix <- as.matrix(contingency.table)
sum(diag(contingency.matrix)) / length(abalone.test$age.group)

## run tests with multiple k values
accuracy <- c()
ks <- seq(5, 105, 10)

for (k in ks) {
  KNNpred <- knn(train = abalone.train[1:7], test = abalone.test[1:7],
                 cl = abalone.train$age.group, k = k)
  cm <- as.matrix(table(Predicted = KNNpred, Actual = abalone.test$age.group))
  accuracy <- c(accuracy, sum(diag(cm)) / length(abalone.test$age.group))
}

plot(ks, accuracy, type = "b")

#####################################################################
######## k-Means ###########

## plot dataset colored by class
ggplot(abalone, aes(x = length, y = whole_weight, colour = age.group)) +
  geom_point()

## set random number generator start value
set.seed(123)

## train kmeans
abalone.km <- kmeans(abalone[,-8], centers = 3)

## WCSS: total within-cluster sum of squares
abalone.km$tot.withinss

## get and plot clustering output
assigned.clusters <- as.factor(abalone.km$cluster)

ggplot(abalone, aes(x = length, y = whole_weight, colour = assigned.clusters)) +
  geom_point()

# ## experimental!!!
# labeled.clusters <- as.character(assigned.clusters)
#
# labeled.clusters[labeled.clusters==1] <- "old"
# labeled.clusters[labeled.clusters==2] <- "adult"
# labeled.clusters[labeled.clusters==3] <- "young"
#
# table(labeled.clusters, abalone$age.group, dnn=list('predicted','actual'))

## run tests with multiple k values and plot WCSS
wcss <- c()
ks <- c(2, 3, 4, 5)

for (k in ks) {
  abalone.km <- kmeans(abalone[,-8], centers = k)
  wcss <- c(wcss, abalone.km$tot.withinss)
}

plot(ks, wcss, type = "b")

#### END ####
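## Optional addendum: k-means distances are scale-sensitive, so a common
## refinement is to standardize the features before clustering. Minimal sketch
## using base R's scale(); the *.scaled variable names are illustrative.
abalone.scaled <- scale(abalone[,-8])        # z-score each of the 7 numeric features
set.seed(123)
abalone.km.scaled <- kmeans(abalone.scaled, centers = 3)
abalone.km.scaled$tot.withinss               # WCSS on the standardized features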