# Load data dat<-read.csv("islands.csv",header=T) # load libraries library(cluster) library(vegan) library(flashClust) library(maps) library(mapdata) library(clusterCrit) # pick 5000 islands # Here, we will determine how many are in each of the 8 groups, # and randomly select the right proportion from each n_per_grp<-round(table(dat$UPGMAnoAE)*(2500/17883),0) island_numbers<-seq(1:17883) for(g in 1:8) { tmp<-island_numbers[dat$UPGMAnoAE==g] grp_sub<-dat[sample(tmp,n_per_grp[g],replace=FALSE),] if(g==1) { all_grp_sub<-grp_sub } if(g>1) { all_grp_sub<-rbind(all_grp_sub,grp_sub) } } random_islands<-all_grp_sub rm(grp_sub,g,island_numbers,tmp,n_per_grp,all_grp_sub) # weights eig<-c(3.748,1.8,1.037,0.413,0.346,0.318,0.242,0.097) # weighted_variables<-cbind(random_islands$PCAnEAPC1*eig[1]^.5, random_islands$PCAnEAPC2*eig[2]^.5, random_islands$PCAnEAPC3*eig[3]^.5, random_islands$PCAnEAPC4*eig[4]^.5, random_islands$PCAnEAPC5*eig[5]^.5, random_islands$PCAnEAPC6*eig[6]^.5, random_islands$PCAnEAPC7*eig[7]^.5, random_islands$PCnEAPC8*eig[8]^.5) island_cluster<-flashClust(dist(weighted_variables),method="average") grps<-cutree(island_cluster,8) table(random_islands$UPGMAnoAE,grps) grps<-cutree(island_cluster,3) intCriteria(weighted_variables,grps,"Calinski_Harabasz") grps<-cutree(island_cluster,4) intCriteria(weighted_variables,grps,"Calinski_Harabasz") grps<-cutree(island_cluster,5) intCriteria(weighted_variables,grps,"Calinski_Harabasz") grps<-cutree(island_cluster,6) intCriteria(weighted_variables,grps,"Calinski_Harabasz") grps<-cutree(island_cluster,7) intCriteria(weighted_variables,grps,"Calinski_Harabasz") grps<-cutree(island_cluster,8) intCriteria(weighted_variables,grps,"Calinski_Harabasz") grps<-cutree(island_cluster,8) intCriteria(weighted_variables,grps,"Calinski_Harabasz") cophenetic_dist<-cophenetic(island_cluster) cor(dist(weighted_variables),cophenetic_dist) # PAM clustering island_pam<-pam(weighted_variables,k = 8) table(island_pam$clustering,random_islands$PAMnAE) # cascadeKM caskmeans<-cascadeKM(weighted_variables,2,10,iter=200, criterion="ssi") caskmeans$results plot(caskmeans) # fuzzy clustering fuzz_islands<-fanny(weighted_variables,k=8,memb.exp = 1.4) # some of the differences in classification accuracy are due to rounding error of the data provided # If we follow the methods, and do the PCA ourselves, we can avoid the rounding error # Get raw data from the dataset pca_dat<-cbind(log(dat$SLMP+0.5),log(dat$Dist+1),dat$Prec,dat$Temp,log(dat$CCVT+1),dat$varT,dat$varP,dat$GMMC) # Perform PCA on standardized variables pca_results<-rda(pca_dat, scale=T) # extract eigenvalues eig<-pca_results$CA$eig # get PCA scores for the first 8 PCA axes pca_vars<-scores(pca_results, choices=c(1,2,3,4,5,6,7,8))$sites # plot comparing our first PCA score vs the one given plot(pca_vars[,1],dat$PCAnEAPC1) # they look the same... not identical cor(pca_vars[,1],dat$PCAnEAPC1) # create the weighted matrix new_island_matrix<-cbind(pca_vars[,1]*eig[1]^0.5, pca_vars[,2]*eig[2]^0.5, pca_vars[,3]*eig[3]^0.5, pca_vars[,4]*eig[4]^0.5, pca_vars[,5]*eig[5]^0.5, pca_vars[,6]*eig[6]^0.5, pca_vars[,7]*eig[7]^0.5, pca_vars[,8]*eig[8]^0.5) island_cluster<-flashClust(dist(new_island_matrix),method="average") grps<-cutree(island_cluster,8) # compare classifications table(grps,dat$UPGMAnoAE) intCriteria(new_island_matrix,grps,"Calinski_Harabasz") # plot a dendrogram, cut at level 0.45 to show 8 groups island_cluster_den = as.dendrogram(island_cluster) plot(cut(island_cluster_den, h=0.45)$upper) tree_dist<-cophenetic(island_cluster) cor(tree_dist,dist(new_island_matrix)) # PAM of all data # This will take some time for the full dataset pam_islands<-pam(new_island_matrix,8) table(pam_islands$clustering,dat$PAMnAE) intCriteria(new_islands,pam_test$clustering,"Calinski_Harabasz") # plot islands on a map, exporting to an image file # file will be created in your working directory, not displayed jpeg("islands.jpg",width=2500,height=1250,type="windows") map("worldHires") clus_colors<-rainbow(8) grps<-cutree(island_cluster,8) points(dat$Long,dat$Lat,pch=16,cex=1,col=clus_colors[grps]) dev.off()