 #####################################################
#                                                     #
#  Code for the article:                              #
#                                                     #
#  Application of classification trees for improving  #
#  optical identification of common opaque minerals   #                                                
#                                                     #
#  Journal: Computers & Geosciences                   #
#  Authors: Juan L. Dominguez-Olmedo,                 #
#           Manuel Toscano,                           #
#           Jacinto Mata                              #
#           (University of Huelva, Spain)             #
#                                                     #
#  https://doi.org/10.1016/j.cageo.2020.104480        #
#                                                     #
 #####################################################
#                                                     #
#  Input:  a text file (CSV) with the minerals data   #
#  Output: a text file with the groups created and    #
#          a text file with the decision tree         #
#                                                     #
#  Files are read and written in the directory        #
#  where this program resides                         #
#                                                     #
 #####################################################

# load the package (uncomment next line if C50 is not previously installed)
# install.packages("C50")
library(C50)

# read the data file into a data frame; all columns but the last are treated
# as mineral properties below
# NOTE(review): since R 4.0 read.csv() no longer converts strings to factors
# by default -- confirm the tree-building step still gets the column types
# it expects
mineralsData = read.csv("minerals.csv", header=TRUE, sep=",")
if (nrow(mineralsData) == 0) {
  stop("minerals.csv contains no data rows")
}

# concatenate all the property values for each mineral into one key string
# ("<name> <value> <name> <value> ..."); minerals with equal keys share
# exactly the same properties
# seq_len(ncol - 1) is used because `1:ncol(x)-1` parses as `(1:ncol(x)) - 1`
# and would start at column 0
propertyCols = seq_len(ncol(mineralsData) - 1)
allprops = rep("", nrow(mineralsData))
for (j in propertyCols) {
  # paste() is vectorised, so each pass appends column j for every mineral
  allprops = paste(allprops, colnames(mineralsData)[j], mineralsData[[j]])
}

# group the minerals with the same properties:
#   PROPERTIES - the shared property key
#   SIZE       - number of minerals having that key
#   MINERALS   - names of those minerals, space separated, in data-file order
mgroups = data.frame(table(allprops))
colnames(mgroups) = c("PROPERTIES", "SIZE")

# collapse the mineral names of each key in one step; this avoids the
# leading-space trimming via substr(), which cut lists off at 998 characters
membersByProps = vapply(
  split(as.character(mineralsData$MINERAL), allprops),
  paste,
  character(1),
  collapse = " "
)
mgroups$MINERALS = unname(membersByProps[as.character(mgroups$PROPERTIES)])

# keep only the property groups shared by two or more minerals and sort
# their member lists alphabetically
multiMineral = mgroups[mgroups$SIZE > 1, ]
groups = multiMineral$MINERALS[order(multiMineral$MINERALS)]

# one line per group: "<group number> --> <mineral names>"
write.table(
  groups,
  "minerals-groups.txt",
  col.names = FALSE,
  quote = FALSE,
  sep = " --> "
)

# create the target variable: a mineral with a unique set of properties keeps
# its own name, while minerals sharing their properties with others get the
# label of their group ("==[ GROUP nnn ]==", numbered by position in `groups`)
rowGroup = match(allprops, mgroups$PROPERTIES)
isShared = mgroups$SIZE[rowGroup] > 1
groupNumber = match(mgroups$MINERALS[rowGroup], groups)

mineralsData$TARGET = as.character(mineralsData$MINERAL)
mineralsData$TARGET[isShared] = paste("==[ GROUP", sprintf("%03d", groupNumber[isShared]), "]==")
mineralsData$TARGET = as.factor(mineralsData$TARGET)

# obtain the decision tree
# The response is mineralsData$TARGET; the predictors are every column except
# the last two (TARGET was appended last; the one before it is presumably
# MINERAL -- confirm against the data file layout).
# minCases=1 and noGlobalPruning=TRUE are passed through C5.0Control
# (presumably so that single-mineral leaves are kept -- see the C50 docs).
# NOTE(review): the summary() text for C5.0 models appears to echo the model
# call, so rewording the call below may change minerals-tree.txt -- confirm
# before refactoring.
tree = C5.0(mineralsData$TARGET~ ., data=mineralsData[1:(ncol(mineralsData)-2)], control=C5.0Control(minCases=1, noGlobalPruning=TRUE))
# keep the printed summary as a character vector, one element per output line
treeInfo = capture.output(summary(tree))

# incorporate the mineral names for each group: every ": ==[ GROUP nnn ]=="
# label in the tree listing is replaced by ": <names of the group's minerals>"
# seq_along() makes the loop a no-op when there are no groups, whereas
# 1:length(groups) would iterate over c(1, 0)
for (i in seq_along(groups)) {
  groupLabel = paste(": ==[ GROUP", sprintf("%03d", i), "]==")
  # fixed = TRUE: the label contains "[" and "]", which must match literally
  treeInfo = gsub(groupLabel, paste(":", groups[i]), treeInfo, fixed = TRUE)
}

# write the decision tree
write(treeInfo, "minerals-tree.txt")
