License: Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)

  • First version 2024-07-19
  • Current version compiled on {r} format(Sys.time(), '%Y-%m-%d')
  • Authors

1. Import data

SEM element mappings from thin layers of sample DI-17 as grey-level tiffs

library(terra)
library(ggplot2)
library(plotly)
library(gridExtra)
library(RColorBrewer)
library(caret)
library(plyr)
library("dendextend")

# Folder holding the SEM element mappings (grey-level *_BW.tif files)
imadir <- "../MappingsSEM/Archive 2"
fimas <- list.files(imadir, glob2rx("*_BW.tif"))
stopifnot(length(fimas) > 0)  # fail early if no mappings are found

# Read the first band of every mapping and stack them into a single
# multi-layer SpatRaster.
# FIX: the original seeded with fimas[1] and then looped 2:length(fimas),
# which errors for a single file and grows the stack with c() on every
# iteration; building the list first avoids both problems.
r <- rast(lapply(file.path(imadir, fimas), function(f) rast(f)[[1]]))
# Layer names: strip the "_BW_1" suffix so each layer reads as an element
# symbol (sub() is the direct tool; the original coerced a strsplit list)
names(r) <- sub("_BW_1$", "", names(r))
r <- flip(r, direction="v")   # tiff rows are stored top-down; flip to map orientation
crs(r) <- "local"             # arbitrary cartesian coordinates, no georeference
r <- 100*r/255                # rescale grey levels 0-255 to percent 0-100
plot(r, col=grey.colors(256))

plotRGB(r, r=1, g=2, b=3, stretch="lin", main="Al-Ca-Fe as RGB")

writeRaster(r, "DI_17_mappings.tif", filetype="GTiff", overwrite=TRUE)

2. PCA

Note only last PC can be dropped:

# PCA of the element layers (terra provides a princomp method for SpatRaster)
r.pca <- princomp(r)
# Scree plot: cumulative % of variance explained by the first k components
# FIX: corrected the y-axis label typo ("Cummulaitve")
expl.var <- 100*cumsum(r.pca$sdev^2)/sum(r.pca$sdev^2)
plot(expl.var, pch=19, ylab="Cumulative Variance (%)", xlab="PC")

# Project the image onto the components: one output layer per PC
r.pc <- predict(r, r.pca)
# Generalized from the hard-coded 1:7 to however many layers came out
names(r.pc) <- paste("PC-", 1:nlyr(r.pc))

Display PCs

plot(r.pc,col=grey.colors(256))

Color composites of PCs

PC1-PC2-PC3

plotRGB(r.pc, r=1, g=2, b=3, stretch="lin", main="PC1-PC2-PC3 as RGB")

PC2-PC3-PC4

plotRGB(r.pc, r=2, g=3, b=4, stretch="lin", main="PC2-PC3-PC4 as RGB")

3. Classification

3.1 Hierarchical classification of random sample

We use the first 6 PCs for a hierarchical clustering of a random sample of 10000 pixels. As an initial guess, we cut the dendrogram at h=180, which results in 7 clusters (note one cluster has <0.1% of individuals). We also calculate the centroids (the average element composition of each cluster):

Dendrogram

# r.km <- k_means(subset(r.pc, 1:6), centers=5,iter.max=100)
# plot(r.km)
# Random sample of 10000 pixels; we keep both the PCs (for clustering)
# and the element layers (for the composition tables below)
set.seed(100)
r.rnp <- spatSample(c(r.pc,r), size=10000, as.points=TRUE)
# Hierarchical clustering (euclidean distance) on the first 6 PCs
r.pc.hcl <- hclust(dist(values(r.rnp)[,1:6]))
r.pc.dend <- as.dendrogram(r.pc.hcl)
# order_clusters_as_data=FALSE: cluster ids follow dendrogram display order
r.pc.hcl180 <- dendextend::cutree(r.pc.dend,h=180,order_clusters_as_data = FALSE)
options(digits=3) #for reasonable print
# Percentage of sampled pixels falling in each cluster.
# FIX: the original divided by sum(r.pc.hcl180) -- the sum of the cluster
# *labels* -- so the printed shares only added up to ~33%. Dividing by the
# number of sampled pixels makes the table sum to 100%.
100*table(r.pc.hcl180)/length(r.pc.hcl180)
options(digits=7)

#Color branches:
# https://stackoverflow.com/questions/18036094/how-to-create-a-dendrogram-with-colored-branches (answ. #12)
# https://github.com/talgalili/dendextend?tab=readme-ov-file#usage
# https://talgalili.github.io/dendextend/articles/dendextend.html
#https://stackoverflow.com/questions/48636522/label-r-dendrogram-branches-with-correct-group-number for class label display

# Set1 without colour 6 (yellow), one colour per cluster
d2 <- color_branches(r.pc.dend,h=180,col=brewer.pal(8, "Set1")[-6], groupLabels = TRUE) #avoid yellow
#plot(d2, ylim=c(100,270))
plot(d2 %>% set("labels",""))

Centroids

#set.seed(100)
# dendextend::cutree with order_clusters_as_data = FALSE returns labels in
# dendrogram order; reorder them back to data order (by point index stored
# in names) so they align with values(r.rnp)
# (see my protest here: https://github.com/talgalili/dendextend/issues/120)
r.pc.hcl180 <- r.pc.hcl180[order(as.numeric(names(r.pc.hcl180)))]
# Per-cluster means of ALL sampled columns: col 1 = cluster id,
# cols 2:8 = PC means, cols 9:15 = element means (r.rnp = c(r.pc, r))
cents180 <- aggregate(values(r.rnp),by=list(r.pc.hcl180), mean)
names(cents180)[1] <- "hcl180"
#dim(cents180)
options(digits=3) #for reasonable print
# Print only cluster id + the 7 element columns (drop the PC columns)
print(cents180[,c(1,9:15)])
##   hcl180    Al    Ca    Fe     K     Mg    Na   Ti
## 1      1  50.9  28.1 68.09  43.5  40.32 22.66 62.3
## 2      2  44.9  31.3 73.41  27.2  82.90 22.89 59.4
## 3      3  79.8  41.0 65.17  35.9  47.16 46.42 76.4
## 4      4 100.0 100.0  0.00 100.0 100.00 50.00 50.0
## 5      5  28.1  45.6 19.70  28.7  89.96 19.00 26.8
## 6      6  80.4  29.3 30.68  43.0  24.37 47.93 36.3
## 7      7  10.1   9.0  9.35  12.1   9.45  7.81 10.8
options(digits=7) #default
# Long-format table of the element composition per cluster, for plotting
elem180.lf <- reshape2::melt(cents180[,c(1,9:15)], id.var=1)
elem180.lf$hcl180 <- as.factor(elem180.lf$hcl180)
#head(elem180.lf)
names(elem180.lf) <- c("hcl180", "element", "value")
# Same palette as the dendrogram branches: Set1 minus yellow
micolor <- brewer.pal(8, "Set1")[-6] #avoiding yellow
# One point per (cluster, element); grey lines connect each cluster profile
ggcompo1 <- ggplot(data=elem180.lf) +
  geom_line(aes(x=element, y=value, group=hcl180),col="grey") +
  geom_point(aes(x=element, y=value, col=hcl180), size=3) +
  scale_color_manual(values=micolor)
ggcompo1

3.2 Image classification

3.2.1 Hierarchical classification

We use knn with k=1 to apply the hclust classification to the entire image:

# 1-nearest-neighbour extension of the hclust sample labels to every image
# pixel (in 6-PC space). The call is slow, so its result is cached in a.rda
# and reloaded here instead of recomputed.
#a <- knn3Train(test=values(r.pc)[,1:6], train=values(r.rnp)[,1:6], cl=as.factor(r.pc.hcl180),k=1) #slow, save
#save(a, file="a.rda")
load("a.rda")
# Single-layer raster with the image geometry, filled with the knn labels
# NOTE(review): `a` comes from knn3Train and may be character; terra will
# coerce it on assignment -- confirm the resulting layer is numeric 1..7
r.knn1 <- r[[1]]*0
values(r.knn1) <- a
##writeRaster(r.knn1, "knn1.180", filetype="GTiff")
#rm(a)
#plot(r.knn1,col=brewer.pal(8, "Set1")[-6], main="Hierarchical Classification")

Extending the hclust classification to the entire image in this way carries the danger that some elements (pixels) may not actually be similar enough to any of the proposed classes. Therefore, we seek a maximum distance threshold (or minimum similarity threshold) based on the per-class statistics of the sample used for hclust. Elements (pixels) beyond the threshold will be left as unclassified.

# Euclidean distance (in the space of the first 6 PCs) between every sampled
# pixel and every cluster centroid: one column of d per cluster.
# FIX: allocate one column per centroid (nrow(cents180) = 7); the original
# hard-coded ncol=6 and silently relied on d[, 7] extending the data.frame.
d <- data.frame(matrix(0, nrow=nrow(values(r.rnp)), ncol=nrow(cents180)))
for (i in seq_len(nrow(cents180))) {
  ctr <- unlist(cents180[i, 2:7])  # centroid in PC space (cols 2:7 = PCs 1-6)
  d[,i] <- apply(values(r.rnp)[,1:6], 1, function(x) sqrt(sum((x - ctr)^2)))
}
#dim(d)

#now we keep the distance to the class to which the given point has been classified
d$hcl180 <- r.pc.hcl180
# Vectorized lookup: row i takes d[i, assigned_class_i] via matrix
# row/column indexing (replaces the original per-row loop, same values)
cls <- as.numeric(d$hcl180)
dmat <- as.matrix(d[, seq_len(nrow(cents180))])
dcl <- data.frame(hcl180=d$hcl180, d=dmat[cbind(seq_len(nrow(dmat)), cls)])
#head(dcl)
#d is the distance from that point to the assigned class

# Per-class summary statistics of the distance to the assigned centroid.
# FIX: dmin computed mean(d) (copy-paste from dmean -- the rendered table
# showed dmin identical to dmean for every class); it now takes min(d).
dclsumm <- ddply(dcl,"hcl180", summarise, 
      dmin = min(d), 
      dmean=mean(d),
      dmedian=median(d),
      d99=quantile(d, probs=0.99),
      dmax=max(d))

This table presents the summary statistics of the euclidean distance between each element in the hclust sample and the centroid of its assigned class:

# Temporarily reduce print precision so the summary table stays readable,
# then restore the default
options(digits=3) #for reasonable print
dclsumm
options(digits=7) #default
##   hcl180 dmin dmean dmedian   d99  dmax
## 1      1 44.8  44.8    44.2  78.9  93.1
## 2      2 39.4  39.4    36.9 124.8 129.3
## 3      3 38.4  38.4    35.1 112.9 112.9
## 4      4 63.9  63.9    63.9  63.9  63.9
## 5      5 46.8  46.8    45.5 100.9 101.0
## 6      6 47.0  47.0    45.5  78.9  87.8
## 7      7 29.8  29.8    25.8  92.1 127.7

We can select either the maximum or the more severe 99th percentile as the threshold beyond which any element will be left as unclassified. The reasoning behind this decision is that we cannot extend our classification to elements that are different from those used to calculate the centroids.

# Label for the extra "unclassified" class: one past the largest cluster id
u <- max(dclsumm$hcl180) +1

# We need to calculate the euclidean distance between each pixel in the image and the centroids
#First, distances to each class centroid:
# One distance layer per cluster, same geometry as the mappings
dr <- r*0
dr <- subset(dr, 1:nrow(cents180))
names(dr) <- cents180$hcl180
for (i in 1:nrow(cents180)) {
  # Per-pixel euclidean distance in 6-PC space to centroid i.
  # NOTE(review): this relies on terra recycling the length-6 centroid
  # vector across the 6 PC layers and on sum() collapsing layers cell-wise
  # -- confirm against the terra arithmetic documentation.
  dr[[i]] <- sqrt(sum((subset(r.pc,1:6)-unlist(cents180[i,2:7]))^2))
}
names(dr) <- cents180$hcl180
#plot(dr, col=grey.colors(256))
#writeRaster(dr, file="d.tif", filetype="GTiff", overwrite=TRUE)

# Keep, per pixel, only the distance to the class it was assigned to:
# mask each per-class distance layer by the classification and accumulate.
mtot <- r[[1]] * 0
for (cl in 1:nrow(cents180)) {
  mtot <- mtot + ifel(r.knn1 == cl, 1, 0) * dr[[cl]]
}
#plot(mtot, col= grey.colors(256), main="Distance from each pixel to its class centroid")

We set a given pixel as unclassified if this distance is beyond the threshold for its closest centroid. Using the maximum observed distance for each class in the sample, we find that only ~0.1% of the pixels in the image were left as unclassified (class 8), while using the 99th-percentile threshold for each class, ~1% resulted unclassified (class 8).

# Pixels whose distance to their assigned centroid exceeds a per-class
# threshold are moved to the extra class `u` (unclassified).
u <- max(dclsumm$hcl180) +1

# Helper shared by both threshold choices (the original duplicated this
# loop verbatim). `thresholds` is a per-class vector indexed by class id.
flag_unclassified <- function(thresholds) {
  out <- r.knn1
  for (i in dclsumm$hcl180) {
    values(out)[values(r.knn1==i) & values(mtot) > thresholds[i]] <- u
  }
  out
}

#using the maximum observed distance for each class (~0.1% unclassified)
r.knn1b <- flag_unclassified(dclsumm$dmax)
f <- freq(r.knn1b)
f$percent <- f$count*100/sum(f$count)
#options(digits=3) #for reasonable print
#f[,-1]
#options(digits=7)

#using the 99 percentile as threshold (~1% unclassified); this stricter
#version overwrites r.knn1b and is the one used by the rest of the analysis
r.knn1b <- flag_unclassified(dclsumm$d99)
f <- freq(r.knn1b)
f$percent <- f$count*100/sum(f$count)
options(digits=3) #for reasonable print
f[,-1]
options(digits=7)
##   value   count  percent
## 1     1 1105710 5.76e+01
## 2     2   50048 2.61e+00
## 3     3   71834 3.74e+00
## 4     4      13 6.77e-04
## 5     5   20848 1.09e+00
## 6     6  290979 1.52e+01
## 7     7  361176 1.88e+01
## 8     8   19392 1.01e+00

Image

# Extend the class palette with black for the "unclassified" class u
micolor2 <- c(micolor, "black")
plot(r.knn1b, col = micolor2, main = "Hierarchical Classification ")

Centroids

# Mean element composition per class of the thresholded knn classification
# (same composition plot as ggcompo1, but over the whole image)
zstats <- zonal(r, r.knn1b)
elem180knn1.lf <- reshape2::melt(zstats, id.var = 1)
names(elem180knn1.lf) <- c("hcl180", "element", "value")
ggcompo1b <- ggplot(elem180knn1.lf, aes(x = element, y = value)) +
  geom_line(aes(group = hcl180), col = "grey") +
  geom_point(aes(col = as.factor(hcl180)), size = 3) +
  scale_color_manual(values = micolor2)
ggcompo1b

Dendrogram

plot(d2 %>% set("labels",""))

3.2.2 Initialized k-means classification

We use initialized k_means with iter.max=100 to let the algorithm converge to a more stable classification

# K-means on the first 6 PCs, initialized at the hclust centroids (PC
# columns 2:7 of cents180) so the class ids stay comparable
set.seed(100)
r.km180.100 <- k_means(subset(r.pc, 1:6), centers=cents180[,2:7],iter.max=100)
#using trace=TRUE I have found that it performs 3 iterations only
#plot(r.km180.100,col=brewer.pal(6, "Set1"), main="Initialized K-means classification")
#Element composition of classes after k-means convergence
elem180km <- zonal(r,r.km180.100)
# Long format for the per-class composition plot
elem180km.lf <- reshape2::melt(elem180km, id.var=1)
#head(elem180km.lf)
names(elem180km.lf) <- c("hcl180", "element", "value")
# Same composition plot as ggcompo1, now for the converged k-means classes
ggcompo2 <- ggplot(data=elem180km.lf) +
  geom_line(aes(x=element, y=value, group=hcl180),col="grey") +
  geom_point(aes(x=element, y=value, col=as.factor(hcl180)), size=3) +
  #scale_color_brewer(palette="Set1")
  scale_color_manual(values=micolor)

Image

plot(r.km180.100,col=brewer.pal(7, "Set1")[-6], main="Initialized K-means classification")

Centroids

ggcompo2

The kmeans classification improves over the hclust-knn1 classification in terms of image structure, which suggests that the arbitrary initial cut at h=180 was not a good choice. There is room for improvement there and/or apply other non-supervised methods.

3.2.3 Supervised Classification

TBD using the training objects digitized by Adelina. Requires aligning mappings to SEM images.

4. Other TBD

  • Brief SEM image and sample description
  • –More mappings–
  • SEM-object identification supported with mappings