Demo of using kmeans() function in based R. First make up some data with a known structure.
tmp <- c( rnorm(30, -3), rnorm(30, 3) )
tmp
## [1] -2.4819818 -2.0278957 -3.3969565 -1.7111439 -3.0173595 -3.2946183
## [7] -3.4788731 -3.3652391 -4.5337225 -2.2970754 -2.3208173 -2.5311246
## [13] -3.8970497 -2.0075578 -2.4807178 -3.6138686 -2.2144333 -3.8193872
## [19] -1.6697305 -4.9710879 -3.1607138 -3.2451980 -3.4711989 -4.6520797
## [25] -1.0901790 -2.9792390 -0.8238972 -4.1012722 -2.1893384 -3.7181593
## [31] 3.1075968 1.5325468 1.5670340 3.7854320 3.1068203 4.1599367
## [37] 2.9389311 2.0058426 3.4919571 4.5786147 4.1671266 3.8732522
## [43] 0.8172709 1.3115819 3.9518146 3.5018554 4.6593496 3.9609587
## [49] 3.0542598 2.5607706 1.8266326 1.2267209 1.1292253 3.3591157
## [55] 2.9102630 3.4836716 2.1185431 3.3021818 2.5099162 2.8178982
x <- cbind(x=tmp, y=rev(tmp))
plot(x)
Now we have some made up data in x let’s see how kmeans
works with this data
k <- kmeans(x, centers = 2, nstart = 20)
k
## K-means clustering with 2 clusters of sizes 30, 30
##
## Cluster means:
## x y
## 1 -2.952064 2.893904
## 2 2.893904 -2.952064
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 64.01091 64.01091
## (between_SS / total_SS = 88.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Q. How many points are in each cluster?
k$size
## [1] 30 30
Q. How do we get to the cluster membership/assignment.
k$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
Q. What about cluster centers?
k$centers
## x y
## 1 -2.952064 2.893904
## 2 2.893904 -2.952064
Now we got to the main results. Let’s use them to plot our data with the kmeans result
plot (x, col=k$cluster)
points(k$centers, col="blue", pch=15)
We will cluster the same data x with the
hclust(). In this case hclust() requires a
distance matrix as input.
hc <- hclust( dist(x) )
hc
##
## Call:
## hclust(d = dist(x))
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 60
Let’s plot our hclust result
plot(hc)
To get our cluster membership vector we need to “cut” the tree with
the cutree()
grps <- cutree(hc, h=8)
grps
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
Now plot our data with the hclust() results.
plot(x, col=grps)
Read data from website and try a few visualizations.
url <- "https://tinyurl.com/UK-foods"
x <- read.csv(url, row.names=1)
x
## England Wales Scotland N.Ireland
## Cheese 105 103 103 66
## Carcass_meat 245 227 242 267
## Other_meat 685 803 750 586
## Fish 147 160 122 93
## Fats_and_oils 193 235 184 209
## Sugars 156 175 147 139
## Fresh_potatoes 720 874 566 1033
## Fresh_Veg 253 265 171 143
## Other_Veg 488 570 418 355
## Processed_potatoes 198 203 220 187
## Processed_Veg 360 365 337 334
## Fresh_fruit 1102 1137 957 674
## Cereals 1472 1582 1462 1494
## Beverages 57 73 53 47
## Soft_drinks 1374 1256 1572 1506
## Alcoholic_drinks 375 475 458 135
## Confectionery 54 64 62 41
cols <- rainbow(nrow(x))
barplot( as.matrix(x), col=cols )
barplot( as.matrix(x), col=cols, beside = TRUE )
pairs(x, col=cols)
PCA to the rescue!! The main base R PCA function is called
prcomp() and we will ened to give it the transpose of our
input data!
t(x)
## Cheese Carcass_meat Other_meat Fish Fats_and_oils Sugars
## England 105 245 685 147 193 156
## Wales 103 227 803 160 235 175
## Scotland 103 242 750 122 184 147
## N.Ireland 66 267 586 93 209 139
## Fresh_potatoes Fresh_Veg Other_Veg Processed_potatoes
## England 720 253 488 198
## Wales 874 265 570 203
## Scotland 566 171 418 220
## N.Ireland 1033 143 355 187
## Processed_Veg Fresh_fruit Cereals Beverages Soft_drinks
## England 360 1102 1472 57 1374
## Wales 365 1137 1582 73 1256
## Scotland 337 957 1462 53 1572
## N.Ireland 334 674 1494 47 1506
## Alcoholic_drinks Confectionery
## England 375 54
## Wales 475 64
## Scotland 458 62
## N.Ireland 135 41
pca <- prcomp( t(x) )
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 324.1502 212.7478 73.87622 2.7e-14
## Proportion of Variance 0.6744 0.2905 0.03503 0.0e+00
## Cumulative Proportion 0.6744 0.9650 1.00000 1.0e+00
attributes(pca)
## $names
## [1] "sdev" "rotation" "center" "scale" "x"
##
## $class
## [1] "prcomp"
To make our new PCA plot (a.k.a. PCA score plot) we access
pca$x
plot(pca$x[,1], pca$x[,2])
text(pca$x[,1], pca$x[,2], colnames(x))
Color up the plot
country_cols <- c("orange", "red", "blue", "green")
plot(pca$x[,1], pca$x[,2], xlab="PC1", ylab="PC2")
text(pca$x[,1], pca$x[,2], colnames(x), col=country_cols)
Read in data from website
url2 <- "https://tinyurl.com/expression-CSV"
rna.data <- read.csv(url2, row.names=1)
head(rna.data)
## wt1 wt2 wt3 wt4 wt5 ko1 ko2 ko3 ko4 ko5
## gene1 439 458 408 429 420 90 88 86 90 93
## gene2 219 200 204 210 187 427 423 434 433 426
## gene3 1006 989 1030 1017 973 252 237 238 226 210
## gene4 783 792 829 856 760 849 856 835 885 894
## gene5 181 249 204 244 225 277 305 272 270 279
## gene6 460 502 491 491 493 612 594 577 618 638
pca <- prcomp( t(rna.data) )
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2214.2633 88.9209 84.33908 77.74094 69.66341 67.78516
## Proportion of Variance 0.9917 0.0016 0.00144 0.00122 0.00098 0.00093
## Cumulative Proportion 0.9917 0.9933 0.99471 0.99593 0.99691 0.99784
## PC7 PC8 PC9 PC10
## Standard deviation 65.29428 59.90981 53.20803 2.852e-13
## Proportion of Variance 0.00086 0.00073 0.00057 0.000e+00
## Cumulative Proportion 0.99870 0.99943 1.00000 1.000e+00
Do our PCA plot of this RNA-Seq data
plot(pca$x[,1], pca$x[,2], xlab="PC1", ylab="PC2")
gene_cols <- c("lightpink", "skyblue", "yellow", "purple", "red", "blue", "orange", "lightgreen", "lavender")
plot(pca$x[,1], pca$x[,2], xlab="PC1", ylab="PC2")
text(pca$x[,1], pca$x[,2], colnames(rna.data), col=gene_cols)